endoreg-db 0.8.3.3__py3-none-any.whl → 0.8.6.5__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of endoreg-db might be problematic. Click here for more details.

Files changed (41):
  1. endoreg_db/data/ai_model_meta/default_multilabel_classification.yaml +23 -1
  2. endoreg_db/data/setup_config.yaml +38 -0
  3. endoreg_db/management/commands/create_model_meta_from_huggingface.py +1 -2
  4. endoreg_db/management/commands/load_ai_model_data.py +18 -15
  5. endoreg_db/management/commands/setup_endoreg_db.py +218 -33
  6. endoreg_db/models/media/pdf/raw_pdf.py +241 -97
  7. endoreg_db/models/media/video/pipe_1.py +30 -33
  8. endoreg_db/models/media/video/video_file.py +300 -187
  9. endoreg_db/models/medical/hardware/endoscopy_processor.py +10 -1
  10. endoreg_db/models/metadata/model_meta_logic.py +34 -45
  11. endoreg_db/models/metadata/sensitive_meta_logic.py +555 -150
  12. endoreg_db/serializers/__init__.py +26 -55
  13. endoreg_db/serializers/misc/__init__.py +1 -1
  14. endoreg_db/serializers/misc/file_overview.py +65 -35
  15. endoreg_db/serializers/misc/{vop_patient_data.py → sensitive_patient_data.py} +1 -1
  16. endoreg_db/serializers/video_examination.py +198 -0
  17. endoreg_db/services/lookup_service.py +228 -58
  18. endoreg_db/services/lookup_store.py +174 -30
  19. endoreg_db/services/pdf_import.py +585 -282
  20. endoreg_db/services/video_import.py +493 -240
  21. endoreg_db/urls/__init__.py +36 -23
  22. endoreg_db/urls/label_video_segments.py +2 -0
  23. endoreg_db/urls/media.py +103 -66
  24. endoreg_db/utils/setup_config.py +177 -0
  25. endoreg_db/views/__init__.py +5 -3
  26. endoreg_db/views/media/pdf_media.py +3 -1
  27. endoreg_db/views/media/video_media.py +1 -1
  28. endoreg_db/views/media/video_segments.py +187 -259
  29. endoreg_db/views/pdf/__init__.py +5 -8
  30. endoreg_db/views/pdf/pdf_stream.py +186 -0
  31. endoreg_db/views/pdf/reimport.py +110 -94
  32. endoreg_db/views/requirement/lookup.py +171 -287
  33. endoreg_db/views/video/__init__.py +0 -2
  34. endoreg_db/views/video/video_examination_viewset.py +202 -289
  35. {endoreg_db-0.8.3.3.dist-info → endoreg_db-0.8.6.5.dist-info}/METADATA +1 -2
  36. {endoreg_db-0.8.3.3.dist-info → endoreg_db-0.8.6.5.dist-info}/RECORD +38 -37
  37. endoreg_db/views/pdf/pdf_media.py +0 -239
  38. endoreg_db/views/pdf/pdf_stream_views.py +0 -127
  39. endoreg_db/views/video/video_media.py +0 -158
  40. {endoreg_db-0.8.3.3.dist-info → endoreg_db-0.8.6.5.dist-info}/WHEEL +0 -0
  41. {endoreg_db-0.8.3.3.dist-info → endoreg_db-0.8.6.5.dist-info}/licenses/LICENSE +0 -0
@@ -4,22 +4,25 @@ PDF import service module.
4
4
  Provides high-level functions for importing and anonymizing PDF files,
5
5
  combining RawPdfFile creation with text extraction and anonymization.
6
6
  """
7
- from datetime import date, datetime
7
+
8
8
  import errno
9
+ import hashlib
9
10
  import logging
11
+ import os
10
12
  import shutil
11
13
  import sys
12
- import os
13
- import hashlib
14
+ import time
15
+ from contextlib import contextmanager
16
+ from datetime import date, datetime
14
17
  from pathlib import Path
15
18
  from typing import TYPE_CHECKING, Union
16
- from contextlib import contextmanager
19
+
17
20
  from django.db import transaction
21
+
22
+ from endoreg_db.models import SensitiveMeta
18
23
  from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
19
24
  from endoreg_db.models.state.raw_pdf import RawPdfState
20
- from endoreg_db.models import SensitiveMeta
21
25
  from endoreg_db.utils import paths as path_utils
22
- import time
23
26
 
24
27
  logger = logging.getLogger(__name__)
25
28
 
@@ -34,24 +37,69 @@ class PdfImportService:
34
37
  """
35
38
  Service class for importing and processing PDF files with text extraction and anonymization.
36
39
  Uses a central PDF instance pattern for cleaner state management.
40
+
41
+ Supports two processing modes:
42
+ - 'blackening': Simple PDF masking with black rectangles over sensitive areas
43
+ - 'cropping': Advanced mode that crops sensitive regions to separate images
37
44
  """
38
-
39
- def __init__(self, allow_meta_overwrite: bool = False):
45
+
46
+ def __init__(
47
+ self, allow_meta_overwrite: bool = False, processing_mode: str = "blackening"
48
+ ):
40
49
  """
41
50
  Initialize the PDF import service.
42
-
51
+
43
52
  Args:
44
53
  allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
54
+ processing_mode: Processing mode - 'blackening' for simple masking, 'cropping' for advanced cropping
45
55
  """
46
56
  self.processed_files = set()
47
57
  self._report_reader_available = None
48
58
  self._report_reader_class = None
49
59
  self.allow_meta_overwrite = allow_meta_overwrite
50
-
60
+
61
+ # Validate and set processing mode
62
+ valid_modes = ["blackening", "cropping"]
63
+ if processing_mode not in valid_modes:
64
+ raise ValueError(
65
+ f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}"
66
+ )
67
+ self.processing_mode = processing_mode
68
+
51
69
  # Central PDF instance management
52
70
  self.current_pdf = None
53
71
  self.processing_context = {}
54
-
72
+
73
+ @classmethod
74
+ def with_blackening(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
75
+ """
76
+ Create a PdfImportService configured for simple PDF blackening mode.
77
+
78
+ Args:
79
+ allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
80
+
81
+ Returns:
82
+ PdfImportService instance configured for blackening mode
83
+ """
84
+ return cls(
85
+ allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening"
86
+ )
87
+
88
+ @classmethod
89
+ def with_cropping(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
90
+ """
91
+ Create a PdfImportService configured for advanced cropping mode.
92
+
93
+ Args:
94
+ allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
95
+
96
+ Returns:
97
+ PdfImportService instance configured for cropping mode
98
+ """
99
+ return cls(
100
+ allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping"
101
+ )
102
+
55
103
  @contextmanager
56
104
  def _file_lock(self, path: Path):
57
105
  """Create a file lock to prevent duplicate processing.
@@ -77,11 +125,14 @@ class PdfImportService:
77
125
  try:
78
126
  logger.warning(
79
127
  "Stale lock detected for %s (age %.0fs). Reclaiming lock...",
80
- path, age
128
+ path,
129
+ age,
81
130
  )
82
131
  lock_path.unlink()
83
132
  except Exception as e:
84
- logger.warning("Failed to remove stale lock %s: %s", lock_path, e)
133
+ logger.warning(
134
+ "Failed to remove stale lock %s: %s", lock_path, e
135
+ )
85
136
  # retry acquire
86
137
  fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
87
138
  else:
@@ -100,7 +151,7 @@ class PdfImportService:
100
151
  lock_path.unlink()
101
152
  except OSError:
102
153
  pass
103
-
154
+
104
155
  def _sha256(self, path: Path, chunk: int = 1024 * 1024) -> str:
105
156
  """Compute SHA256 hash of a file."""
106
157
  h = hashlib.sha256()
@@ -134,7 +185,7 @@ class PdfImportService:
134
185
  return Path(str(candidate))
135
186
  except Exception:
136
187
  return None
137
-
188
+
138
189
  def _quarantine(self, source: Path) -> Path:
139
190
  """Move file to quarantine directory to prevent re-processing."""
140
191
  qdir = path_utils.PDF_DIR / "_processing"
@@ -150,7 +201,7 @@ class PdfImportService:
150
201
  else:
151
202
  raise
152
203
  return target
153
-
204
+
154
205
  def _ensure_state(self, pdf_file: "RawPdfFile"):
155
206
  """Ensure PDF file has a state object."""
156
207
  if getattr(pdf_file, "state", None):
@@ -166,134 +217,156 @@ class PdfImportService:
166
217
  return state
167
218
  except Exception:
168
219
  return None
169
-
220
+
170
221
  def _ensure_report_reading_available(self):
171
222
  """
172
223
  Ensure report reading modules are available by adding lx-anonymizer to path.
173
-
224
+
174
225
  Returns:
175
226
  Tuple of (availability_flag, ReportReader_class)
176
227
  """
177
228
  if self._report_reader_available is not None:
178
229
  return self._report_reader_available, self._report_reader_class
179
-
230
+
180
231
  try:
181
232
  # Try direct import first
182
233
  from lx_anonymizer import ReportReader
183
-
234
+
184
235
  logger.info("Successfully imported lx_anonymizer ReportReader module")
185
236
  self._report_reader_available = True
186
237
  self._report_reader_class = ReportReader
187
238
  return True, ReportReader
188
-
239
+
189
240
  except ImportError:
190
241
  # Optional: honor LX_ANONYMIZER_PATH=/abs/path/to/src
191
242
  import importlib
243
+
192
244
  extra = os.getenv("LX_ANONYMIZER_PATH")
193
245
  if extra and extra not in sys.path and Path(extra).exists():
194
246
  sys.path.insert(0, extra)
195
247
  try:
196
248
  mod = importlib.import_module("lx_anonymizer")
197
249
  ReportReader = getattr(mod, "ReportReader")
198
- logger.info("Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH")
250
+ logger.info(
251
+ "Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH"
252
+ )
199
253
  self._report_reader_available = True
200
254
  self._report_reader_class = ReportReader
201
255
  return True, ReportReader
202
256
  except Exception as e:
203
- logger.warning("Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e)
257
+ logger.warning(
258
+ "Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e
259
+ )
204
260
  finally:
205
261
  # Keep path for future imports if it worked; otherwise remove.
206
262
  if "ReportReader" not in locals() and extra in sys.path:
207
263
  sys.path.remove(extra)
208
-
264
+
209
265
  self._report_reader_available = False
210
266
  self._report_reader_class = None
211
267
  return False, None
212
268
 
213
-
214
269
  def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile" = None) -> None:
215
270
  """
216
271
  Ensure PDF has minimum required patient data in SensitiveMeta.
217
272
  Creates default values if data is missing after text processing.
218
273
  Uses the central PDF instance if no specific instance provided.
219
-
274
+
220
275
  Args:
221
276
  pdf_instance: Optional specific PDF instance, defaults to self.current_pdf
222
277
  """
223
278
  pdf_file = pdf_instance or self.current_pdf
224
279
  if not pdf_file:
225
- logger.warning("No PDF instance available for ensuring default patient data")
280
+ logger.warning(
281
+ "No PDF instance available for ensuring default patient data"
282
+ )
226
283
  return
227
-
284
+
228
285
  if not pdf_file.sensitive_meta:
229
- logger.info(f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default")
230
-
286
+ logger.info(
287
+ f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default"
288
+ )
289
+
231
290
  # Create default SensitiveMeta with placeholder data
232
291
  default_data = {
233
292
  "patient_first_name": "Patient",
234
- "patient_last_name": "Unknown",
293
+ "patient_last_name": "Unknown",
235
294
  "patient_dob": date(1990, 1, 1), # Default DOB
236
295
  "examination_date": date.today(),
237
- "center_name": pdf_file.center.name if pdf_file.center else "university_hospital_wuerzburg"
296
+ "center_name": pdf_file.center.name
297
+ if pdf_file.center
298
+ else "university_hospital_wuerzburg",
238
299
  }
239
-
300
+
240
301
  try:
241
302
  sensitive_meta = SensitiveMeta.create_from_dict(default_data)
242
303
  pdf_file.sensitive_meta = sensitive_meta
243
- pdf_file.save(update_fields=['sensitive_meta'])
244
- logger.info(f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}")
304
+ pdf_file.save(update_fields=["sensitive_meta"])
305
+ logger.info(
306
+ f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}"
307
+ )
245
308
  except Exception as e:
246
- logger.error(f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}")
309
+ logger.error(
310
+ f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}"
311
+ )
247
312
 
248
313
  def import_and_anonymize(
249
- self,
250
- file_path: Union[Path, str],
251
- center_name: str,
314
+ self,
315
+ file_path: Union[Path, str],
316
+ center_name: str,
252
317
  delete_source: bool = False,
253
318
  retry: bool = False,
254
319
  ) -> "RawPdfFile":
255
320
  """
256
321
  Import a PDF file and anonymize it using ReportReader.
257
322
  Uses centralized PDF instance management pattern.
258
-
323
+
324
+ The processing mode is determined by the service initialization:
325
+ - 'blackening': Creates an anonymized PDF with black rectangles over sensitive regions
326
+ - 'cropping': Advanced mode that crops sensitive regions to separate images
327
+
259
328
  Args:
260
329
  file_path: Path to the PDF file to import
261
330
  center_name: Name of the center to associate with PDF
262
331
  delete_source: Whether to delete the source file after import
263
332
  retry: Whether this is a retry attempt
264
-
333
+
265
334
  Returns:
266
335
  RawPdfFile instance after import and processing
267
-
336
+
268
337
  Raises:
269
338
  Exception: On any failure during import or processing
270
339
  """
271
340
  try:
272
341
  # Initialize processing context
273
- self._initialize_processing_context(file_path, center_name, delete_source, retry)
274
-
342
+ self._initialize_processing_context(
343
+ file_path, center_name, delete_source, retry
344
+ )
345
+
275
346
  # Step 1: Validate and prepare file
276
347
  self._validate_and_prepare_file()
277
-
348
+
278
349
  # Step 2: Create or retrieve PDF instance
279
350
  self._create_or_retrieve_pdf_instance()
280
-
351
+
281
352
  # Early return check - if no PDF instance was created, return None
282
353
  if not self.current_pdf:
283
- logger.warning(f"No PDF instance created for {file_path}, returning None")
354
+ logger.warning(
355
+ f"No PDF instance created for {file_path}, returning None"
356
+ )
284
357
  return None
285
-
358
+
286
359
  # Step 3: Setup processing environment
287
360
  self._setup_processing_environment()
288
-
361
+
289
362
  # Step 4: Process text and metadata
290
363
  self._process_text_and_metadata()
291
-
364
+
292
365
  # Step 5: Finalize processing
293
366
  self._finalize_processing()
294
-
367
+
295
368
  return self.current_pdf
296
-
369
+
297
370
  except ValueError as e:
298
371
  # Handle "File already being processed" case specifically
299
372
  if "already being processed" in str(e):
@@ -312,50 +385,57 @@ class PdfImportService:
312
385
  # Always cleanup context
313
386
  self._cleanup_processing_context()
314
387
 
315
- def _initialize_processing_context(self, file_path: Union[Path, str], center_name: str,
316
- delete_source: bool, retry: bool):
388
+ def _initialize_processing_context(
389
+ self,
390
+ file_path: Union[Path, str],
391
+ center_name: str,
392
+ delete_source: bool,
393
+ retry: bool,
394
+ ):
317
395
  """Initialize the processing context for the current PDF."""
318
396
  self.processing_context = {
319
- 'file_path': Path(file_path),
320
- 'original_file_path': Path(file_path),
321
- 'center_name': center_name,
322
- 'delete_source': delete_source,
323
- 'retry': retry,
324
- 'file_hash': None,
325
- 'processing_started': False,
326
- 'text_extracted': False,
327
- 'metadata_processed': False,
328
- 'anonymization_completed': False
397
+ "file_path": Path(file_path),
398
+ "original_file_path": Path(file_path),
399
+ "center_name": center_name,
400
+ "delete_source": delete_source,
401
+ "retry": retry,
402
+ "file_hash": None,
403
+ "processing_started": False,
404
+ "text_extracted": False,
405
+ "metadata_processed": False,
406
+ "anonymization_completed": False,
329
407
  }
330
-
408
+
331
409
  # Check if already processed (only during current session to prevent race conditions)
332
410
  if str(file_path) in self.processed_files:
333
- logger.info(f"File {file_path} already being processed in current session, skipping")
411
+ logger.info(
412
+ f"File {file_path} already being processed in current session, skipping"
413
+ )
334
414
  raise ValueError("File already being processed")
335
-
415
+
336
416
  logger.info(f"Starting import and processing for: {file_path}")
337
417
 
338
418
  def _validate_and_prepare_file(self):
339
419
  """Validate file existence and calculate hash."""
340
- file_path = self.processing_context['file_path']
341
-
420
+ file_path = self.processing_context["file_path"]
421
+
342
422
  if not file_path.exists():
343
423
  raise FileNotFoundError(f"PDF file not found: {file_path}")
344
-
424
+
345
425
  try:
346
- self.processing_context['file_hash'] = self._sha256(file_path)
426
+ self.processing_context["file_hash"] = self._sha256(file_path)
347
427
  except Exception as e:
348
428
  logger.warning(f"Could not calculate file hash: {e}")
349
- self.processing_context['file_hash'] = None
429
+ self.processing_context["file_hash"] = None
350
430
 
351
431
  def _create_or_retrieve_pdf_instance(self):
352
432
  """Create new or retrieve existing PDF instance."""
353
- file_path = self.processing_context['file_path']
354
- center_name = self.processing_context['center_name']
355
- delete_source = self.processing_context['delete_source']
356
- retry = self.processing_context['retry']
357
- file_hash = self.processing_context['file_hash']
358
-
433
+ file_path = self.processing_context["file_path"]
434
+ center_name = self.processing_context["center_name"]
435
+ delete_source = self.processing_context["delete_source"]
436
+ retry = self.processing_context["retry"]
437
+ file_hash = self.processing_context["file_hash"]
438
+
359
439
  if not retry:
360
440
  # Check for existing PDF and handle duplicates
361
441
  with self._file_lock(file_path):
@@ -366,18 +446,20 @@ class PdfImportService:
366
446
  if existing:
367
447
  logger.info(f"Found existing RawPdfFile {existing.pdf_hash}")
368
448
  if existing.text:
369
- logger.info(f"Existing PDF {existing.pdf_hash} already processed - returning")
449
+ logger.info(
450
+ f"Existing PDF {existing.pdf_hash} already processed - returning"
451
+ )
370
452
  self.current_pdf = existing
371
453
  return
372
454
  else:
373
455
  # Retry processing
374
456
  logger.info(f"Reprocessing existing PDF {existing.pdf_hash}")
375
457
  return self._retry_existing_pdf(existing)
376
-
458
+
377
459
  # Create new PDF instance
378
460
  logger.info("Creating new RawPdfFile instance...")
379
461
  from django.db import IntegrityError
380
-
462
+
381
463
  try:
382
464
  if not retry:
383
465
  self.current_pdf = RawPdfFile.create_from_file_initialized(
@@ -388,18 +470,22 @@ class PdfImportService:
388
470
  else:
389
471
  # Retrieve existing for retry
390
472
  self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
391
- logger.info(f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}")
392
-
473
+ logger.info(
474
+ f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}"
475
+ )
476
+
393
477
  # Check if retry is actually needed
394
478
  if self.current_pdf.text:
395
- logger.info(f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning")
479
+ logger.info(
480
+ f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning"
481
+ )
396
482
  return
397
-
483
+
398
484
  if not self.current_pdf:
399
485
  raise RuntimeError("Failed to create RawPdfFile instance")
400
-
486
+
401
487
  logger.info(f"PDF instance ready: {self.current_pdf.pdf_hash}")
402
-
488
+
403
489
  except IntegrityError:
404
490
  # Race condition - another worker created it
405
491
  if file_hash:
@@ -410,27 +496,29 @@ class PdfImportService:
410
496
 
411
497
  def _setup_processing_environment(self):
412
498
  """Setup processing environment and state."""
413
- original_path = self.processing_context.get('file_path')
499
+ original_path = self.processing_context.get("file_path")
414
500
 
415
501
  # Create sensitive file copy
416
502
  self.create_sensitive_file(self.current_pdf, original_path)
417
-
503
+
418
504
  # Update file path to point to sensitive copy
419
- self.processing_context['file_path'] = self.current_pdf.file.path
420
- self.processing_context['sensitive_copy_created'] = True
505
+ self.processing_context["file_path"] = self.current_pdf.file.path
506
+ self.processing_context["sensitive_copy_created"] = True
421
507
  try:
422
- self.processing_context['sensitive_file_path'] = Path(self.current_pdf.file.path)
508
+ self.processing_context["sensitive_file_path"] = Path(
509
+ self.current_pdf.file.path
510
+ )
423
511
  except Exception:
424
- self.processing_context['sensitive_file_path'] = None
425
-
512
+ self.processing_context["sensitive_file_path"] = None
513
+
426
514
  # Ensure state exists
427
515
  state = self.current_pdf.get_or_create_state()
428
516
  state.mark_processing_started()
429
- self.processing_context['processing_started'] = True
430
-
517
+ self.processing_context["processing_started"] = True
518
+
431
519
  # Mark as processed to prevent duplicates
432
- self.processed_files.add(str(self.processing_context['file_path']))
433
-
520
+ self.processed_files.add(str(self.processing_context["file_path"]))
521
+
434
522
  # Ensure default patient data
435
523
  logger.info("Ensuring default patient data...")
436
524
  self._ensure_default_patient_data(self.current_pdf)
@@ -438,83 +526,154 @@ class PdfImportService:
438
526
  def _process_text_and_metadata(self):
439
527
  """Process text extraction and metadata using ReportReader."""
440
528
  report_reading_available, ReportReader = self._ensure_report_reading_available()
441
-
529
+
442
530
  if not report_reading_available:
443
531
  logger.warning("Report reading not available (lx_anonymizer not found)")
444
532
  self._mark_processing_incomplete("no_report_reader")
445
533
  return
446
-
534
+
447
535
  if not self.current_pdf.file:
448
536
  logger.warning("No file available for text processing")
449
537
  self._mark_processing_incomplete("no_file")
450
538
  return
451
-
539
+
452
540
  try:
453
- logger.info("Starting text extraction and metadata processing with ReportReader...")
454
-
455
- # Setup output directories
456
- crops_dir = path_utils.PDF_DIR / 'cropped_regions'
457
- anonymized_dir = path_utils.PDF_DIR / 'anonymized'
458
- crops_dir.mkdir(parents=True, exist_ok=True)
459
- anonymized_dir.mkdir(parents=True, exist_ok=True)
541
+ logger.info(
542
+ f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})..."
543
+ )
460
544
 
461
545
  # Initialize ReportReader
462
546
  report_reader = ReportReader(
463
547
  report_root_path=str(path_utils.STORAGE_DIR),
464
548
  locale="de_DE",
465
- text_date_format="%d.%m.%Y"
549
+ text_date_format="%d.%m.%Y",
466
550
  )
467
551
 
468
- # Process with cropping
469
- original_text, anonymized_text, extracted_metadata, cropped_regions, anonymized_pdf_path = report_reader.process_report_with_cropping(
470
- pdf_path=self.processing_context['file_path'],
471
- crop_sensitive_regions=True,
472
- crop_output_dir=str(crops_dir),
473
- anonymization_output_dir=str(anonymized_dir)
474
- )
475
-
476
- # Store results in context
477
- self.processing_context.update({
478
- 'original_text': original_text,
479
- 'anonymized_text': anonymized_text,
480
- 'extracted_metadata': extracted_metadata,
481
- 'cropped_regions': cropped_regions,
482
- 'anonymized_pdf_path': anonymized_pdf_path
483
- })
484
-
485
- if original_text:
486
- self._apply_text_results()
487
- self.processing_context['text_extracted'] = True
488
-
489
- if extracted_metadata:
490
- self._apply_metadata_results()
491
- self.processing_context['metadata_processed'] = True
492
-
493
- if anonymized_pdf_path:
494
- self._apply_anonymized_pdf()
495
- self.processing_context['anonymization_completed'] = True
496
-
552
+ if self.processing_mode == "cropping":
553
+ # Use advanced cropping method (existing implementation)
554
+ self._process_with_cropping(report_reader)
555
+ else: # blackening mode
556
+ # Use enhanced process_report with PDF masking
557
+ self._process_with_blackening(report_reader)
558
+
497
559
  except Exception as e:
498
560
  logger.warning(f"Text processing failed: {e}")
499
561
  self._mark_processing_incomplete("text_processing_failed")
500
562
 
563
+ def _process_with_blackening(self, report_reader):
564
+ """Process PDF using simple blackening/masking mode."""
565
+ logger.info("Using simple PDF blackening mode...")
566
+
567
+ # Setup anonymized directory
568
+ anonymized_dir = path_utils.PDF_DIR / "anonymized"
569
+ anonymized_dir.mkdir(parents=True, exist_ok=True)
570
+
571
+ # Generate output path for anonymized PDF
572
+ pdf_hash = self.current_pdf.pdf_hash
573
+ anonymized_output_path = anonymized_dir / f"{pdf_hash}_anonymized.pdf"
574
+
575
+ # Process with enhanced process_report method (returns 4-tuple now)
576
+ original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = (
577
+ report_reader.process_report(
578
+ pdf_path=self.processing_context["file_path"],
579
+ create_anonymized_pdf=True,
580
+ anonymized_pdf_output_path=str(anonymized_output_path),
581
+ )
582
+ )
583
+
584
+ # Store results in context
585
+ self.processing_context.update(
586
+ {
587
+ "original_text": original_text,
588
+ "anonymized_text": anonymized_text,
589
+ "extracted_metadata": extracted_metadata,
590
+ "cropped_regions": None, # Not available in blackening mode
591
+ "anonymized_pdf_path": anonymized_pdf_path,
592
+ }
593
+ )
594
+
595
+ # Apply results
596
+ if original_text:
597
+ self._apply_text_results()
598
+ self.processing_context["text_extracted"] = True
599
+
600
+ if extracted_metadata:
601
+ self._apply_metadata_results()
602
+ self.processing_context["metadata_processed"] = True
603
+
604
+ if anonymized_pdf_path:
605
+ self._apply_anonymized_pdf()
606
+ self.processing_context["anonymization_completed"] = True
607
+
608
+ logger.info("PDF blackening processing completed")
609
+
610
+ def _process_with_cropping(self, report_reader):
611
+ """Process PDF using advanced cropping mode (existing implementation)."""
612
+ logger.info("Using advanced cropping mode...")
613
+
614
+ # Setup output directories
615
+ crops_dir = path_utils.PDF_DIR / "cropped_regions"
616
+ anonymized_dir = path_utils.PDF_DIR / "anonymized"
617
+ crops_dir.mkdir(parents=True, exist_ok=True)
618
+ anonymized_dir.mkdir(parents=True, exist_ok=True)
619
+
620
+ # Process with cropping (returns 5-tuple)
621
+ (
622
+ original_text,
623
+ anonymized_text,
624
+ extracted_metadata,
625
+ cropped_regions,
626
+ anonymized_pdf_path,
627
+ ) = report_reader.process_report_with_cropping(
628
+ pdf_path=self.processing_context["file_path"],
629
+ crop_sensitive_regions=True,
630
+ crop_output_dir=str(crops_dir),
631
+ anonymization_output_dir=str(anonymized_dir),
632
+ )
633
+
634
+ # Store results in context
635
+ self.processing_context.update(
636
+ {
637
+ "original_text": original_text,
638
+ "anonymized_text": anonymized_text,
639
+ "extracted_metadata": extracted_metadata,
640
+ "cropped_regions": cropped_regions,
641
+ "anonymized_pdf_path": anonymized_pdf_path,
642
+ }
643
+ )
644
+
645
+ # Apply results
646
+ if original_text:
647
+ self._apply_text_results()
648
+ self.processing_context["text_extracted"] = True
649
+
650
+ if extracted_metadata:
651
+ self._apply_metadata_results()
652
+ self.processing_context["metadata_processed"] = True
653
+
654
+ if anonymized_pdf_path:
655
+ self._apply_anonymized_pdf()
656
+ self.processing_context["anonymization_completed"] = True
657
+
658
+ logger.info("PDF cropping processing completed")
659
+
501
660
  def _apply_text_results(self):
502
661
  """Apply text extraction results to the PDF instance."""
503
662
  if not self.current_pdf:
504
663
  logger.warning("Cannot apply text results - no PDF instance available")
505
664
  return
506
-
507
- original_text = self.processing_context.get('original_text')
508
- anonymized_text = self.processing_context.get('anonymized_text')
509
-
665
+
666
+ original_text = self.processing_context.get("original_text")
667
+ anonymized_text = self.processing_context.get("anonymized_text")
668
+
510
669
  if not original_text:
511
670
  logger.warning("No original text available to apply")
512
671
  return
513
-
672
+
514
673
  # Store extracted text
515
674
  self.current_pdf.text = original_text
516
675
  logger.info(f"Extracted {len(original_text)} characters of text from PDF")
517
-
676
+
518
677
  # Handle anonymized text
519
678
  if anonymized_text and anonymized_text != original_text:
520
679
  self.current_pdf.anonymized = True
@@ -525,56 +684,56 @@ class PdfImportService:
525
684
  if not self.current_pdf:
526
685
  logger.warning("Cannot apply metadata results - no PDF instance available")
527
686
  return
528
-
529
- extracted_metadata = self.processing_context.get('extracted_metadata')
530
-
687
+
688
+ extracted_metadata = self.processing_context.get("extracted_metadata")
689
+
531
690
  if not self.current_pdf.sensitive_meta or not extracted_metadata:
532
691
  logger.debug("No sensitive meta or extracted metadata available")
533
692
  return
534
-
693
+
535
694
  sm = self.current_pdf.sensitive_meta
536
-
695
+
537
696
  # Map ReportReader metadata to SensitiveMeta fields
538
697
  metadata_mapping = {
539
- 'patient_first_name': 'patient_first_name',
540
- 'patient_last_name': 'patient_last_name',
541
- 'patient_dob': 'patient_dob',
542
- 'examination_date': 'examination_date',
543
- 'examiner_first_name': 'examiner_first_name',
544
- 'examiner_last_name': 'examiner_last_name',
545
- 'endoscope_type': 'endoscope_type',
546
- 'casenumber': 'case_number'
698
+ "patient_first_name": "patient_first_name",
699
+ "patient_last_name": "patient_last_name",
700
+ "patient_dob": "patient_dob",
701
+ "examination_date": "examination_date",
702
+ "examiner_first_name": "examiner_first_name",
703
+ "examiner_last_name": "examiner_last_name",
704
+ "endoscope_type": "endoscope_type",
705
+ "casenumber": "case_number",
547
706
  }
548
-
707
+
549
708
  # Update fields with extracted information
550
709
  updated_fields = []
551
710
  for meta_key, sm_field in metadata_mapping.items():
552
711
  if extracted_metadata.get(meta_key) and hasattr(sm, sm_field):
553
712
  old_value = getattr(sm, sm_field)
554
713
  raw_value = extracted_metadata[meta_key]
555
-
714
+
556
715
  # Skip if we just got the field name as a string (indicates no actual data)
557
716
  if isinstance(raw_value, str) and raw_value == meta_key:
558
717
  continue
559
-
718
+
560
719
  # Handle date fields specially
561
- if sm_field in ['patient_dob', 'examination_date']:
720
+ if sm_field in ["patient_dob", "examination_date"]:
562
721
  new_value = self._parse_date_field(raw_value, meta_key, sm_field)
563
722
  if new_value is None:
564
723
  continue
565
724
  else:
566
725
  new_value = raw_value
567
-
726
+
568
727
  # Configurable overwrite policy
569
728
  should_overwrite = (
570
729
  self.allow_meta_overwrite
571
730
  or not old_value
572
- or old_value in ['Patient', 'Unknown']
731
+ or old_value in ["Patient", "Unknown"]
573
732
  )
574
733
  if new_value and should_overwrite:
575
734
  setattr(sm, sm_field, new_value)
576
735
  updated_fields.append(sm_field)
577
-
736
+
578
737
  if updated_fields:
579
738
  sm.save()
580
739
  logger.info(f"Updated SensitiveMeta fields: {updated_fields}")
@@ -587,26 +746,29 @@ class PdfImportService:
587
746
  if raw_value == meta_key:
588
747
  logger.warning(
589
748
  "Skipping date field %s - got field name '%s' instead of actual date",
590
- sm_field, raw_value
749
+ sm_field,
750
+ raw_value,
591
751
  )
592
752
  return None
593
-
753
+
594
754
  # Try common date formats
595
- date_formats = ['%Y-%m-%d', '%d.%m.%Y', '%d/%m/%Y', '%m/%d/%Y']
755
+ date_formats = ["%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y"]
596
756
  for fmt in date_formats:
597
757
  try:
598
758
  return datetime.strptime(raw_value, fmt).date()
599
759
  except ValueError:
600
760
  continue
601
-
602
- logger.warning("Could not parse date '%s' for field %s", raw_value, sm_field)
761
+
762
+ logger.warning(
763
+ "Could not parse date '%s' for field %s", raw_value, sm_field
764
+ )
603
765
  return None
604
-
605
- elif hasattr(raw_value, 'date'):
766
+
767
+ elif hasattr(raw_value, "date"):
606
768
  return raw_value.date()
607
769
  else:
608
770
  return raw_value
609
-
771
+
610
772
  except (ValueError, AttributeError) as e:
611
773
  logger.warning("Date parsing failed for %s: %s", sm_field, e)
612
774
  return None
@@ -626,14 +788,17 @@ class PdfImportService:
626
788
  logger.warning("Cannot apply anonymized PDF - no PDF instance available")
627
789
  return
628
790
 
629
- anonymized_pdf_path = self.processing_context.get('anonymized_pdf_path')
791
+ anonymized_pdf_path = self.processing_context.get("anonymized_pdf_path")
630
792
  if not anonymized_pdf_path:
631
793
  logger.debug("No anonymized_pdf_path present in processing context")
632
794
  return
633
795
 
634
796
  anonymized_path = Path(anonymized_pdf_path)
635
797
  if not anonymized_path.exists():
636
- logger.warning("Anonymized PDF path returned but file does not exist: %s", anonymized_path)
798
+ logger.warning(
799
+ "Anonymized PDF path returned but file does not exist: %s",
800
+ anonymized_path,
801
+ )
637
802
  return
638
803
 
639
804
  logger.info("Anonymized PDF created by ReportReader at: %s", anonymized_path)
@@ -647,7 +812,7 @@ class PdfImportService:
647
812
  relative_name = str(anonymized_path)
648
813
 
649
814
  # Only update if something actually changed
650
- if getattr(self.current_pdf.anonymized_file, 'name', None) != relative_name:
815
+ if getattr(self.current_pdf.anonymized_file, "name", None) != relative_name:
651
816
  self.current_pdf.anonymized_file.name = relative_name
652
817
 
653
818
  # Ensure model/state reflect anonymization even if text didn't differ
@@ -656,14 +821,16 @@ class PdfImportService:
656
821
 
657
822
  # Persist cropped regions info somewhere useful (optional & non-breaking)
658
823
  # If your model has a field for this, persist there; otherwise we just log.
659
- cropped_regions = self.processing_context.get('cropped_regions')
824
+ cropped_regions = self.processing_context.get("cropped_regions")
660
825
  if cropped_regions:
661
- logger.debug("Cropped regions recorded (%d regions).", len(cropped_regions))
826
+ logger.debug(
827
+ "Cropped regions recorded (%d regions).", len(cropped_regions)
828
+ )
662
829
 
663
830
  # Save model changes
664
- update_fields = ['anonymized_file']
665
- if 'anonymized' in self.current_pdf.__dict__:
666
- update_fields.append('anonymized')
831
+ update_fields = ["anonymized_file"]
832
+ if "anonymized" in self.current_pdf.__dict__:
833
+ update_fields.append("anonymized")
667
834
  self.current_pdf.save(update_fields=update_fields)
668
835
 
669
836
  # Mark state as anonymized immediately; this keeps downstream flows working
@@ -671,31 +838,41 @@ class PdfImportService:
671
838
  if state and not state.anonymized:
672
839
  state.mark_anonymized(save=True)
673
840
 
674
- logger.info("Updated anonymized_file reference to: %s", self.current_pdf.anonymized_file.name)
841
+ logger.info(
842
+ "Updated anonymized_file reference to: %s",
843
+ self.current_pdf.anonymized_file.name,
844
+ )
675
845
 
676
846
  except Exception as e:
677
847
  logger.warning("Could not set anonymized file reference: %s", e)
678
848
 
679
-
680
849
  def _finalize_processing(self):
681
850
  """Finalize processing and update state."""
682
851
  if not self.current_pdf:
683
852
  logger.warning("Cannot finalize processing - no PDF instance available")
684
853
  return
685
-
854
+
686
855
  try:
687
856
  # Update state based on processing results
688
857
  state = self._ensure_state(self.current_pdf)
689
-
690
- if self.processing_context.get('text_extracted') and state:
858
+
859
+ if self.processing_context.get("text_extracted") and state:
691
860
  state.mark_anonymized()
692
-
861
+
862
+ # Mark as ready for validation after successful anonymization
863
+ if self.processing_context.get("anonymization_completed") and state:
864
+ state.mark_sensitive_meta_processed()
865
+ logger.info(
866
+ f"PDF {self.current_pdf.pdf_hash} processing completed - "
867
+ f"ready for validation (status: {state.anonymization_status})"
868
+ )
869
+
693
870
  # Save all changes
694
871
  with transaction.atomic():
695
872
  self.current_pdf.save()
696
873
  if state:
697
874
  state.save()
698
-
875
+
699
876
  logger.info("PDF processing completed successfully")
700
877
  except Exception as e:
701
878
  logger.warning(f"Failed to finalize processing: {e}")
@@ -703,9 +880,11 @@ class PdfImportService:
703
880
  def _mark_processing_incomplete(self, reason: str):
704
881
  """Mark processing as incomplete with reason."""
705
882
  if not self.current_pdf:
706
- logger.warning(f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}")
883
+ logger.warning(
884
+ f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}"
885
+ )
707
886
  return
708
-
887
+
709
888
  try:
710
889
  state = self._ensure_state(self.current_pdf)
711
890
  if state:
@@ -714,7 +893,7 @@ class PdfImportService:
714
893
  state.sensitive_meta_processed = False
715
894
  state.save()
716
895
  logger.info(f"Set PDF state: processed=False due to {reason}")
717
-
896
+
718
897
  # Save changes
719
898
  with transaction.atomic():
720
899
  self.current_pdf.save()
@@ -722,31 +901,53 @@ class PdfImportService:
722
901
  logger.warning(f"Failed to mark processing incomplete: {e}")
723
902
 
724
903
  def _retry_existing_pdf(self, existing_pdf):
725
- """Retry processing for existing PDF."""
904
+ """
905
+ Retry processing for existing PDF.
906
+
907
+ Uses get_raw_file_path() to find the original raw file instead of
908
+ relying on the file field which may point to a deleted sensitive file.
909
+ """
726
910
  try:
911
+ # ✅ FIX: Use get_raw_file_path() to find original file
912
+ raw_file_path = existing_pdf.get_raw_file_path()
913
+
914
+ if not raw_file_path or not raw_file_path.exists():
915
+ logger.error(
916
+ f"Cannot retry PDF {existing_pdf.pdf_hash}: Raw file not found. "
917
+ f"Please re-upload the original PDF file."
918
+ )
919
+ self.current_pdf = existing_pdf
920
+ return existing_pdf
921
+
922
+ logger.info(f"Found raw file for retry at: {raw_file_path}")
923
+
727
924
  # Remove from processed files to allow retry
728
- file_path_str = str(existing_pdf.file.path) if existing_pdf.file else None
729
- if file_path_str and file_path_str in self.processed_files:
925
+ file_path_str = str(raw_file_path)
926
+ if file_path_str in self.processed_files:
730
927
  self.processed_files.remove(file_path_str)
731
928
  logger.debug(f"Removed {file_path_str} from processed files for retry")
732
-
929
+
733
930
  return self.import_and_anonymize(
734
- file_path=existing_pdf.file.path,
735
- center_name=existing_pdf.center.name if existing_pdf.center else "unknown_center",
736
- delete_source=False,
737
- retry=True
931
+ file_path=raw_file_path, # ✅ Use raw file path, not sensitive path
932
+ center_name=existing_pdf.center.name
933
+ if existing_pdf.center
934
+ else "unknown_center",
935
+ delete_source=False, # Never delete during retry
936
+ retry=True,
738
937
  )
739
938
  except Exception as e:
740
- logger.error(f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}")
939
+ logger.error(
940
+ f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}"
941
+ )
741
942
  self.current_pdf = existing_pdf
742
943
  return existing_pdf
743
944
 
744
945
  def _cleanup_on_error(self):
745
946
  """Cleanup processing context on error."""
746
947
  try:
747
- if self.current_pdf and hasattr(self.current_pdf, 'state'):
948
+ if self.current_pdf and hasattr(self.current_pdf, "state"):
748
949
  state = self._ensure_state(self.current_pdf)
749
- if state and self.processing_context.get('processing_started'):
950
+ if state and self.processing_context.get("processing_started"):
750
951
  state.text_meta_extracted = False
751
952
  state.pdf_meta_extracted = False
752
953
  state.sensitive_meta_processed = False
@@ -756,7 +957,7 @@ class PdfImportService:
756
957
  logger.warning(f"Error during cleanup: {e}")
757
958
  finally:
758
959
  # Remove any sensitive copy created during this processing run
759
- sensitive_created = self.processing_context.get('sensitive_copy_created')
960
+ sensitive_created = self.processing_context.get("sensitive_copy_created")
760
961
  if sensitive_created:
761
962
  pdf_obj = self.current_pdf
762
963
  try:
@@ -765,30 +966,51 @@ class PdfImportService:
765
966
  if file_field and getattr(file_field, "name", None):
766
967
  storage_name = file_field.name
767
968
  file_field.delete(save=False)
768
- logger.debug("Deleted sensitive copy %s during error cleanup", storage_name)
969
+ logger.debug(
970
+ "Deleted sensitive copy %s during error cleanup",
971
+ storage_name,
972
+ )
769
973
  except Exception as cleanup_exc:
770
- logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
974
+ logger.warning(
975
+ "Failed to remove sensitive copy during error cleanup: %s",
976
+ cleanup_exc,
977
+ )
771
978
 
772
979
  # Always clean up processed files set to prevent blocks
773
- file_path = self.processing_context.get('file_path')
980
+ file_path = self.processing_context.get("file_path")
774
981
  if file_path and str(file_path) in self.processed_files:
775
982
  self.processed_files.remove(str(file_path))
776
- logger.debug(f"Removed {file_path} from processed files during error cleanup")
983
+ logger.debug(
984
+ f"Removed {file_path} from processed files during error cleanup"
985
+ )
777
986
 
778
987
  try:
779
- original_path = self.processing_context.get('original_file_path')
780
- logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
781
- raw_dir = original_path.parent if isinstance(original_path, Path) else None
988
+ original_path = self.processing_context.get("original_file_path")
989
+ logger.debug(
990
+ "PDF cleanup original path: %s (%s)",
991
+ original_path,
992
+ type(original_path),
993
+ )
994
+ raw_dir = (
995
+ original_path.parent if isinstance(original_path, Path) else None
996
+ )
782
997
  if (
783
998
  isinstance(original_path, Path)
784
999
  and original_path.exists()
785
- and not self.processing_context.get('sensitive_copy_created')
1000
+ and not self.processing_context.get("sensitive_copy_created")
786
1001
  ):
787
1002
  try:
788
1003
  original_path.unlink()
789
- logger.info("Removed original file %s during error cleanup", original_path)
1004
+ logger.info(
1005
+ "Removed original file %s during error cleanup",
1006
+ original_path,
1007
+ )
790
1008
  except Exception as remove_exc:
791
- logger.warning("Could not remove original file %s during error cleanup: %s", original_path, remove_exc)
1009
+ logger.warning(
1010
+ "Could not remove original file %s during error cleanup: %s",
1011
+ original_path,
1012
+ remove_exc,
1013
+ )
792
1014
  pdf_dir = self._get_pdf_dir()
793
1015
  if not pdf_dir and raw_dir:
794
1016
  base_dir = raw_dir.parent
@@ -805,7 +1027,12 @@ class PdfImportService:
805
1027
 
806
1028
  # Remove empty PDF subdirectories that might have been created during setup
807
1029
  if pdf_dir and pdf_dir.exists():
808
- for subdir_name in ("sensitive", "cropped_regions", "anonymized", "_processing"):
1030
+ for subdir_name in (
1031
+ "sensitive",
1032
+ "cropped_regions",
1033
+ "anonymized",
1034
+ "_processing",
1035
+ ):
809
1036
  subdir_path = pdf_dir / subdir_name
810
1037
  if subdir_path.exists() and subdir_path.is_dir():
811
1038
  try:
@@ -813,22 +1040,49 @@ class PdfImportService:
813
1040
  except StopIteration:
814
1041
  try:
815
1042
  subdir_path.rmdir()
816
- logger.debug("Removed empty directory %s during error cleanup", subdir_path)
1043
+ logger.debug(
1044
+ "Removed empty directory %s during error cleanup",
1045
+ subdir_path,
1046
+ )
817
1047
  except OSError as rm_err:
818
- logger.debug("Could not remove directory %s: %s", subdir_path, rm_err)
1048
+ logger.debug(
1049
+ "Could not remove directory %s: %s",
1050
+ subdir_path,
1051
+ rm_err,
1052
+ )
819
1053
  except Exception as iter_err:
820
- logger.debug("Could not inspect directory %s: %s", subdir_path, iter_err)
821
-
822
- raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
823
- pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
1054
+ logger.debug(
1055
+ "Could not inspect directory %s: %s",
1056
+ subdir_path,
1057
+ iter_err,
1058
+ )
1059
+
1060
+ raw_count = (
1061
+ len(list(raw_dir.glob("*")))
1062
+ if raw_dir and raw_dir.exists()
1063
+ else None
1064
+ )
1065
+ pdf_count = (
1066
+ len(list(pdf_dir.glob("*")))
1067
+ if pdf_dir and pdf_dir.exists()
1068
+ else None
1069
+ )
824
1070
 
825
- sensitive_path = self.processing_context.get('sensitive_file_path')
1071
+ sensitive_path = self.processing_context.get("sensitive_file_path")
826
1072
  if sensitive_path:
827
1073
  sensitive_parent = Path(sensitive_path).parent
828
- sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
1074
+ sensitive_count = (
1075
+ len(list(sensitive_parent.glob("*")))
1076
+ if sensitive_parent.exists()
1077
+ else None
1078
+ )
829
1079
  else:
830
1080
  sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
831
- sensitive_count = len(list(sensitive_dir.glob("*"))) if sensitive_dir and sensitive_dir.exists() else None
1081
+ sensitive_count = (
1082
+ len(list(sensitive_dir.glob("*")))
1083
+ if sensitive_dir and sensitive_dir.exists()
1084
+ else None
1085
+ )
832
1086
 
833
1087
  logger.info(
834
1088
  "PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
@@ -843,17 +1097,17 @@ class PdfImportService:
843
1097
  """Cleanup processing context."""
844
1098
  try:
845
1099
  # Clean up temporary directories
846
- if self.processing_context.get('text_extracted'):
847
- crops_dir = path_utils.PDF_DIR / 'cropped_regions'
1100
+ if self.processing_context.get("text_extracted"):
1101
+ crops_dir = path_utils.PDF_DIR / "cropped_regions"
848
1102
  if crops_dir.exists() and not any(crops_dir.iterdir()):
849
1103
  crops_dir.rmdir()
850
-
1104
+
851
1105
  # Always remove from processed files set after processing attempt
852
- file_path = self.processing_context.get('file_path')
1106
+ file_path = self.processing_context.get("file_path")
853
1107
  if file_path and str(file_path) in self.processed_files:
854
1108
  self.processed_files.remove(str(file_path))
855
1109
  logger.debug(f"Removed {file_path} from processed files set")
856
-
1110
+
857
1111
  except Exception as e:
858
1112
  logger.warning(f"Error during context cleanup: {e}")
859
1113
  finally:
@@ -862,44 +1116,43 @@ class PdfImportService:
862
1116
  self.processing_context = {}
863
1117
 
864
1118
  def import_simple(
865
- self,
866
- file_path: Union[Path, str],
867
- center_name: str,
868
- delete_source: bool = False
1119
+ self, file_path: Union[Path, str], center_name: str, delete_source: bool = False
869
1120
  ) -> "RawPdfFile":
870
1121
  """
871
1122
  Simple PDF import without text processing or anonymization.
872
1123
  Uses centralized PDF instance management pattern.
873
-
1124
+
874
1125
  Args:
875
1126
  file_path: Path to the PDF file to import
876
1127
  center_name: Name of the center to associate with PDF
877
1128
  delete_source: Whether to delete the source file after import
878
-
1129
+
879
1130
  Returns:
880
1131
  RawPdfFile instance after basic import
881
1132
  """
882
1133
  try:
883
1134
  # Initialize simple processing context
884
- self._initialize_processing_context(file_path, center_name, delete_source, False)
885
-
1135
+ self._initialize_processing_context(
1136
+ file_path, center_name, delete_source, False
1137
+ )
1138
+
886
1139
  # Validate file
887
1140
  self._validate_and_prepare_file()
888
-
1141
+
889
1142
  # Create PDF instance
890
1143
  logger.info("Starting simple import - creating RawPdfFile instance...")
891
1144
  self.current_pdf = RawPdfFile.create_from_file_initialized(
892
- file_path=self.processing_context['file_path'],
1145
+ file_path=self.processing_context["file_path"],
893
1146
  center_name=center_name,
894
1147
  delete_source=delete_source,
895
1148
  )
896
-
1149
+
897
1150
  if not self.current_pdf:
898
1151
  raise RuntimeError("Failed to create RawPdfFile instance")
899
-
1152
+
900
1153
  # Mark as processed
901
- self.processed_files.add(str(self.processing_context['file_path']))
902
-
1154
+ self.processed_files.add(str(self.processing_context["file_path"]))
1155
+
903
1156
  # Set basic state for simple import
904
1157
  state = self._ensure_state(self.current_pdf)
905
1158
  if state:
@@ -908,56 +1161,68 @@ class PdfImportService:
908
1161
  state.sensitive_meta_processed = False
909
1162
  state.save()
910
1163
  logger.info("Set PDF state: processed=False for simple import")
911
-
1164
+
912
1165
  # Save changes
913
1166
  with transaction.atomic():
914
1167
  self.current_pdf.save()
915
-
916
- logger.info("Simple import completed for RawPdfFile hash: %s", self.current_pdf.pdf_hash)
1168
+
1169
+ logger.info(
1170
+ "Simple import completed for RawPdfFile hash: %s",
1171
+ self.current_pdf.pdf_hash,
1172
+ )
917
1173
  return self.current_pdf
918
-
1174
+
919
1175
  except Exception as e:
920
1176
  logger.error(f"Simple PDF import failed for {file_path}: {e}")
921
1177
  self._cleanup_on_error()
922
1178
  raise
923
1179
  finally:
924
1180
  self._cleanup_processing_context()
925
-
926
- def check_storage_capacity(self, file_path: Union[Path, str], storage_root, min_required_space) -> None:
1181
+
1182
+ def check_storage_capacity(
1183
+ self, file_path: Union[Path, str], storage_root, min_required_space
1184
+ ) -> None:
927
1185
  """
928
1186
  Check if there is sufficient storage capacity for the PDF file.
929
-
1187
+
930
1188
  Args:
931
1189
  file_path: Path to the PDF file to check
932
-
1190
+
933
1191
  Raises:
934
1192
  InsufficientStorageError: If there is not enough space
935
1193
  """
936
1194
  import shutil
1195
+
937
1196
  from endoreg_db.exceptions import InsufficientStorageError
938
-
1197
+
939
1198
  file_path = Path(file_path)
940
1199
  if not file_path.exists():
941
1200
  raise FileNotFoundError(f"File not found for storage check: {file_path}")
942
-
1201
+
943
1202
  # Get the size of the file
944
1203
  file_size = file_path.stat().st_size
945
-
1204
+
946
1205
  # Get available space in the storage directory
947
1206
 
948
1207
  total, used, free = shutil.disk_usage(storage_root)
949
-
1208
+
950
1209
  if file_size:
951
1210
  min_required_space = file_size if isinstance(min_required_space, int) else 0
952
1211
 
953
1212
  # Check if there is enough space
954
1213
  if file_size > free:
955
- raise InsufficientStorageError(f"Not enough space to store PDF file: {file_path}")
956
- logger.info(f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available")
957
-
1214
+ raise InsufficientStorageError(
1215
+ f"Not enough space to store PDF file: {file_path}"
1216
+ )
1217
+ logger.info(
1218
+ f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available"
1219
+ )
1220
+
958
1221
  return True
959
-
960
- def create_sensitive_file(self, pdf_instance: "RawPdfFile" = None, file_path: Union[Path, str] = None) -> None:
1222
+
1223
+ def create_sensitive_file(
1224
+ self, pdf_instance: "RawPdfFile" = None, file_path: Union[Path, str] = None
1225
+ ) -> None:
961
1226
  """
962
1227
  Create a copy of the PDF file in the sensitive directory and update the file reference.
963
1228
  Delete the source path to avoid duplicates.
@@ -966,7 +1231,9 @@ class PdfImportService:
966
1231
  Ensures the FileField points to the file under STORAGE_DIR/pdfs/sensitive and never back to raw_pdfs.
967
1232
  """
968
1233
  pdf_file = pdf_instance or self.current_pdf
969
- source_path = Path(file_path) if file_path else self.processing_context.get('file_path')
1234
+ source_path = (
1235
+ Path(file_path) if file_path else self.processing_context.get("file_path")
1236
+ )
970
1237
 
971
1238
  if not pdf_file:
972
1239
  raise ValueError("No PDF instance available for creating sensitive file")
@@ -989,25 +1256,37 @@ class PdfImportService:
989
1256
  try:
990
1257
  target.unlink()
991
1258
  except Exception as e:
992
- logger.warning("Could not remove existing sensitive target %s: %s", target, e)
1259
+ logger.warning(
1260
+ "Could not remove existing sensitive target %s: %s",
1261
+ target,
1262
+ e,
1263
+ )
993
1264
  shutil.move(str(source_path), str(target))
994
1265
  logger.info(f"Moved PDF to sensitive directory: {target}")
995
1266
 
996
1267
  # Update FileField to reference the file under STORAGE_DIR
997
1268
  # We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
998
1269
  try:
999
- relative_name = str(target.relative_to(path_utils.STORAGE_DIR)) # Point Django FileField to sensitive storage
1270
+ relative_name = str(
1271
+ target.relative_to(path_utils.STORAGE_DIR)
1272
+ ) # Point Django FileField to sensitive storage
1000
1273
  except ValueError:
1001
1274
  # Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
1002
1275
  relative_name = str(target)
1003
1276
 
1004
1277
  # Only update when changed
1005
- if getattr(pdf_file.file, 'name', None) != relative_name:
1278
+ if getattr(pdf_file.file, "name", None) != relative_name:
1006
1279
  pdf_file.file.name = relative_name
1007
- pdf_file.save(update_fields=['file'])
1008
- logger.info("Updated PDF FileField reference to sensitive path: %s", pdf_file.file.path)
1280
+ pdf_file.save(update_fields=["file"])
1281
+ logger.info(
1282
+ "Updated PDF FileField reference to sensitive path: %s",
1283
+ pdf_file.file.path,
1284
+ )
1009
1285
  else:
1010
- logger.debug("PDF FileField already points to sensitive path: %s", pdf_file.file.path)
1286
+ logger.debug(
1287
+ "PDF FileField already points to sensitive path: %s",
1288
+ pdf_file.file.path,
1289
+ )
1011
1290
 
1012
1291
  # Best-effort: if original source still exists (e.g., copy), remove it to avoid re-triggers
1013
1292
  try:
@@ -1018,57 +1297,81 @@ class PdfImportService:
1018
1297
  logger.warning(f"Could not delete original PDF file {source_path}: {e}")
1019
1298
 
1020
1299
  except Exception as e:
1021
- logger.warning(f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}", exc_info=True)
1300
+ logger.warning(
1301
+ f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}",
1302
+ exc_info=True,
1303
+ )
1022
1304
 
1023
- def archive_or_quarantine_file(self, pdf_instance: "RawPdfFile" = None, source_file_path: Union[Path, str] = None,
1024
- quarantine_reason: str = None, is_pdf_problematic: bool = None) -> bool:
1305
+ def archive_or_quarantine_file(
1306
+ self,
1307
+ pdf_instance: "RawPdfFile" = None,
1308
+ source_file_path: Union[Path, str] = None,
1309
+ quarantine_reason: str = None,
1310
+ is_pdf_problematic: bool = None,
1311
+ ) -> bool:
1025
1312
  """
1026
1313
  Archive or quarantine file based on the state of the PDF processing.
1027
1314
  Uses the central PDF instance and processing context if parameters not provided.
1028
-
1315
+
1029
1316
  Args:
1030
1317
  pdf_instance: Optional PDF instance, defaults to self.current_pdf
1031
1318
  source_file_path: Optional source file path, defaults to processing_context['file_path']
1032
1319
  quarantine_reason: Optional quarantine reason, defaults to processing_context['error_reason']
1033
1320
  is_pdf_problematic: Optional override for problematic state
1034
-
1321
+
1035
1322
  Returns:
1036
1323
  bool: True if file was quarantined, False if archived successfully
1037
1324
  """
1038
1325
  pdf_file = pdf_instance or self.current_pdf
1039
- file_path = Path(source_file_path) if source_file_path else self.processing_context.get('file_path')
1040
- quarantine_reason = quarantine_reason or self.processing_context.get('error_reason')
1041
-
1326
+ file_path = (
1327
+ Path(source_file_path)
1328
+ if source_file_path
1329
+ else self.processing_context.get("file_path")
1330
+ )
1331
+ quarantine_reason = quarantine_reason or self.processing_context.get(
1332
+ "error_reason"
1333
+ )
1334
+
1042
1335
  if not pdf_file:
1043
1336
  raise ValueError("No PDF instance available for archiving/quarantine")
1044
1337
  if not file_path:
1045
1338
  raise ValueError("No file path available for archiving/quarantine")
1046
-
1339
+
1047
1340
  # Determine if the PDF is problematic
1048
- pdf_problematic = is_pdf_problematic if is_pdf_problematic is not None else pdf_file.is_problematic
1049
-
1341
+ pdf_problematic = (
1342
+ is_pdf_problematic
1343
+ if is_pdf_problematic is not None
1344
+ else pdf_file.is_problematic
1345
+ )
1346
+
1050
1347
  if pdf_problematic:
1051
1348
  # Quarantine the file
1052
- logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
1349
+ logger.warning(
1350
+ f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}"
1351
+ )
1053
1352
  quarantine_dir = path_utils.PDF_DIR / "quarantine"
1054
1353
  os.makedirs(quarantine_dir, exist_ok=True)
1055
-
1354
+
1056
1355
  quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
1057
1356
  try:
1058
1357
  shutil.move(file_path, quarantine_path)
1059
- pdf_file.quarantine_reason = quarantine_reason or "File processing failed"
1060
- pdf_file.save(update_fields=['quarantine_reason'])
1358
+ pdf_file.quarantine_reason = (
1359
+ quarantine_reason or "File processing failed"
1360
+ )
1361
+ pdf_file.save(update_fields=["quarantine_reason"])
1061
1362
  logger.info(f"Moved problematic PDF to quarantine: {quarantine_path}")
1062
1363
  return True
1063
1364
  except Exception as e:
1064
1365
  logger.error(f"Failed to quarantine PDF {pdf_file.pdf_hash}: {e}")
1065
- return True # Still consider as quarantined to prevent further processing
1366
+ return (
1367
+ True # Still consider as quarantined to prevent further processing
1368
+ )
1066
1369
  else:
1067
1370
  # Archive the file normally
1068
1371
  logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
1069
1372
  archive_dir = path_utils.PDF_DIR / "processed"
1070
1373
  os.makedirs(archive_dir, exist_ok=True)
1071
-
1374
+
1072
1375
  archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
1073
1376
  try:
1074
1377
  shutil.move(file_path, archive_path)