endoreg-db 0.8.4.7__py3-none-any.whl → 0.8.4.9__py3-none-any.whl

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of endoreg-db might be problematic. Click here for more details.

@@ -4,22 +4,25 @@ PDF import service module.
4
4
  Provides high-level functions for importing and anonymizing PDF files,
5
5
  combining RawPdfFile creation with text extraction and anonymization.
6
6
  """
7
- from datetime import date, datetime
7
+
8
8
  import errno
9
+ import hashlib
9
10
  import logging
11
+ import os
10
12
  import shutil
11
13
  import sys
12
- import os
13
- import hashlib
14
+ import time
15
+ from contextlib import contextmanager
16
+ from datetime import date, datetime
14
17
  from pathlib import Path
15
18
  from typing import TYPE_CHECKING, Union
16
- from contextlib import contextmanager
19
+
17
20
  from django.db import transaction
21
+
22
+ from endoreg_db.models import SensitiveMeta
18
23
  from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
19
24
  from endoreg_db.models.state.raw_pdf import RawPdfState
20
- from endoreg_db.models import SensitiveMeta
21
25
  from endoreg_db.utils import paths as path_utils
22
- import time
23
26
 
24
27
  logger = logging.getLogger(__name__)
25
28
 
@@ -34,24 +37,61 @@ class PdfImportService:
34
37
  """
35
38
  Service class for importing and processing PDF files with text extraction and anonymization.
36
39
  Uses a central PDF instance pattern for cleaner state management.
40
+
41
+ Supports two processing modes:
42
+ - 'blackening': Simple PDF masking with black rectangles over sensitive areas
43
+ - 'cropping': Advanced mode that crops sensitive regions to separate images
37
44
  """
38
-
39
- def __init__(self, allow_meta_overwrite: bool = False):
45
+
46
+ def __init__(self, allow_meta_overwrite: bool = False, processing_mode: str = "blackening"):
40
47
  """
41
48
  Initialize the PDF import service.
42
-
49
+
43
50
  Args:
44
51
  allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
52
+ processing_mode: Processing mode - 'blackening' for simple masking, 'cropping' for advanced cropping
45
53
  """
46
54
  self.processed_files = set()
47
55
  self._report_reader_available = None
48
56
  self._report_reader_class = None
49
57
  self.allow_meta_overwrite = allow_meta_overwrite
50
-
58
+
59
+ # Validate and set processing mode
60
+ valid_modes = ["blackening", "cropping"]
61
+ if processing_mode not in valid_modes:
62
+ raise ValueError(f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}")
63
+ self.processing_mode = processing_mode
64
+
51
65
  # Central PDF instance management
52
66
  self.current_pdf = None
53
67
  self.processing_context = {}
54
-
68
+
69
+ @classmethod
70
+ def with_blackening(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
71
+ """
72
+ Create a PdfImportService configured for simple PDF blackening mode.
73
+
74
+ Args:
75
+ allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
76
+
77
+ Returns:
78
+ PdfImportService instance configured for blackening mode
79
+ """
80
+ return cls(allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening")
81
+
82
+ @classmethod
83
+ def with_cropping(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
84
+ """
85
+ Create a PdfImportService configured for advanced cropping mode.
86
+
87
+ Args:
88
+ allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
89
+
90
+ Returns:
91
+ PdfImportService instance configured for cropping mode
92
+ """
93
+ return cls(allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping")
94
+
55
95
  @contextmanager
56
96
  def _file_lock(self, path: Path):
57
97
  """Create a file lock to prevent duplicate processing.
@@ -75,10 +115,7 @@ class PdfImportService:
75
115
 
76
116
  if age is not None and age > STALE_LOCK_SECONDS:
77
117
  try:
78
- logger.warning(
79
- "Stale lock detected for %s (age %.0fs). Reclaiming lock...",
80
- path, age
81
- )
118
+ logger.warning("Stale lock detected for %s (age %.0fs). Reclaiming lock...", path, age)
82
119
  lock_path.unlink()
83
120
  except Exception as e:
84
121
  logger.warning("Failed to remove stale lock %s: %s", lock_path, e)
@@ -100,7 +137,7 @@ class PdfImportService:
100
137
  lock_path.unlink()
101
138
  except OSError:
102
139
  pass
103
-
140
+
104
141
  def _sha256(self, path: Path, chunk: int = 1024 * 1024) -> str:
105
142
  """Compute SHA256 hash of a file."""
106
143
  h = hashlib.sha256()
@@ -134,7 +171,7 @@ class PdfImportService:
134
171
  return Path(str(candidate))
135
172
  except Exception:
136
173
  return None
137
-
174
+
138
175
  def _quarantine(self, source: Path) -> Path:
139
176
  """Move file to quarantine directory to prevent re-processing."""
140
177
  qdir = path_utils.PDF_DIR / "_processing"
@@ -150,7 +187,7 @@ class PdfImportService:
150
187
  else:
151
188
  raise
152
189
  return target
153
-
190
+
154
191
  def _ensure_state(self, pdf_file: "RawPdfFile"):
155
192
  """Ensure PDF file has a state object."""
156
193
  if getattr(pdf_file, "state", None):
@@ -166,29 +203,30 @@ class PdfImportService:
166
203
  return state
167
204
  except Exception:
168
205
  return None
169
-
206
+
170
207
  def _ensure_report_reading_available(self):
171
208
  """
172
209
  Ensure report reading modules are available by adding lx-anonymizer to path.
173
-
210
+
174
211
  Returns:
175
212
  Tuple of (availability_flag, ReportReader_class)
176
213
  """
177
214
  if self._report_reader_available is not None:
178
215
  return self._report_reader_available, self._report_reader_class
179
-
216
+
180
217
  try:
181
218
  # Try direct import first
182
219
  from lx_anonymizer import ReportReader
183
-
220
+
184
221
  logger.info("Successfully imported lx_anonymizer ReportReader module")
185
222
  self._report_reader_available = True
186
223
  self._report_reader_class = ReportReader
187
224
  return True, ReportReader
188
-
225
+
189
226
  except ImportError:
190
227
  # Optional: honor LX_ANONYMIZER_PATH=/abs/path/to/src
191
228
  import importlib
229
+
192
230
  extra = os.getenv("LX_ANONYMIZER_PATH")
193
231
  if extra and extra not in sys.path and Path(extra).exists():
194
232
  sys.path.insert(0, extra)
@@ -205,18 +243,17 @@ class PdfImportService:
205
243
  # Keep path for future imports if it worked; otherwise remove.
206
244
  if "ReportReader" not in locals() and extra in sys.path:
207
245
  sys.path.remove(extra)
208
-
246
+
209
247
  self._report_reader_available = False
210
248
  self._report_reader_class = None
211
249
  return False, None
212
250
 
213
-
214
251
  def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile" = None) -> None:
215
252
  """
216
253
  Ensure PDF has minimum required patient data in SensitiveMeta.
217
254
  Creates default values if data is missing after text processing.
218
255
  Uses the central PDF instance if no specific instance provided.
219
-
256
+
220
257
  Args:
221
258
  pdf_instance: Optional specific PDF instance, defaults to self.current_pdf
222
259
  """
@@ -224,76 +261,80 @@ class PdfImportService:
224
261
  if not pdf_file:
225
262
  logger.warning("No PDF instance available for ensuring default patient data")
226
263
  return
227
-
264
+
228
265
  if not pdf_file.sensitive_meta:
229
266
  logger.info(f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default")
230
-
267
+
231
268
  # Create default SensitiveMeta with placeholder data
232
269
  default_data = {
233
270
  "patient_first_name": "Patient",
234
- "patient_last_name": "Unknown",
271
+ "patient_last_name": "Unknown",
235
272
  "patient_dob": date(1990, 1, 1), # Default DOB
236
273
  "examination_date": date.today(),
237
- "center_name": pdf_file.center.name if pdf_file.center else "university_hospital_wuerzburg"
274
+ "center_name": pdf_file.center.name if pdf_file.center else "university_hospital_wuerzburg",
238
275
  }
239
-
276
+
240
277
  try:
241
278
  sensitive_meta = SensitiveMeta.create_from_dict(default_data)
242
279
  pdf_file.sensitive_meta = sensitive_meta
243
- pdf_file.save(update_fields=['sensitive_meta'])
280
+ pdf_file.save(update_fields=["sensitive_meta"])
244
281
  logger.info(f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}")
245
282
  except Exception as e:
246
283
  logger.error(f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}")
247
284
 
248
285
  def import_and_anonymize(
249
- self,
250
- file_path: Union[Path, str],
251
- center_name: str,
286
+ self,
287
+ file_path: Union[Path, str],
288
+ center_name: str,
252
289
  delete_source: bool = False,
253
290
  retry: bool = False,
254
291
  ) -> "RawPdfFile":
255
292
  """
256
293
  Import a PDF file and anonymize it using ReportReader.
257
294
  Uses centralized PDF instance management pattern.
258
-
295
+
296
+ The processing mode is determined by the service initialization:
297
+ - 'blackening': Creates an anonymized PDF with black rectangles over sensitive regions
298
+ - 'cropping': Advanced mode that crops sensitive regions to separate images
299
+
259
300
  Args:
260
301
  file_path: Path to the PDF file to import
261
302
  center_name: Name of the center to associate with PDF
262
303
  delete_source: Whether to delete the source file after import
263
304
  retry: Whether this is a retry attempt
264
-
305
+
265
306
  Returns:
266
307
  RawPdfFile instance after import and processing
267
-
308
+
268
309
  Raises:
269
310
  Exception: On any failure during import or processing
270
311
  """
271
312
  try:
272
313
  # Initialize processing context
273
314
  self._initialize_processing_context(file_path, center_name, delete_source, retry)
274
-
315
+
275
316
  # Step 1: Validate and prepare file
276
317
  self._validate_and_prepare_file()
277
-
318
+
278
319
  # Step 2: Create or retrieve PDF instance
279
320
  self._create_or_retrieve_pdf_instance()
280
-
321
+
281
322
  # Early return check - if no PDF instance was created, return None
282
323
  if not self.current_pdf:
283
324
  logger.warning(f"No PDF instance created for {file_path}, returning None")
284
325
  return None
285
-
326
+
286
327
  # Step 3: Setup processing environment
287
328
  self._setup_processing_environment()
288
-
329
+
289
330
  # Step 4: Process text and metadata
290
331
  self._process_text_and_metadata()
291
-
332
+
292
333
  # Step 5: Finalize processing
293
334
  self._finalize_processing()
294
-
335
+
295
336
  return self.current_pdf
296
-
337
+
297
338
  except ValueError as e:
298
339
  # Handle "File already being processed" case specifically
299
340
  if "already being processed" in str(e):
@@ -312,50 +353,49 @@ class PdfImportService:
312
353
  # Always cleanup context
313
354
  self._cleanup_processing_context()
314
355
 
315
- def _initialize_processing_context(self, file_path: Union[Path, str], center_name: str,
316
- delete_source: bool, retry: bool):
356
+ def _initialize_processing_context(self, file_path: Union[Path, str], center_name: str, delete_source: bool, retry: bool):
317
357
  """Initialize the processing context for the current PDF."""
318
358
  self.processing_context = {
319
- 'file_path': Path(file_path),
320
- 'original_file_path': Path(file_path),
321
- 'center_name': center_name,
322
- 'delete_source': delete_source,
323
- 'retry': retry,
324
- 'file_hash': None,
325
- 'processing_started': False,
326
- 'text_extracted': False,
327
- 'metadata_processed': False,
328
- 'anonymization_completed': False
359
+ "file_path": Path(file_path),
360
+ "original_file_path": Path(file_path),
361
+ "center_name": center_name,
362
+ "delete_source": delete_source,
363
+ "retry": retry,
364
+ "file_hash": None,
365
+ "processing_started": False,
366
+ "text_extracted": False,
367
+ "metadata_processed": False,
368
+ "anonymization_completed": False,
329
369
  }
330
-
370
+
331
371
  # Check if already processed (only during current session to prevent race conditions)
332
372
  if str(file_path) in self.processed_files:
333
373
  logger.info(f"File {file_path} already being processed in current session, skipping")
334
374
  raise ValueError("File already being processed")
335
-
375
+
336
376
  logger.info(f"Starting import and processing for: {file_path}")
337
377
 
338
378
  def _validate_and_prepare_file(self):
339
379
  """Validate file existence and calculate hash."""
340
- file_path = self.processing_context['file_path']
341
-
380
+ file_path = self.processing_context["file_path"]
381
+
342
382
  if not file_path.exists():
343
383
  raise FileNotFoundError(f"PDF file not found: {file_path}")
344
-
384
+
345
385
  try:
346
- self.processing_context['file_hash'] = self._sha256(file_path)
386
+ self.processing_context["file_hash"] = self._sha256(file_path)
347
387
  except Exception as e:
348
388
  logger.warning(f"Could not calculate file hash: {e}")
349
- self.processing_context['file_hash'] = None
389
+ self.processing_context["file_hash"] = None
350
390
 
351
391
  def _create_or_retrieve_pdf_instance(self):
352
392
  """Create new or retrieve existing PDF instance."""
353
- file_path = self.processing_context['file_path']
354
- center_name = self.processing_context['center_name']
355
- delete_source = self.processing_context['delete_source']
356
- retry = self.processing_context['retry']
357
- file_hash = self.processing_context['file_hash']
358
-
393
+ file_path = self.processing_context["file_path"]
394
+ center_name = self.processing_context["center_name"]
395
+ delete_source = self.processing_context["delete_source"]
396
+ retry = self.processing_context["retry"]
397
+ file_hash = self.processing_context["file_hash"]
398
+
359
399
  if not retry:
360
400
  # Check for existing PDF and handle duplicates
361
401
  with self._file_lock(file_path):
@@ -373,11 +413,11 @@ class PdfImportService:
373
413
  # Retry processing
374
414
  logger.info(f"Reprocessing existing PDF {existing.pdf_hash}")
375
415
  return self._retry_existing_pdf(existing)
376
-
416
+
377
417
  # Create new PDF instance
378
418
  logger.info("Creating new RawPdfFile instance...")
379
419
  from django.db import IntegrityError
380
-
420
+
381
421
  try:
382
422
  if not retry:
383
423
  self.current_pdf = RawPdfFile.create_from_file_initialized(
@@ -389,17 +429,17 @@ class PdfImportService:
389
429
  # Retrieve existing for retry
390
430
  self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
391
431
  logger.info(f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}")
392
-
432
+
393
433
  # Check if retry is actually needed
394
434
  if self.current_pdf.text:
395
435
  logger.info(f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning")
396
436
  return
397
-
437
+
398
438
  if not self.current_pdf:
399
439
  raise RuntimeError("Failed to create RawPdfFile instance")
400
-
440
+
401
441
  logger.info(f"PDF instance ready: {self.current_pdf.pdf_hash}")
402
-
442
+
403
443
  except IntegrityError:
404
444
  # Race condition - another worker created it
405
445
  if file_hash:
@@ -410,27 +450,27 @@ class PdfImportService:
410
450
 
411
451
  def _setup_processing_environment(self):
412
452
  """Setup processing environment and state."""
413
- original_path = self.processing_context.get('file_path')
453
+ original_path = self.processing_context.get("file_path")
414
454
 
415
455
  # Create sensitive file copy
416
456
  self.create_sensitive_file(self.current_pdf, original_path)
417
-
457
+
418
458
  # Update file path to point to sensitive copy
419
- self.processing_context['file_path'] = self.current_pdf.file.path
420
- self.processing_context['sensitive_copy_created'] = True
459
+ self.processing_context["file_path"] = self.current_pdf.file.path
460
+ self.processing_context["sensitive_copy_created"] = True
421
461
  try:
422
- self.processing_context['sensitive_file_path'] = Path(self.current_pdf.file.path)
462
+ self.processing_context["sensitive_file_path"] = Path(self.current_pdf.file.path)
423
463
  except Exception:
424
- self.processing_context['sensitive_file_path'] = None
425
-
464
+ self.processing_context["sensitive_file_path"] = None
465
+
426
466
  # Ensure state exists
427
467
  state = self.current_pdf.get_or_create_state()
428
468
  state.mark_processing_started()
429
- self.processing_context['processing_started'] = True
430
-
469
+ self.processing_context["processing_started"] = True
470
+
431
471
  # Mark as processed to prevent duplicates
432
- self.processed_files.add(str(self.processing_context['file_path']))
433
-
472
+ self.processed_files.add(str(self.processing_context["file_path"]))
473
+
434
474
  # Ensure default patient data
435
475
  logger.info("Ensuring default patient data...")
436
476
  self._ensure_default_patient_data(self.current_pdf)
@@ -438,83 +478,138 @@ class PdfImportService:
438
478
  def _process_text_and_metadata(self):
439
479
  """Process text extraction and metadata using ReportReader."""
440
480
  report_reading_available, ReportReader = self._ensure_report_reading_available()
441
-
481
+
442
482
  if not report_reading_available:
443
483
  logger.warning("Report reading not available (lx_anonymizer not found)")
444
484
  self._mark_processing_incomplete("no_report_reader")
445
485
  return
446
-
486
+
447
487
  if not self.current_pdf.file:
448
488
  logger.warning("No file available for text processing")
449
489
  self._mark_processing_incomplete("no_file")
450
490
  return
451
-
491
+
452
492
  try:
453
- logger.info("Starting text extraction and metadata processing with ReportReader...")
454
-
455
- # Setup output directories
456
- crops_dir = path_utils.PDF_DIR / 'cropped_regions'
457
- anonymized_dir = path_utils.PDF_DIR / 'anonymized'
458
- crops_dir.mkdir(parents=True, exist_ok=True)
459
- anonymized_dir.mkdir(parents=True, exist_ok=True)
493
+ logger.info(f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})...")
460
494
 
461
495
  # Initialize ReportReader
462
- report_reader = ReportReader(
463
- report_root_path=str(path_utils.STORAGE_DIR),
464
- locale="de_DE",
465
- text_date_format="%d.%m.%Y"
466
- )
496
+ report_reader = ReportReader(report_root_path=str(path_utils.STORAGE_DIR), locale="de_DE", text_date_format="%d.%m.%Y")
497
+
498
+ if self.processing_mode == "cropping":
499
+ # Use advanced cropping method (existing implementation)
500
+ self._process_with_cropping(report_reader)
501
+ else: # blackening mode
502
+ # Use enhanced process_report with PDF masking
503
+ self._process_with_blackening(report_reader)
467
504
 
468
- # Process with cropping
469
- original_text, anonymized_text, extracted_metadata, cropped_regions, anonymized_pdf_path = report_reader.process_report_with_cropping(
470
- pdf_path=self.processing_context['file_path'],
471
- crop_sensitive_regions=True,
472
- crop_output_dir=str(crops_dir),
473
- anonymization_output_dir=str(anonymized_dir)
474
- )
475
-
476
- # Store results in context
477
- self.processing_context.update({
478
- 'original_text': original_text,
479
- 'anonymized_text': anonymized_text,
480
- 'extracted_metadata': extracted_metadata,
481
- 'cropped_regions': cropped_regions,
482
- 'anonymized_pdf_path': anonymized_pdf_path
483
- })
484
-
485
- if original_text:
486
- self._apply_text_results()
487
- self.processing_context['text_extracted'] = True
488
-
489
- if extracted_metadata:
490
- self._apply_metadata_results()
491
- self.processing_context['metadata_processed'] = True
492
-
493
- if anonymized_pdf_path:
494
- self._apply_anonymized_pdf()
495
- self.processing_context['anonymization_completed'] = True
496
-
497
505
  except Exception as e:
498
506
  logger.warning(f"Text processing failed: {e}")
499
507
  self._mark_processing_incomplete("text_processing_failed")
500
508
 
509
+ def _process_with_blackening(self, report_reader):
510
+ """Process PDF using simple blackening/masking mode."""
511
+ logger.info("Using simple PDF blackening mode...")
512
+
513
+ # Setup anonymized directory
514
+ anonymized_dir = path_utils.PDF_DIR / "anonymized"
515
+ anonymized_dir.mkdir(parents=True, exist_ok=True)
516
+
517
+ # Generate output path for anonymized PDF
518
+ pdf_hash = self.current_pdf.pdf_hash
519
+ anonymized_output_path = anonymized_dir / f"{pdf_hash}_anonymized.pdf"
520
+
521
+ # Process with enhanced process_report method (returns 4-tuple now)
522
+ original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = report_reader.process_report(
523
+ pdf_path=self.processing_context["file_path"], create_anonymized_pdf=True, anonymized_pdf_output_path=str(anonymized_output_path)
524
+ )
525
+
526
+ # Store results in context
527
+ self.processing_context.update(
528
+ {
529
+ "original_text": original_text,
530
+ "anonymized_text": anonymized_text,
531
+ "extracted_metadata": extracted_metadata,
532
+ "cropped_regions": None, # Not available in blackening mode
533
+ "anonymized_pdf_path": anonymized_pdf_path,
534
+ }
535
+ )
536
+
537
+ # Apply results
538
+ if original_text:
539
+ self._apply_text_results()
540
+ self.processing_context["text_extracted"] = True
541
+
542
+ if extracted_metadata:
543
+ self._apply_metadata_results()
544
+ self.processing_context["metadata_processed"] = True
545
+
546
+ if anonymized_pdf_path:
547
+ self._apply_anonymized_pdf()
548
+ self.processing_context["anonymization_completed"] = True
549
+
550
+ logger.info("PDF blackening processing completed")
551
+
552
+ def _process_with_cropping(self, report_reader):
553
+ """Process PDF using advanced cropping mode (existing implementation)."""
554
+ logger.info("Using advanced cropping mode...")
555
+
556
+ # Setup output directories
557
+ crops_dir = path_utils.PDF_DIR / "cropped_regions"
558
+ anonymized_dir = path_utils.PDF_DIR / "anonymized"
559
+ crops_dir.mkdir(parents=True, exist_ok=True)
560
+ anonymized_dir.mkdir(parents=True, exist_ok=True)
561
+
562
+ # Process with cropping (returns 5-tuple)
563
+ original_text, anonymized_text, extracted_metadata, cropped_regions, anonymized_pdf_path = report_reader.process_report_with_cropping(
564
+ pdf_path=self.processing_context["file_path"],
565
+ crop_sensitive_regions=True,
566
+ crop_output_dir=str(crops_dir),
567
+ anonymization_output_dir=str(anonymized_dir),
568
+ )
569
+
570
+ # Store results in context
571
+ self.processing_context.update(
572
+ {
573
+ "original_text": original_text,
574
+ "anonymized_text": anonymized_text,
575
+ "extracted_metadata": extracted_metadata,
576
+ "cropped_regions": cropped_regions,
577
+ "anonymized_pdf_path": anonymized_pdf_path,
578
+ }
579
+ )
580
+
581
+ # Apply results
582
+ if original_text:
583
+ self._apply_text_results()
584
+ self.processing_context["text_extracted"] = True
585
+
586
+ if extracted_metadata:
587
+ self._apply_metadata_results()
588
+ self.processing_context["metadata_processed"] = True
589
+
590
+ if anonymized_pdf_path:
591
+ self._apply_anonymized_pdf()
592
+ self.processing_context["anonymization_completed"] = True
593
+
594
+ logger.info("PDF cropping processing completed")
595
+
501
596
  def _apply_text_results(self):
502
597
  """Apply text extraction results to the PDF instance."""
503
598
  if not self.current_pdf:
504
599
  logger.warning("Cannot apply text results - no PDF instance available")
505
600
  return
506
-
507
- original_text = self.processing_context.get('original_text')
508
- anonymized_text = self.processing_context.get('anonymized_text')
509
-
601
+
602
+ original_text = self.processing_context.get("original_text")
603
+ anonymized_text = self.processing_context.get("anonymized_text")
604
+
510
605
  if not original_text:
511
606
  logger.warning("No original text available to apply")
512
607
  return
513
-
608
+
514
609
  # Store extracted text
515
610
  self.current_pdf.text = original_text
516
611
  logger.info(f"Extracted {len(original_text)} characters of text from PDF")
517
-
612
+
518
613
  # Handle anonymized text
519
614
  if anonymized_text and anonymized_text != original_text:
520
615
  self.current_pdf.anonymized = True
@@ -525,56 +620,52 @@ class PdfImportService:
525
620
  if not self.current_pdf:
526
621
  logger.warning("Cannot apply metadata results - no PDF instance available")
527
622
  return
528
-
529
- extracted_metadata = self.processing_context.get('extracted_metadata')
530
-
623
+
624
+ extracted_metadata = self.processing_context.get("extracted_metadata")
625
+
531
626
  if not self.current_pdf.sensitive_meta or not extracted_metadata:
532
627
  logger.debug("No sensitive meta or extracted metadata available")
533
628
  return
534
-
629
+
535
630
  sm = self.current_pdf.sensitive_meta
536
-
631
+
537
632
  # Map ReportReader metadata to SensitiveMeta fields
538
633
  metadata_mapping = {
539
- 'patient_first_name': 'patient_first_name',
540
- 'patient_last_name': 'patient_last_name',
541
- 'patient_dob': 'patient_dob',
542
- 'examination_date': 'examination_date',
543
- 'examiner_first_name': 'examiner_first_name',
544
- 'examiner_last_name': 'examiner_last_name',
545
- 'endoscope_type': 'endoscope_type',
546
- 'casenumber': 'case_number'
634
+ "patient_first_name": "patient_first_name",
635
+ "patient_last_name": "patient_last_name",
636
+ "patient_dob": "patient_dob",
637
+ "examination_date": "examination_date",
638
+ "examiner_first_name": "examiner_first_name",
639
+ "examiner_last_name": "examiner_last_name",
640
+ "endoscope_type": "endoscope_type",
641
+ "casenumber": "case_number",
547
642
  }
548
-
643
+
549
644
  # Update fields with extracted information
550
645
  updated_fields = []
551
646
  for meta_key, sm_field in metadata_mapping.items():
552
647
  if extracted_metadata.get(meta_key) and hasattr(sm, sm_field):
553
648
  old_value = getattr(sm, sm_field)
554
649
  raw_value = extracted_metadata[meta_key]
555
-
650
+
556
651
  # Skip if we just got the field name as a string (indicates no actual data)
557
652
  if isinstance(raw_value, str) and raw_value == meta_key:
558
653
  continue
559
-
654
+
560
655
  # Handle date fields specially
561
- if sm_field in ['patient_dob', 'examination_date']:
656
+ if sm_field in ["patient_dob", "examination_date"]:
562
657
  new_value = self._parse_date_field(raw_value, meta_key, sm_field)
563
658
  if new_value is None:
564
659
  continue
565
660
  else:
566
661
  new_value = raw_value
567
-
662
+
568
663
  # Configurable overwrite policy
569
- should_overwrite = (
570
- self.allow_meta_overwrite
571
- or not old_value
572
- or old_value in ['Patient', 'Unknown']
573
- )
664
+ should_overwrite = self.allow_meta_overwrite or not old_value or old_value in ["Patient", "Unknown"]
574
665
  if new_value and should_overwrite:
575
666
  setattr(sm, sm_field, new_value)
576
667
  updated_fields.append(sm_field)
577
-
668
+
578
669
  if updated_fields:
579
670
  sm.save()
580
671
  logger.info(f"Updated SensitiveMeta fields: {updated_fields}")
@@ -585,28 +676,25 @@ class PdfImportService:
585
676
  if isinstance(raw_value, str):
586
677
  # Skip if the value is just the field name itself
587
678
  if raw_value == meta_key:
588
- logger.warning(
589
- "Skipping date field %s - got field name '%s' instead of actual date",
590
- sm_field, raw_value
591
- )
679
+ logger.warning("Skipping date field %s - got field name '%s' instead of actual date", sm_field, raw_value)
592
680
  return None
593
-
681
+
594
682
  # Try common date formats
595
- date_formats = ['%Y-%m-%d', '%d.%m.%Y', '%d/%m/%Y', '%m/%d/%Y']
683
+ date_formats = ["%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y"]
596
684
  for fmt in date_formats:
597
685
  try:
598
686
  return datetime.strptime(raw_value, fmt).date()
599
687
  except ValueError:
600
688
  continue
601
-
689
+
602
690
  logger.warning("Could not parse date '%s' for field %s", raw_value, sm_field)
603
691
  return None
604
-
605
- elif hasattr(raw_value, 'date'):
692
+
693
+ elif hasattr(raw_value, "date"):
606
694
  return raw_value.date()
607
695
  else:
608
696
  return raw_value
609
-
697
+
610
698
  except (ValueError, AttributeError) as e:
611
699
  logger.warning("Date parsing failed for %s: %s", sm_field, e)
612
700
  return None
@@ -626,7 +714,7 @@ class PdfImportService:
626
714
  logger.warning("Cannot apply anonymized PDF - no PDF instance available")
627
715
  return
628
716
 
629
- anonymized_pdf_path = self.processing_context.get('anonymized_pdf_path')
717
+ anonymized_pdf_path = self.processing_context.get("anonymized_pdf_path")
630
718
  if not anonymized_pdf_path:
631
719
  logger.debug("No anonymized_pdf_path present in processing context")
632
720
  return
@@ -647,7 +735,7 @@ class PdfImportService:
647
735
  relative_name = str(anonymized_path)
648
736
 
649
737
  # Only update if something actually changed
650
- if getattr(self.current_pdf.anonymized_file, 'name', None) != relative_name:
738
+ if getattr(self.current_pdf.anonymized_file, "name", None) != relative_name:
651
739
  self.current_pdf.anonymized_file.name = relative_name
652
740
 
653
741
  # Ensure model/state reflect anonymization even if text didn't differ
@@ -656,14 +744,14 @@ class PdfImportService:
656
744
 
657
745
  # Persist cropped regions info somewhere useful (optional & non-breaking)
658
746
  # If your model has a field for this, persist there; otherwise we just log.
659
- cropped_regions = self.processing_context.get('cropped_regions')
747
+ cropped_regions = self.processing_context.get("cropped_regions")
660
748
  if cropped_regions:
661
749
  logger.debug("Cropped regions recorded (%d regions).", len(cropped_regions))
662
750
 
663
751
  # Save model changes
664
- update_fields = ['anonymized_file']
665
- if 'anonymized' in self.current_pdf.__dict__:
666
- update_fields.append('anonymized')
752
+ update_fields = ["anonymized_file"]
753
+ if "anonymized" in self.current_pdf.__dict__:
754
+ update_fields.append("anonymized")
667
755
  self.current_pdf.save(update_fields=update_fields)
668
756
 
669
757
  # Mark state as anonymized immediately; this keeps downstream flows working
@@ -676,26 +764,25 @@ class PdfImportService:
676
764
  except Exception as e:
677
765
  logger.warning("Could not set anonymized file reference: %s", e)
678
766
 
679
-
680
767
  def _finalize_processing(self):
681
768
  """Finalize processing and update state."""
682
769
  if not self.current_pdf:
683
770
  logger.warning("Cannot finalize processing - no PDF instance available")
684
771
  return
685
-
772
+
686
773
  try:
687
774
  # Update state based on processing results
688
775
  state = self._ensure_state(self.current_pdf)
689
-
690
- if self.processing_context.get('text_extracted') and state:
776
+
777
+ if self.processing_context.get("text_extracted") and state:
691
778
  state.mark_anonymized()
692
-
779
+
693
780
  # Save all changes
694
781
  with transaction.atomic():
695
782
  self.current_pdf.save()
696
783
  if state:
697
784
  state.save()
698
-
785
+
699
786
  logger.info("PDF processing completed successfully")
700
787
  except Exception as e:
701
788
  logger.warning(f"Failed to finalize processing: {e}")
@@ -705,7 +792,7 @@ class PdfImportService:
705
792
  if not self.current_pdf:
706
793
  logger.warning(f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}")
707
794
  return
708
-
795
+
709
796
  try:
710
797
  state = self._ensure_state(self.current_pdf)
711
798
  if state:
@@ -714,7 +801,7 @@ class PdfImportService:
714
801
  state.sensitive_meta_processed = False
715
802
  state.save()
716
803
  logger.info(f"Set PDF state: processed=False due to {reason}")
717
-
804
+
718
805
  # Save changes
719
806
  with transaction.atomic():
720
807
  self.current_pdf.save()
@@ -729,12 +816,12 @@ class PdfImportService:
729
816
  if file_path_str and file_path_str in self.processed_files:
730
817
  self.processed_files.remove(file_path_str)
731
818
  logger.debug(f"Removed {file_path_str} from processed files for retry")
732
-
819
+
733
820
  return self.import_and_anonymize(
734
821
  file_path=existing_pdf.file.path,
735
822
  center_name=existing_pdf.center.name if existing_pdf.center else "unknown_center",
736
823
  delete_source=False,
737
- retry=True
824
+ retry=True,
738
825
  )
739
826
  except Exception as e:
740
827
  logger.error(f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}")
@@ -744,9 +831,9 @@ class PdfImportService:
744
831
  def _cleanup_on_error(self):
745
832
  """Cleanup processing context on error."""
746
833
  try:
747
- if self.current_pdf and hasattr(self.current_pdf, 'state'):
834
+ if self.current_pdf and hasattr(self.current_pdf, "state"):
748
835
  state = self._ensure_state(self.current_pdf)
749
- if state and self.processing_context.get('processing_started'):
836
+ if state and self.processing_context.get("processing_started"):
750
837
  state.text_meta_extracted = False
751
838
  state.pdf_meta_extracted = False
752
839
  state.sensitive_meta_processed = False
@@ -756,7 +843,7 @@ class PdfImportService:
756
843
  logger.warning(f"Error during cleanup: {e}")
757
844
  finally:
758
845
  # Remove any sensitive copy created during this processing run
759
- sensitive_created = self.processing_context.get('sensitive_copy_created')
846
+ sensitive_created = self.processing_context.get("sensitive_copy_created")
760
847
  if sensitive_created:
761
848
  pdf_obj = self.current_pdf
762
849
  try:
@@ -770,20 +857,16 @@ class PdfImportService:
770
857
  logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
771
858
 
772
859
  # Always clean up processed files set to prevent blocks
773
- file_path = self.processing_context.get('file_path')
860
+ file_path = self.processing_context.get("file_path")
774
861
  if file_path and str(file_path) in self.processed_files:
775
862
  self.processed_files.remove(str(file_path))
776
863
  logger.debug(f"Removed {file_path} from processed files during error cleanup")
777
864
 
778
865
  try:
779
- original_path = self.processing_context.get('original_file_path')
866
+ original_path = self.processing_context.get("original_file_path")
780
867
  logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
781
868
  raw_dir = original_path.parent if isinstance(original_path, Path) else None
782
- if (
783
- isinstance(original_path, Path)
784
- and original_path.exists()
785
- and not self.processing_context.get('sensitive_copy_created')
786
- ):
869
+ if isinstance(original_path, Path) and original_path.exists() and not self.processing_context.get("sensitive_copy_created"):
787
870
  try:
788
871
  original_path.unlink()
789
872
  logger.info("Removed original file %s during error cleanup", original_path)
@@ -822,7 +905,7 @@ class PdfImportService:
822
905
  raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
823
906
  pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
824
907
 
825
- sensitive_path = self.processing_context.get('sensitive_file_path')
908
+ sensitive_path = self.processing_context.get("sensitive_file_path")
826
909
  if sensitive_path:
827
910
  sensitive_parent = Path(sensitive_path).parent
828
911
  sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
@@ -843,17 +926,17 @@ class PdfImportService:
843
926
  """Cleanup processing context."""
844
927
  try:
845
928
  # Clean up temporary directories
846
- if self.processing_context.get('text_extracted'):
847
- crops_dir = path_utils.PDF_DIR / 'cropped_regions'
929
+ if self.processing_context.get("text_extracted"):
930
+ crops_dir = path_utils.PDF_DIR / "cropped_regions"
848
931
  if crops_dir.exists() and not any(crops_dir.iterdir()):
849
932
  crops_dir.rmdir()
850
-
933
+
851
934
  # Always remove from processed files set after processing attempt
852
- file_path = self.processing_context.get('file_path')
935
+ file_path = self.processing_context.get("file_path")
853
936
  if file_path and str(file_path) in self.processed_files:
854
937
  self.processed_files.remove(str(file_path))
855
938
  logger.debug(f"Removed {file_path} from processed files set")
856
-
939
+
857
940
  except Exception as e:
858
941
  logger.warning(f"Error during context cleanup: {e}")
859
942
  finally:
@@ -861,45 +944,40 @@ class PdfImportService:
861
944
  self.current_pdf = None
862
945
  self.processing_context = {}
863
946
 
864
- def import_simple(
865
- self,
866
- file_path: Union[Path, str],
867
- center_name: str,
868
- delete_source: bool = False
869
- ) -> "RawPdfFile":
947
+ def import_simple(self, file_path: Union[Path, str], center_name: str, delete_source: bool = False) -> "RawPdfFile":
870
948
  """
871
949
  Simple PDF import without text processing or anonymization.
872
950
  Uses centralized PDF instance management pattern.
873
-
951
+
874
952
  Args:
875
953
  file_path: Path to the PDF file to import
876
954
  center_name: Name of the center to associate with PDF
877
955
  delete_source: Whether to delete the source file after import
878
-
956
+
879
957
  Returns:
880
958
  RawPdfFile instance after basic import
881
959
  """
882
960
  try:
883
961
  # Initialize simple processing context
884
962
  self._initialize_processing_context(file_path, center_name, delete_source, False)
885
-
963
+
886
964
  # Validate file
887
965
  self._validate_and_prepare_file()
888
-
966
+
889
967
  # Create PDF instance
890
968
  logger.info("Starting simple import - creating RawPdfFile instance...")
891
969
  self.current_pdf = RawPdfFile.create_from_file_initialized(
892
- file_path=self.processing_context['file_path'],
970
+ file_path=self.processing_context["file_path"],
893
971
  center_name=center_name,
894
972
  delete_source=delete_source,
895
973
  )
896
-
974
+
897
975
  if not self.current_pdf:
898
976
  raise RuntimeError("Failed to create RawPdfFile instance")
899
-
977
+
900
978
  # Mark as processed
901
- self.processed_files.add(str(self.processing_context['file_path']))
902
-
979
+ self.processed_files.add(str(self.processing_context["file_path"]))
980
+
903
981
  # Set basic state for simple import
904
982
  state = self._ensure_state(self.current_pdf)
905
983
  if state:
@@ -908,45 +986,46 @@ class PdfImportService:
908
986
  state.sensitive_meta_processed = False
909
987
  state.save()
910
988
  logger.info("Set PDF state: processed=False for simple import")
911
-
989
+
912
990
  # Save changes
913
991
  with transaction.atomic():
914
992
  self.current_pdf.save()
915
-
993
+
916
994
  logger.info("Simple import completed for RawPdfFile hash: %s", self.current_pdf.pdf_hash)
917
995
  return self.current_pdf
918
-
996
+
919
997
  except Exception as e:
920
998
  logger.error(f"Simple PDF import failed for {file_path}: {e}")
921
999
  self._cleanup_on_error()
922
1000
  raise
923
1001
  finally:
924
1002
  self._cleanup_processing_context()
925
-
1003
+
926
1004
  def check_storage_capacity(self, file_path: Union[Path, str], storage_root, min_required_space) -> None:
927
1005
  """
928
1006
  Check if there is sufficient storage capacity for the PDF file.
929
-
1007
+
930
1008
  Args:
931
1009
  file_path: Path to the PDF file to check
932
-
1010
+
933
1011
  Raises:
934
1012
  InsufficientStorageError: If there is not enough space
935
1013
  """
936
1014
  import shutil
1015
+
937
1016
  from endoreg_db.exceptions import InsufficientStorageError
938
-
1017
+
939
1018
  file_path = Path(file_path)
940
1019
  if not file_path.exists():
941
1020
  raise FileNotFoundError(f"File not found for storage check: {file_path}")
942
-
1021
+
943
1022
  # Get the size of the file
944
1023
  file_size = file_path.stat().st_size
945
-
1024
+
946
1025
  # Get available space in the storage directory
947
1026
 
948
1027
  total, used, free = shutil.disk_usage(storage_root)
949
-
1028
+
950
1029
  if file_size:
951
1030
  min_required_space = file_size if isinstance(min_required_space, int) else 0
952
1031
 
@@ -954,9 +1033,9 @@ class PdfImportService:
954
1033
  if file_size > free:
955
1034
  raise InsufficientStorageError(f"Not enough space to store PDF file: {file_path}")
956
1035
  logger.info(f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available")
957
-
1036
+
958
1037
  return True
959
-
1038
+
960
1039
  def create_sensitive_file(self, pdf_instance: "RawPdfFile" = None, file_path: Union[Path, str] = None) -> None:
961
1040
  """
962
1041
  Create a copy of the PDF file in the sensitive directory and update the file reference.
@@ -966,7 +1045,7 @@ class PdfImportService:
966
1045
  Ensures the FileField points to the file under STORAGE_DIR/pdfs/sensitive and never back to raw_pdfs.
967
1046
  """
968
1047
  pdf_file = pdf_instance or self.current_pdf
969
- source_path = Path(file_path) if file_path else self.processing_context.get('file_path')
1048
+ source_path = Path(file_path) if file_path else self.processing_context.get("file_path")
970
1049
 
971
1050
  if not pdf_file:
972
1051
  raise ValueError("No PDF instance available for creating sensitive file")
@@ -1002,9 +1081,9 @@ class PdfImportService:
1002
1081
  relative_name = str(target)
1003
1082
 
1004
1083
  # Only update when changed
1005
- if getattr(pdf_file.file, 'name', None) != relative_name:
1084
+ if getattr(pdf_file.file, "name", None) != relative_name:
1006
1085
  pdf_file.file.name = relative_name
1007
- pdf_file.save(update_fields=['file'])
1086
+ pdf_file.save(update_fields=["file"])
1008
1087
  logger.info("Updated PDF FileField reference to sensitive path: %s", pdf_file.file.path)
1009
1088
  else:
1010
1089
  logger.debug("PDF FileField already points to sensitive path: %s", pdf_file.file.path)
@@ -1020,44 +1099,45 @@ class PdfImportService:
1020
1099
  except Exception as e:
1021
1100
  logger.warning(f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}", exc_info=True)
1022
1101
 
1023
- def archive_or_quarantine_file(self, pdf_instance: "RawPdfFile" = None, source_file_path: Union[Path, str] = None,
1024
- quarantine_reason: str = None, is_pdf_problematic: bool = None) -> bool:
1102
+ def archive_or_quarantine_file(
1103
+ self, pdf_instance: "RawPdfFile" = None, source_file_path: Union[Path, str] = None, quarantine_reason: str = None, is_pdf_problematic: bool = None
1104
+ ) -> bool:
1025
1105
  """
1026
1106
  Archive or quarantine file based on the state of the PDF processing.
1027
1107
  Uses the central PDF instance and processing context if parameters not provided.
1028
-
1108
+
1029
1109
  Args:
1030
1110
  pdf_instance: Optional PDF instance, defaults to self.current_pdf
1031
1111
  source_file_path: Optional source file path, defaults to processing_context['file_path']
1032
1112
  quarantine_reason: Optional quarantine reason, defaults to processing_context['error_reason']
1033
1113
  is_pdf_problematic: Optional override for problematic state
1034
-
1114
+
1035
1115
  Returns:
1036
1116
  bool: True if file was quarantined, False if archived successfully
1037
1117
  """
1038
1118
  pdf_file = pdf_instance or self.current_pdf
1039
- file_path = Path(source_file_path) if source_file_path else self.processing_context.get('file_path')
1040
- quarantine_reason = quarantine_reason or self.processing_context.get('error_reason')
1041
-
1119
+ file_path = Path(source_file_path) if source_file_path else self.processing_context.get("file_path")
1120
+ quarantine_reason = quarantine_reason or self.processing_context.get("error_reason")
1121
+
1042
1122
  if not pdf_file:
1043
1123
  raise ValueError("No PDF instance available for archiving/quarantine")
1044
1124
  if not file_path:
1045
1125
  raise ValueError("No file path available for archiving/quarantine")
1046
-
1126
+
1047
1127
  # Determine if the PDF is problematic
1048
1128
  pdf_problematic = is_pdf_problematic if is_pdf_problematic is not None else pdf_file.is_problematic
1049
-
1129
+
1050
1130
  if pdf_problematic:
1051
1131
  # Quarantine the file
1052
1132
  logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
1053
1133
  quarantine_dir = path_utils.PDF_DIR / "quarantine"
1054
1134
  os.makedirs(quarantine_dir, exist_ok=True)
1055
-
1135
+
1056
1136
  quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
1057
1137
  try:
1058
1138
  shutil.move(file_path, quarantine_path)
1059
1139
  pdf_file.quarantine_reason = quarantine_reason or "File processing failed"
1060
- pdf_file.save(update_fields=['quarantine_reason'])
1140
+ pdf_file.save(update_fields=["quarantine_reason"])
1061
1141
  logger.info(f"Moved problematic PDF to quarantine: {quarantine_path}")
1062
1142
  return True
1063
1143
  except Exception as e:
@@ -1068,7 +1148,7 @@ class PdfImportService:
1068
1148
  logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
1069
1149
  archive_dir = path_utils.PDF_DIR / "processed"
1070
1150
  os.makedirs(archive_dir, exist_ok=True)
1071
-
1151
+
1072
1152
  archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
1073
1153
  try:
1074
1154
  shutil.move(file_path, archive_path)