endoreg-db 0.8.5.5__py3-none-any.whl → 0.8.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of endoreg-db might be problematic.
- endoreg_db/services/pdf_import.py +292 -77
- {endoreg_db-0.8.5.5.dist-info → endoreg_db-0.8.5.6.dist-info}/METADATA +1 -1
- {endoreg_db-0.8.5.5.dist-info → endoreg_db-0.8.5.6.dist-info}/RECORD +5 -5
- {endoreg_db-0.8.5.5.dist-info → endoreg_db-0.8.5.6.dist-info}/WHEEL +0 -0
- {endoreg_db-0.8.5.5.dist-info → endoreg_db-0.8.5.6.dist-info}/licenses/LICENSE +0 -0
endoreg_db/services/pdf_import.py

@@ -43,7 +43,9 @@ class PdfImportService:
         - 'cropping': Advanced mode that crops sensitive regions to separate images
     """
 
-    def __init__(self, allow_meta_overwrite: bool = False, processing_mode: str = "blackening"):
+    def __init__(
+        self, allow_meta_overwrite: bool = False, processing_mode: str = "blackening"
+    ):
         """
         Initialize the PDF import service.
 
@@ -59,7 +61,9 @@ class PdfImportService:
         # Validate and set processing mode
         valid_modes = ["blackening", "cropping"]
         if processing_mode not in valid_modes:
-            raise ValueError(f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}")
+            raise ValueError(
+                f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}"
+            )
         self.processing_mode = processing_mode
 
         # Central PDF instance management
@@ -77,7 +81,9 @@ class PdfImportService:
         Returns:
             PdfImportService instance configured for blackening mode
         """
-        return cls(allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening")
+        return cls(
+            allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening"
+        )
 
     @classmethod
     def with_cropping(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
@@ -90,7 +96,9 @@ class PdfImportService:
         Returns:
             PdfImportService instance configured for cropping mode
         """
-        return cls(allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping")
+        return cls(
+            allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping"
+        )
 
     @contextmanager
     def _file_lock(self, path: Path):
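Note: the three hunks above only re-wrap the constructor and its factory classmethods; the public API is unchanged. As a minimal usage sketch (class, method, and parameter names are taken from the diff; everything else is assumed):

    from endoreg_db.services.pdf_import import PdfImportService

    # The factory helpers pick the processing_mode validated in the @@ -59 hunk.
    blackening_service = PdfImportService.with_blackening()
    cropping_service = PdfImportService.with_cropping(allow_meta_overwrite=True)

    # Any other mode raises the ValueError shown above.
    try:
        PdfImportService(processing_mode="redact")
    except ValueError as exc:
        print(exc)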
@@ -115,10 +123,16 @@ class PdfImportService:
 
         if age is not None and age > STALE_LOCK_SECONDS:
             try:
-                logger.warning("Stale lock detected for %s (age %.0fs). Reclaiming lock...", path, age)
+                logger.warning(
+                    "Stale lock detected for %s (age %.0fs). Reclaiming lock...",
+                    path,
+                    age,
+                )
                 lock_path.unlink()
             except Exception as e:
-                logger.warning("Failed to remove stale lock %s: %s", lock_path, e)
+                logger.warning(
+                    "Failed to remove stale lock %s: %s", lock_path, e
+                )
             # retry acquire
             fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
         else:
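Note: the lock logic above is a conventional O_CREAT | O_EXCL lockfile with stale-age reclaim. A self-contained sketch of the pattern (the STALE_LOCK_SECONDS value is assumed here; the real constant is defined elsewhere in the module):

    import os
    import time
    from pathlib import Path

    STALE_LOCK_SECONDS = 600  # assumed threshold

    def acquire_lock(lock_path: Path) -> int:
        try:
            # O_EXCL makes creation atomic: exactly one process can win the lock.
            return os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
        except FileExistsError:
            age = time.time() - lock_path.stat().st_mtime
            if age > STALE_LOCK_SECONDS:
                lock_path.unlink(missing_ok=True)  # reclaim the stale lock
                return os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
            raise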
@@ -233,12 +247,16 @@ class PdfImportService:
             try:
                 mod = importlib.import_module("lx_anonymizer")
                 ReportReader = getattr(mod, "ReportReader")
-                logger.info("Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH")
+                logger.info(
+                    "Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH"
+                )
                 self._report_reader_available = True
                 self._report_reader_class = ReportReader
                 return True, ReportReader
             except Exception as e:
-                logger.warning("Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e)
+                logger.warning(
+                    "Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e
+                )
             finally:
                 # Keep path for future imports if it worked; otherwise remove.
                 if "ReportReader" not in locals() and extra in sys.path:
@@ -259,11 +277,15 @@ class PdfImportService:
         """
         pdf_file = pdf_instance or self.current_pdf
         if not pdf_file:
-            logger.warning("No PDF instance available for ensuring default patient data")
+            logger.warning(
+                "No PDF instance available for ensuring default patient data"
+            )
             return
 
         if not pdf_file.sensitive_meta:
-            logger.info(f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default")
+            logger.info(
+                f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default"
+            )
 
             # Create default SensitiveMeta with placeholder data
             default_data = {
@@ -271,16 +293,22 @@ class PdfImportService:
                 "patient_last_name": "Unknown",
                 "patient_dob": date(1990, 1, 1),  # Default DOB
                 "examination_date": date.today(),
-                "center_name": pdf_file.center.name if pdf_file.center else "university_hospital_wuerzburg",
+                "center_name": pdf_file.center.name
+                if pdf_file.center
+                else "university_hospital_wuerzburg",
             }
 
             try:
                 sensitive_meta = SensitiveMeta.create_from_dict(default_data)
                 pdf_file.sensitive_meta = sensitive_meta
                 pdf_file.save(update_fields=["sensitive_meta"])
-                logger.info(f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}")
+                logger.info(
+                    f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}"
+                )
             except Exception as e:
-                logger.error(f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}")
+                logger.error(
+                    f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}"
+                )
 
     def import_and_anonymize(
         self,
@@ -311,7 +339,9 @@ class PdfImportService:
         """
         try:
             # Initialize processing context
-            self._initialize_processing_context(file_path, center_name, delete_source, retry)
+            self._initialize_processing_context(
+                file_path, center_name, delete_source, retry
+            )
 
             # Step 1: Validate and prepare file
             self._validate_and_prepare_file()
@@ -321,7 +351,9 @@ class PdfImportService:
 
             # Early return check - if no PDF instance was created, return None
             if not self.current_pdf:
-                logger.warning(f"No PDF instance created for {file_path}, returning None")
+                logger.warning(
+                    f"No PDF instance created for {file_path}, returning None"
+                )
                 return None
 
             # Step 3: Setup processing environment
@@ -353,7 +385,13 @@ class PdfImportService:
             # Always cleanup context
             self._cleanup_processing_context()
 
-    def _initialize_processing_context(self, file_path: Union[Path, str], center_name: str, delete_source: bool, retry: bool):
+    def _initialize_processing_context(
+        self,
+        file_path: Union[Path, str],
+        center_name: str,
+        delete_source: bool,
+        retry: bool,
+    ):
         """Initialize the processing context for the current PDF."""
         self.processing_context = {
             "file_path": Path(file_path),
@@ -370,7 +408,9 @@ class PdfImportService:
 
         # Check if already processed (only during current session to prevent race conditions)
         if str(file_path) in self.processed_files:
-            logger.info(f"File {file_path} already being processed in current session, skipping")
+            logger.info(
+                f"File {file_path} already being processed in current session, skipping"
+            )
             raise ValueError("File already being processed")
 
         logger.info(f"Starting import and processing for: {file_path}")
@@ -406,7 +446,9 @@ class PdfImportService:
         if existing:
             logger.info(f"Found existing RawPdfFile {existing.pdf_hash}")
             if existing.text:
-                logger.info(f"Existing PDF {existing.pdf_hash} already processed - returning")
+                logger.info(
+                    f"Existing PDF {existing.pdf_hash} already processed - returning"
+                )
                 self.current_pdf = existing
                 return
             else:
@@ -428,11 +470,15 @@ class PdfImportService:
         else:
             # Retrieve existing for retry
             self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
-            logger.info(f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}")
+            logger.info(
+                f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}"
+            )
 
             # Check if retry is actually needed
             if self.current_pdf.text:
-                logger.info(f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning")
+                logger.info(
+                    f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning"
+                )
                 return
 
         if not self.current_pdf:
@@ -459,7 +505,9 @@ class PdfImportService:
         self.processing_context["file_path"] = self.current_pdf.file.path
         self.processing_context["sensitive_copy_created"] = True
         try:
-            self.processing_context["sensitive_file_path"] = Path(self.current_pdf.file.path)
+            self.processing_context["sensitive_file_path"] = Path(
+                self.current_pdf.file.path
+            )
         except Exception:
             self.processing_context["sensitive_file_path"] = None
 
@@ -490,10 +538,16 @@ class PdfImportService:
             return
 
         try:
-            logger.info(f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})...")
+            logger.info(
+                f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})..."
+            )
 
             # Initialize ReportReader
-            report_reader = ReportReader(report_root_path=str(path_utils.STORAGE_DIR), locale="de_DE", text_date_format="%d.%m.%Y")
+            report_reader = ReportReader(
+                report_root_path=str(path_utils.STORAGE_DIR),
+                locale="de_DE",
+                text_date_format="%d.%m.%Y",
+            )
 
             if self.processing_mode == "cropping":
                 # Use advanced cropping method (existing implementation)
@@ -519,8 +573,12 @@ class PdfImportService:
             anonymized_output_path = anonymized_dir / f"{pdf_hash}_anonymized.pdf"
 
             # Process with enhanced process_report method (returns 4-tuple now)
-            original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = report_reader.process_report(
-                pdf_path=self.processing_context["file_path"], create_anonymized_pdf=True, anonymized_pdf_output_path=str(anonymized_output_path)
+            original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = (
+                report_reader.process_report(
+                    pdf_path=self.processing_context["file_path"],
+                    create_anonymized_pdf=True,
+                    anonymized_pdf_output_path=str(anonymized_output_path),
+                )
             )
 
             # Store results in context
@@ -560,7 +618,13 @@ class PdfImportService:
             anonymized_dir.mkdir(parents=True, exist_ok=True)
 
             # Process with cropping (returns 5-tuple)
-            original_text, anonymized_text, extracted_metadata, cropped_regions, anonymized_pdf_path = report_reader.process_report_with_cropping(
+            (
+                original_text,
+                anonymized_text,
+                extracted_metadata,
+                cropped_regions,
+                anonymized_pdf_path,
+            ) = report_reader.process_report_with_cropping(
                 pdf_path=self.processing_context["file_path"],
                 crop_sensitive_regions=True,
                 crop_output_dir=str(crops_dir),
@@ -661,7 +725,11 @@ class PdfImportService:
                 new_value = raw_value
 
                 # Configurable overwrite policy
-                should_overwrite = self.allow_meta_overwrite or not old_value or old_value in ["Patient", "Unknown"]
+                should_overwrite = (
+                    self.allow_meta_overwrite
+                    or not old_value
+                    or old_value in ["Patient", "Unknown"]
+                )
                 if new_value and should_overwrite:
                     setattr(sm, sm_field, new_value)
                     updated_fields.append(sm_field)
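Note: the overwrite policy above is unchanged by this release, only re-wrapped. Reduced to a standalone predicate (the function name is hypothetical; the values "Patient" and "Unknown" are the placeholder defaults seen earlier in the diff):

    def should_overwrite(old_value, allow_meta_overwrite: bool) -> bool:
        # Overwrite when explicitly allowed, when nothing is stored yet,
        # or when the stored value is a known placeholder.
        return allow_meta_overwrite or not old_value or old_value in ["Patient", "Unknown"]

    assert should_overwrite("Unknown", allow_meta_overwrite=False)
    assert not should_overwrite("Doe", allow_meta_overwrite=False)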
@@ -676,7 +744,11 @@ class PdfImportService:
         if isinstance(raw_value, str):
             # Skip if the value is just the field name itself
             if raw_value == meta_key:
-                logger.warning("Skipping date field %s - got field name '%s' instead of actual date", sm_field, raw_value)
+                logger.warning(
+                    "Skipping date field %s - got field name '%s' instead of actual date",
+                    sm_field,
+                    raw_value,
+                )
                 return None
 
             # Try common date formats
@@ -687,7 +759,9 @@ class PdfImportService:
                 except ValueError:
                     continue
 
-            logger.warning("Could not parse date '%s' for field %s", raw_value, sm_field)
+            logger.warning(
+                "Could not parse date '%s' for field %s", raw_value, sm_field
+            )
             return None
 
         elif hasattr(raw_value, "date"):
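Note: the surrounding method tries a list of date formats in turn and returns None when none match, which is what triggers the warning above. A sketch of that loop (the format list here is assumed; the actual formats live outside this hunk):

    from datetime import date, datetime
    from typing import Optional

    DATE_FORMATS = ["%d.%m.%Y", "%Y-%m-%d", "%d/%m/%Y"]  # assumed

    def parse_date(raw_value: str) -> Optional[date]:
        for fmt in DATE_FORMATS:
            try:
                return datetime.strptime(raw_value, fmt).date()
            except ValueError:
                continue
        return None  # caller logs the warning shown above

    print(parse_date("24.12.2023"))  # 2023-12-24
    print(parse_date("not-a-date"))  # None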
@@ -721,7 +795,10 @@ class PdfImportService:
 
         anonymized_path = Path(anonymized_pdf_path)
         if not anonymized_path.exists():
-            logger.warning("Anonymized PDF path returned but file does not exist: %s", anonymized_path)
+            logger.warning(
+                "Anonymized PDF path returned but file does not exist: %s",
+                anonymized_path,
+            )
             return
 
         logger.info("Anonymized PDF created by ReportReader at: %s", anonymized_path)
@@ -746,7 +823,9 @@ class PdfImportService:
         # If your model has a field for this, persist there; otherwise we just log.
         cropped_regions = self.processing_context.get("cropped_regions")
         if cropped_regions:
-            logger.debug("Cropped regions recorded (%d regions).", len(cropped_regions))
+            logger.debug(
+                "Cropped regions recorded (%d regions).", len(cropped_regions)
+            )
 
         # Save model changes
         update_fields = ["anonymized_file"]
@@ -759,7 +838,10 @@ class PdfImportService:
             if state and not state.anonymized:
                 state.mark_anonymized(save=True)
 
-            logger.info("Updated anonymized_file reference to: %s", self.current_pdf.anonymized_file.name)
+            logger.info(
+                "Updated anonymized_file reference to: %s",
+                self.current_pdf.anonymized_file.name,
+            )
 
         except Exception as e:
             logger.warning("Could not set anonymized file reference: %s", e)
@@ -790,7 +872,9 @@ class PdfImportService:
     def _mark_processing_incomplete(self, reason: str):
         """Mark processing as incomplete with reason."""
         if not self.current_pdf:
-            logger.warning(f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}")
+            logger.warning(
+                f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}"
+            )
             return
 
         try:
@@ -809,22 +893,44 @@ class PdfImportService:
             logger.warning(f"Failed to mark processing incomplete: {e}")
 
     def _retry_existing_pdf(self, existing_pdf):
-        """Retry processing for existing PDF."""
+        """
+        Retry processing for existing PDF.
+
+        Uses get_raw_file_path() to find the original raw file instead of
+        relying on the file field which may point to a deleted sensitive file.
+        """
         try:
+            # ✅ FIX: Use get_raw_file_path() to find original file
+            raw_file_path = existing_pdf.get_raw_file_path()
+
+            if not raw_file_path or not raw_file_path.exists():
+                logger.error(
+                    f"Cannot retry PDF {existing_pdf.pdf_hash}: Raw file not found. "
+                    f"Please re-upload the original PDF file."
+                )
+                self.current_pdf = existing_pdf
+                return existing_pdf
+
+            logger.info(f"Found raw file for retry at: {raw_file_path}")
+
             # Remove from processed files to allow retry
-            file_path_str = str(existing_pdf.file.path)
-            if file_path_str in self.processed_files:
+            file_path_str = str(raw_file_path)
+            if file_path_str in self.processed_files:
                 self.processed_files.remove(file_path_str)
                 logger.debug(f"Removed {file_path_str} from processed files for retry")
 
             return self.import_and_anonymize(
-                file_path=existing_pdf.file.path,
-                center_name=existing_pdf.center.name,
-                delete_source=False,
+                file_path=raw_file_path,  # ✅ Use raw file path, not sensitive path
+                center_name=existing_pdf.center.name
+                if existing_pdf.center
+                else "unknown_center",
+                delete_source=False,  # Never delete during retry
                 retry=True,
            )
         except Exception as e:
-            logger.error(f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}")
+            logger.error(
+                f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}"
+            )
             self.current_pdf = existing_pdf
             return existing_pdf
 
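Note: the substantive fix in this hunk is that a retry now resolves the original raw file via get_raw_file_path() instead of trusting the FileField, which may point at a sensitive copy that was already deleted. The new guard reduces to this sketch (only get_raw_file_path comes from the diff; the helper name is assumed):

    from pathlib import Path
    from typing import Optional

    def resolve_retry_source(existing_pdf) -> Optional[Path]:
        # Prefer the preserved raw file over the (possibly deleted) FileField target.
        raw_file_path = existing_pdf.get_raw_file_path()
        if raw_file_path and raw_file_path.exists():
            return raw_file_path
        return None  # caller logs an error and asks for a re-upload, as above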
@@ -852,26 +958,51 @@ class PdfImportService:
                 if file_field and getattr(file_field, "name", None):
                     storage_name = file_field.name
                     file_field.delete(save=False)
-                    logger.debug("Deleted sensitive copy %s during error cleanup", storage_name)
+                    logger.debug(
+                        "Deleted sensitive copy %s during error cleanup",
+                        storage_name,
+                    )
             except Exception as cleanup_exc:
-                logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
+                logger.warning(
+                    "Failed to remove sensitive copy during error cleanup: %s",
+                    cleanup_exc,
+                )
 
         # Always clean up processed files set to prevent blocks
         file_path = self.processing_context.get("file_path")
         if file_path and str(file_path) in self.processed_files:
             self.processed_files.remove(str(file_path))
-            logger.debug(f"Removed {file_path} from processed files during error cleanup")
+            logger.debug(
+                f"Removed {file_path} from processed files during error cleanup"
+            )
 
         try:
             original_path = self.processing_context.get("original_file_path")
-            logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
-            raw_dir = original_path.parent if isinstance(original_path, Path) else None
-            if isinstance(original_path, Path) and original_path.exists() and not self.processing_context.get("sensitive_copy_created"):
+            logger.debug(
+                "PDF cleanup original path: %s (%s)",
+                original_path,
+                type(original_path),
+            )
+            raw_dir = (
+                original_path.parent if isinstance(original_path, Path) else None
+            )
+            if (
+                isinstance(original_path, Path)
+                and original_path.exists()
+                and not self.processing_context.get("sensitive_copy_created")
+            ):
                 try:
                     original_path.unlink()
-                    logger.info("Removed original file %s during error cleanup", original_path)
+                    logger.info(
+                        "Removed original file %s during error cleanup",
+                        original_path,
+                    )
                 except Exception as remove_exc:
-                    logger.warning("Could not remove original file %s during error cleanup: %s", original_path, remove_exc)
+                    logger.warning(
+                        "Could not remove original file %s during error cleanup: %s",
+                        original_path,
+                        remove_exc,
+                    )
             pdf_dir = self._get_pdf_dir()
             if not pdf_dir and raw_dir:
                 base_dir = raw_dir.parent
@@ -888,7 +1019,12 @@ class PdfImportService:
 
         # Remove empty PDF subdirectories that might have been created during setup
         if pdf_dir and pdf_dir.exists():
-            for subdir_name in ("sensitive", "cropped_regions", "anonymized", "_processing"):
+            for subdir_name in (
+                "sensitive",
+                "cropped_regions",
+                "anonymized",
+                "_processing",
+            ):
                 subdir_path = pdf_dir / subdir_name
                 if subdir_path.exists() and subdir_path.is_dir():
                     try:
@@ -896,22 +1032,49 @@ class PdfImportService:
                     except StopIteration:
                         try:
                             subdir_path.rmdir()
-                            logger.debug("Removed empty directory %s during error cleanup", subdir_path)
+                            logger.debug(
+                                "Removed empty directory %s during error cleanup",
+                                subdir_path,
+                            )
                         except OSError as rm_err:
-                            logger.debug("Could not remove directory %s: %s", subdir_path, rm_err)
+                            logger.debug(
+                                "Could not remove directory %s: %s",
+                                subdir_path,
+                                rm_err,
+                            )
                     except Exception as iter_err:
-                        logger.debug("Could not inspect directory %s: %s", subdir_path, iter_err)
-
-        raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
-        pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
+                        logger.debug(
+                            "Could not inspect directory %s: %s",
+                            subdir_path,
+                            iter_err,
+                        )
+
+        raw_count = (
+            len(list(raw_dir.glob("*")))
+            if raw_dir and raw_dir.exists()
+            else None
+        )
+        pdf_count = (
+            len(list(pdf_dir.glob("*")))
+            if pdf_dir and pdf_dir.exists()
+            else None
+        )
 
         sensitive_path = self.processing_context.get("sensitive_file_path")
         if sensitive_path:
             sensitive_parent = Path(sensitive_path).parent
-            sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
+            sensitive_count = (
+                len(list(sensitive_parent.glob("*")))
+                if sensitive_parent.exists()
+                else None
+            )
         else:
             sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
-            sensitive_count = len(list(sensitive_dir.glob("*"))) if sensitive_dir and sensitive_dir.exists() else None
+            sensitive_count = (
+                len(list(sensitive_dir.glob("*")))
+                if sensitive_dir and sensitive_dir.exists()
+                else None
+            )
 
         logger.info(
             "PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
@@ -944,7 +1107,9 @@ class PdfImportService:
         self.current_pdf = None
         self.processing_context = {}
 
-    def import_simple(self, file_path: Union[Path, str], center_name: str, delete_source: bool = False) -> "RawPdfFile":
+    def import_simple(
+        self, file_path: Union[Path, str], center_name: str, delete_source: bool = False
+    ) -> "RawPdfFile":
         """
         Simple PDF import without text processing or anonymization.
         Uses centralized PDF instance management pattern.
@@ -959,7 +1124,9 @@ class PdfImportService:
         """
         try:
             # Initialize simple processing context
-            self._initialize_processing_context(file_path, center_name, delete_source, False)
+            self._initialize_processing_context(
+                file_path, center_name, delete_source, False
+            )
 
             # Validate file
             self._validate_and_prepare_file()
@@ -991,7 +1158,10 @@ class PdfImportService:
             with transaction.atomic():
                 self.current_pdf.save()
 
-            logger.info("Simple import completed for RawPdfFile hash: %s", self.current_pdf.pdf_hash)
+            logger.info(
+                "Simple import completed for RawPdfFile hash: %s",
+                self.current_pdf.pdf_hash,
+            )
             return self.current_pdf
 
         except Exception as e:
@@ -1001,7 +1171,9 @@ class PdfImportService:
         finally:
             self._cleanup_processing_context()
 
-    def check_storage_capacity(self, file_path: Union[Path, str], storage_root, min_required_space) -> None:
+    def check_storage_capacity(
+        self, file_path: Union[Path, str], storage_root, min_required_space
+    ) -> None:
         """
         Check if there is sufficient storage capacity for the PDF file.
 
@@ -1031,12 +1203,18 @@ class PdfImportService:
 
         # Check if there is enough space
         if file_size > free:
-            raise InsufficientStorageError(f"Not enough space to store PDF file: {file_path}")
-        logger.info(f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available")
+            raise InsufficientStorageError(
+                f"Not enough space to store PDF file: {file_path}"
+            )
+        logger.info(
+            f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available"
+        )
 
         return True
 
-    def create_sensitive_file(self, pdf_instance: "RawPdfFile" = None, file_path: Union[Path, str] = None) -> None:
+    def create_sensitive_file(
+        self, pdf_instance: "RawPdfFile" = None, file_path: Union[Path, str] = None
+    ) -> None:
         """
         Create a copy of the PDF file in the sensitive directory and update the file reference.
         Delete the source path to avoid duplicates.
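Note: check_storage_capacity compares the file size against free space before raising InsufficientStorageError. How the service computes `free` is outside this hunk; a minimal sketch in the same shape, using the standard-library shutil.disk_usage as an assumed stand-in (the exception class is re-declared here only to keep the example self-contained):

    import shutil
    from pathlib import Path

    class InsufficientStorageError(RuntimeError):
        """Stand-in for the service's exception type."""

    def check_storage_capacity(file_path: Path, storage_root: Path) -> None:
        file_size = file_path.stat().st_size
        free = shutil.disk_usage(storage_root).free  # bytes available on the volume
        if file_size > free:
            raise InsufficientStorageError(f"Not enough space to store PDF file: {file_path}")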
@@ -1045,7 +1223,9 @@ class PdfImportService:
         Ensures the FileField points to the file under STORAGE_DIR/pdfs/sensitive and never back to raw_pdfs.
         """
         pdf_file = pdf_instance or self.current_pdf
-        source_path = Path(file_path) if file_path else self.processing_context.get("file_path")
+        source_path = (
+            Path(file_path) if file_path else self.processing_context.get("file_path")
+        )
 
         if not pdf_file:
             raise ValueError("No PDF instance available for creating sensitive file")
@@ -1068,14 +1248,20 @@ class PdfImportService:
             try:
                 target.unlink()
             except Exception as e:
-                logger.warning("Could not remove existing sensitive target %s: %s", target, e)
+                logger.warning(
+                    "Could not remove existing sensitive target %s: %s",
+                    target,
+                    e,
+                )
         shutil.move(str(source_path), str(target))
         logger.info(f"Moved PDF to sensitive directory: {target}")
 
         # Update FileField to reference the file under STORAGE_DIR
         # We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
         try:
-            relative_name = str(target.relative_to(path_utils.STORAGE_DIR))  # Point Django FileField to sensitive storage
+            relative_name = str(
+                target.relative_to(path_utils.STORAGE_DIR)
+            )  # Point Django FileField to sensitive storage
         except ValueError:
             # Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
             relative_name = str(target)
@@ -1084,9 +1270,15 @@ class PdfImportService:
         if getattr(pdf_file.file, "name", None) != relative_name:
             pdf_file.file.name = relative_name
             pdf_file.save(update_fields=["file"])
-            logger.info("Updated PDF FileField reference to sensitive path: %s", pdf_file.file.path)
+            logger.info(
+                "Updated PDF FileField reference to sensitive path: %s",
+                pdf_file.file.path,
+            )
         else:
-            logger.debug("PDF FileField already points to sensitive path: %s", pdf_file.file.path)
+            logger.debug(
+                "PDF FileField already points to sensitive path: %s",
+                pdf_file.file.path,
+            )
 
         # Best-effort: if original source still exists (e.g., copy), remove it to avoid re-triggers
         try:
@@ -1097,10 +1289,17 @@ class PdfImportService:
             logger.warning(f"Could not delete original PDF file {source_path}: {e}")
 
         except Exception as e:
-            logger.warning(f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}", exc_info=True)
+            logger.warning(
+                f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}",
+                exc_info=True,
+            )
 
     def archive_or_quarantine_file(
-        self, pdf_instance: "RawPdfFile" = None, source_file_path: Union[Path, str] = None, quarantine_reason: str = None, is_pdf_problematic: bool = None,
+        self,
+        pdf_instance: "RawPdfFile" = None,
+        source_file_path: Union[Path, str] = None,
+        quarantine_reason: str = None,
+        is_pdf_problematic: bool = None,
     ) -> bool:
         """
         Archive or quarantine file based on the state of the PDF processing.
@@ -1116,8 +1315,14 @@ class PdfImportService:
             bool: True if file was quarantined, False if archived successfully
         """
         pdf_file = pdf_instance or self.current_pdf
-        file_path = Path(source_file_path) if source_file_path else self.processing_context.get("file_path")
-        quarantine_reason = quarantine_reason or self.processing_context.get("error_reason")
+        file_path = (
+            Path(source_file_path)
+            if source_file_path
+            else self.processing_context.get("file_path")
+        )
+        quarantine_reason = quarantine_reason or self.processing_context.get(
+            "error_reason"
+        )
 
         if not pdf_file:
             raise ValueError("No PDF instance available for archiving/quarantine")
@@ -1125,24 +1330,34 @@ class PdfImportService:
             raise ValueError("No file path available for archiving/quarantine")
 
         # Determine if the PDF is problematic
-        pdf_problematic = is_pdf_problematic if is_pdf_problematic is not None else pdf_file.is_problematic
+        pdf_problematic = (
+            is_pdf_problematic
+            if is_pdf_problematic is not None
+            else pdf_file.is_problematic
+        )
 
         if pdf_problematic:
             # Quarantine the file
-            logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
+            logger.warning(
+                f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}"
+            )
             quarantine_dir = path_utils.PDF_DIR / "quarantine"
             os.makedirs(quarantine_dir, exist_ok=True)
 
             quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
             try:
                 shutil.move(file_path, quarantine_path)
-                pdf_file.quarantine_reason = quarantine_reason or "File processing failed"
+                pdf_file.quarantine_reason = (
+                    quarantine_reason or "File processing failed"
+                )
                 pdf_file.save(update_fields=["quarantine_reason"])
                 logger.info(f"Moved problematic PDF to quarantine: {quarantine_path}")
                 return True
             except Exception as e:
                 logger.error(f"Failed to quarantine PDF {pdf_file.pdf_hash}: {e}")
-                return True  # Still consider as quarantined to prevent further processing
+                return (
+                    True  # Still consider as quarantined to prevent further processing
+                )
         else:
             # Archive the file normally
             logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
{endoreg_db-0.8.5.5.dist-info → endoreg_db-0.8.5.6.dist-info}/RECORD

@@ -598,7 +598,7 @@ endoreg_db/services/examination_evaluation.py,sha256=jx9IL2PIoBzjiITzs00c1XucE7A
 endoreg_db/services/finding_description_service.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 endoreg_db/services/lookup_service.py,sha256=A2t07-qlQhFEeNvOhez0GU0sxi7mnN0MIlhYzxj4W1U,10581
 endoreg_db/services/lookup_store.py,sha256=8sB2HmJQrnzq5Vfqt-UdaJLHYMRZCxnui9BCCXscnJE,4856
-endoreg_db/services/pdf_import.py,sha256=
+endoreg_db/services/pdf_import.py,sha256=iipdALOF2bxtM_IOQXzPOu75nRvYVzplarzkajLoS8E,54274
 endoreg_db/services/polling_coordinator.py,sha256=alnPB-kdMyxbYaxQN9fki9dKrwmAsY3s68bUHWDSNeI,10662
 endoreg_db/services/pseudonym_service.py,sha256=CJhbtRa6K6SPbphgCZgEMi8AFQtB18CUoBDttFnxEoM,3126
 endoreg_db/services/requirements_object.py,sha256=290zf8AEbVtCoHhW4Jr7_ud-RvrqYmb1Nz9UBHtTnc0,6164
@@ -789,7 +789,7 @@ endoreg_db/views/video/video_meta.py,sha256=C1wBMTtQb_yzEUrhFGAy2UHEWMk_CbU75WXX
 endoreg_db/views/video/video_processing_history.py,sha256=mhFuS8RG5GV8E-lTtuD0qrq-bIpnUFp8vy9aERfC-J8,770
 endoreg_db/views/video/video_remove_frames.py,sha256=2FmvNrSPM0fUXiBxINN6vBUUDCqDlBkNcGR3WsLDgKo,1696
 endoreg_db/views/video/video_stream.py,sha256=kLyuf0ORTmsLeYUQkTQ6iRYqlIQozWhMMR3Lhfe_trk,12148
-endoreg_db-0.8.5.5.dist-info/METADATA,sha256=
-endoreg_db-0.8.5.5.dist-info/WHEEL,sha256=
-endoreg_db-0.8.5.5.dist-info/licenses/LICENSE,sha256=
-endoreg_db-0.8.5.5.dist-info/RECORD,,
+endoreg_db-0.8.5.6.dist-info/METADATA,sha256=VAc2EkiO9Qd6M1YOZTSu8piH-Ik7zlGHXdgbYmHzqtE,14719
+endoreg_db-0.8.5.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+endoreg_db-0.8.5.6.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+endoreg_db-0.8.5.6.dist-info/RECORD,,

{endoreg_db-0.8.5.5.dist-info → endoreg_db-0.8.5.6.dist-info}/WHEEL: File without changes
{endoreg_db-0.8.5.5.dist-info → endoreg_db-0.8.5.6.dist-info}/licenses/LICENSE: File without changes