endoreg-db 0.8.2__py3-none-any.whl → 0.8.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of endoreg-db might be problematic. Review the changes below for more details.

@@ -37,7 +37,7 @@ class SensitiveMeta(models.Model):
37
37
  blank=True,
38
38
  null=True,
39
39
  help_text="FK to the pseudo-anonymized Patient record."
40
- )
40
+ ) # type: ignore
41
41
  patient_first_name = models.CharField(max_length=255, blank=True, null=True)
42
42
  patient_last_name = models.CharField(max_length=255, blank=True, null=True)
43
43
  patient_dob = models.DateTimeField(
@@ -51,24 +51,24 @@ class SensitiveMeta(models.Model):
51
51
  blank=True,
52
52
  null=True,
53
53
  help_text="FK to the pseudo-anonymized PatientExamination record."
54
- )
54
+ ) # type: ignore
55
55
  patient_gender = models.ForeignKey(
56
56
  "Gender",
57
57
  on_delete=models.CASCADE,
58
58
  blank=True,
59
59
  null=True,
60
- )
60
+ ) # type: ignore
61
61
  examiners = models.ManyToManyField(
62
62
  "Examiner",
63
63
  blank=True,
64
64
  help_text="Pseudo-anonymized examiner(s) associated with the examination."
65
- )
65
+ ) # type: ignore
66
66
  center = models.ForeignKey(
67
67
  "Center",
68
68
  on_delete=models.CASCADE,
69
69
  blank=True, # Should ideally be False if always required before save
70
70
  null=True, # Should ideally be False
71
- )
71
+ ) # type: ignore
72
72
 
73
73
  # Raw examiner names stored temporarily until pseudo-examiner is created/linked
74
74
  examiner_first_name = models.CharField(max_length=255, blank=True, null=True, editable=False)
@@ -258,7 +258,7 @@ class SensitiveMeta(models.Model):
258
258
 
259
259
  # 4. Handle ManyToMany linking (examiners) *after* the instance has a PK.
260
260
  if examiner_to_link and self.pk and not self.examiners.filter(pk=examiner_to_link.pk).exists():
261
- self.examiners.add(examiner_to_link)
261
+ self.examiners.add(examiner_to_link) # type: ignore
262
262
  # Adding to M2M handles its own DB interaction, no second super().save() needed.
263
263
 
264
264
  def mark_dob_verified(self):
@@ -13,7 +13,7 @@ else:
13
13
  ENDOREG_CENTER_ID = settings.ENDOREG_CENTER_ID
14
14
 
15
15
  # Import the new utility function
16
- from ...utils.video.ffmpeg_wrapper import get_stream_info
16
+ from ...utils.video import ffmpeg_wrapper
17
17
 
18
18
  logger = logging.getLogger(__name__)
19
19
 
@@ -214,7 +214,7 @@ class FFMpegMeta(models.Model):
214
214
  """
215
215
  logger.info("Running ffprobe on %s", file_path)
216
216
  try:
217
- probe_data = get_stream_info(file_path) # Use the new utility
217
+ probe_data = ffmpeg_wrapper.get_stream_info(file_path) # Use the new utility
218
218
  except Exception as probe_err:
219
219
  logger.error("ffprobe execution failed for %s: %s", file_path, probe_err, exc_info=True)
220
220
  raise RuntimeError(f"ffprobe execution failed for {file_path}") from probe_err
@@ -5,6 +5,7 @@ Provides high-level functions for importing and anonymizing PDF files,
5
5
  combining RawPdfFile creation with text extraction and anonymization.
6
6
  """
7
7
  from datetime import date, datetime
8
+ import errno
8
9
  import logging
9
10
  import shutil
10
11
  import sys
@@ -13,12 +14,11 @@ import hashlib
13
14
  from pathlib import Path
14
15
  from typing import TYPE_CHECKING, Union
15
16
  from contextlib import contextmanager
16
- from django.conf.locale import tr
17
17
  from django.db import transaction
18
18
  from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
19
19
  from endoreg_db.models.state.raw_pdf import RawPdfState
20
20
  from endoreg_db.models import SensitiveMeta
21
- from endoreg_db.utils.paths import PDF_DIR, STORAGE_DIR
21
+ from endoreg_db.utils import paths as path_utils
22
22
  import time
23
23
 
24
24
  logger = logging.getLogger(__name__)
@@ -111,14 +111,44 @@ class PdfImportService:
111
111
  break
112
112
  h.update(b)
113
113
  return h.hexdigest()
114
+
115
+ def _get_pdf_dir(self) -> Path | None:
116
+ """Resolve the configured PDF directory to a concrete Path."""
117
+ candidate = getattr(path_utils, "PDF_DIR", None)
118
+ if isinstance(candidate, Path):
119
+ return candidate
120
+ if candidate is None:
121
+ return None
122
+ try:
123
+ derived = candidate / "."
124
+ except Exception:
125
+ derived = None
126
+
127
+ if derived is not None:
128
+ try:
129
+ return Path(derived)
130
+ except Exception:
131
+ return None
132
+
133
+ try:
134
+ return Path(str(candidate))
135
+ except Exception:
136
+ return None
114
137
 
115
138
  def _quarantine(self, source: Path) -> Path:
116
139
  """Move file to quarantine directory to prevent re-processing."""
117
- qdir = PDF_DIR / "_processing"
140
+ qdir = path_utils.PDF_DIR / "_processing"
118
141
  qdir.mkdir(parents=True, exist_ok=True)
119
142
  target = qdir / source.name
120
- # atomic rename on same filesystem
121
- source.rename(target)
143
+ try:
144
+ # Try atomic rename first (fastest when on same filesystem)
145
+ source.rename(target)
146
+ except OSError as exc:
147
+ if exc.errno == errno.EXDEV:
148
+ # Cross-device move, fall back to shutil.move which copies+removes
149
+ shutil.move(str(source), str(target))
150
+ else:
151
+ raise
122
152
  return target
123
153
 
124
154
  def _ensure_state(self, pdf_file: "RawPdfFile"):
@@ -287,6 +317,7 @@ class PdfImportService:
287
317
  """Initialize the processing context for the current PDF."""
288
318
  self.processing_context = {
289
319
  'file_path': Path(file_path),
320
+ 'original_file_path': Path(file_path),
290
321
  'center_name': center_name,
291
322
  'delete_source': delete_source,
292
323
  'retry': retry,
@@ -379,11 +410,18 @@ class PdfImportService:
379
410
 
380
411
  def _setup_processing_environment(self):
381
412
  """Setup processing environment and state."""
413
+ original_path = self.processing_context.get('file_path')
414
+
382
415
  # Create sensitive file copy
383
- self.create_sensitive_file(self.current_pdf, self.processing_context['file_path'])
416
+ self.create_sensitive_file(self.current_pdf, original_path)
384
417
 
385
418
  # Update file path to point to sensitive copy
386
419
  self.processing_context['file_path'] = self.current_pdf.file.path
420
+ self.processing_context['sensitive_copy_created'] = True
421
+ try:
422
+ self.processing_context['sensitive_file_path'] = Path(self.current_pdf.file.path)
423
+ except Exception:
424
+ self.processing_context['sensitive_file_path'] = None
387
425
 
388
426
  # Ensure state exists
389
427
  state = self.current_pdf.get_or_create_state()
@@ -415,14 +453,14 @@ class PdfImportService:
415
453
  logger.info("Starting text extraction and metadata processing with ReportReader...")
416
454
 
417
455
  # Setup output directories
418
- crops_dir = PDF_DIR / 'cropped_regions'
419
- anonymized_dir = PDF_DIR / 'anonymized'
456
+ crops_dir = path_utils.PDF_DIR / 'cropped_regions'
457
+ anonymized_dir = path_utils.PDF_DIR / 'anonymized'
420
458
  crops_dir.mkdir(parents=True, exist_ok=True)
421
459
  anonymized_dir.mkdir(parents=True, exist_ok=True)
422
460
 
423
461
  # Initialize ReportReader
424
462
  report_reader = ReportReader(
425
- report_root_path=STORAGE_DIR,
463
+ report_root_path=str(path_utils.STORAGE_DIR),
426
464
  locale="de_DE",
427
465
  text_date_format="%d.%m.%Y"
428
466
  )
@@ -603,7 +641,7 @@ class PdfImportService:
603
641
  try:
604
642
  # Prefer storing a path relative to STORAGE_DIR so Django serves it correctly
605
643
  try:
606
- relative_name = str(anonymized_path.relative_to(STORAGE_DIR))
644
+ relative_name = str(anonymized_path.relative_to(path_utils.STORAGE_DIR))
607
645
  except ValueError:
608
646
  # Fallback to absolute path if the file lives outside STORAGE_DIR
609
647
  relative_name = str(anonymized_path)
@@ -717,18 +755,96 @@ class PdfImportService:
717
755
  except Exception as e:
718
756
  logger.warning(f"Error during cleanup: {e}")
719
757
  finally:
758
+ # Remove any sensitive copy created during this processing run
759
+ sensitive_created = self.processing_context.get('sensitive_copy_created')
760
+ if sensitive_created:
761
+ pdf_obj = self.current_pdf
762
+ try:
763
+ if pdf_obj:
764
+ file_field = getattr(pdf_obj, "file", None)
765
+ if file_field and getattr(file_field, "name", None):
766
+ storage_name = file_field.name
767
+ file_field.delete(save=False)
768
+ logger.debug("Deleted sensitive copy %s during error cleanup", storage_name)
769
+ except Exception as cleanup_exc:
770
+ logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
771
+
720
772
  # Always clean up processed files set to prevent blocks
721
773
  file_path = self.processing_context.get('file_path')
722
774
  if file_path and str(file_path) in self.processed_files:
723
775
  self.processed_files.remove(str(file_path))
724
776
  logger.debug(f"Removed {file_path} from processed files during error cleanup")
725
777
 
778
+ try:
779
+ original_path = self.processing_context.get('original_file_path')
780
+ logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
781
+ raw_dir = original_path.parent if isinstance(original_path, Path) else None
782
+ if (
783
+ isinstance(original_path, Path)
784
+ and original_path.exists()
785
+ and not self.processing_context.get('sensitive_copy_created')
786
+ ):
787
+ try:
788
+ original_path.unlink()
789
+ logger.info("Removed original file %s during error cleanup", original_path)
790
+ except Exception as remove_exc:
791
+ logger.warning("Could not remove original file %s during error cleanup: %s", original_path, remove_exc)
792
+ pdf_dir = self._get_pdf_dir()
793
+ if not pdf_dir and raw_dir:
794
+ base_dir = raw_dir.parent
795
+ dir_name = getattr(path_utils, "PDF_DIR_NAME", "pdfs")
796
+ fallback_pdf_dir = base_dir / dir_name
797
+ logger.debug(
798
+ "PDF cleanup fallback resolution - base: %s, dir_name: %s, exists: %s",
799
+ base_dir,
800
+ dir_name,
801
+ fallback_pdf_dir.exists(),
802
+ )
803
+ if fallback_pdf_dir.exists():
804
+ pdf_dir = fallback_pdf_dir
805
+
806
+ # Remove empty PDF subdirectories that might have been created during setup
807
+ if pdf_dir and pdf_dir.exists():
808
+ for subdir_name in ("sensitive", "cropped_regions", "anonymized", "_processing"):
809
+ subdir_path = pdf_dir / subdir_name
810
+ if subdir_path.exists() and subdir_path.is_dir():
811
+ try:
812
+ next(subdir_path.iterdir())
813
+ except StopIteration:
814
+ try:
815
+ subdir_path.rmdir()
816
+ logger.debug("Removed empty directory %s during error cleanup", subdir_path)
817
+ except OSError as rm_err:
818
+ logger.debug("Could not remove directory %s: %s", subdir_path, rm_err)
819
+ except Exception as iter_err:
820
+ logger.debug("Could not inspect directory %s: %s", subdir_path, iter_err)
821
+
822
+ raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
823
+ pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
824
+
825
+ sensitive_path = self.processing_context.get('sensitive_file_path')
826
+ if sensitive_path:
827
+ sensitive_parent = Path(sensitive_path).parent
828
+ sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
829
+ else:
830
+ sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
831
+ sensitive_count = len(list(sensitive_dir.glob("*"))) if sensitive_dir and sensitive_dir.exists() else None
832
+
833
+ logger.info(
834
+ "PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
835
+ raw_count,
836
+ pdf_count,
837
+ sensitive_count,
838
+ )
839
+ except Exception:
840
+ pass
841
+
726
842
  def _cleanup_processing_context(self):
727
843
  """Cleanup processing context."""
728
844
  try:
729
845
  # Clean up temporary directories
730
846
  if self.processing_context.get('text_extracted'):
731
- crops_dir = PDF_DIR / 'cropped_regions'
847
+ crops_dir = path_utils.PDF_DIR / 'cropped_regions'
732
848
  if crops_dir.exists() and not any(crops_dir.iterdir()):
733
849
  crops_dir.rmdir()
734
850
 
@@ -857,7 +973,7 @@ class PdfImportService:
857
973
  if not source_path:
858
974
  raise ValueError("No file path available for creating sensitive file")
859
975
 
860
- SENSITIVE_DIR = PDF_DIR / "sensitive"
976
+ SENSITIVE_DIR = path_utils.PDF_DIR / "sensitive"
861
977
  target = SENSITIVE_DIR / f"{pdf_file.pdf_hash}.pdf"
862
978
 
863
979
  try:
@@ -880,7 +996,7 @@ class PdfImportService:
880
996
  # Update FileField to reference the file under STORAGE_DIR
881
997
  # We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
882
998
  try:
883
- relative_name = str(target.relative_to(STORAGE_DIR)) #just point the Django FileField to the file that the anonymizer already created in data/pdfs/anonymized/.
999
+ relative_name = str(target.relative_to(path_utils.STORAGE_DIR)) # Point Django FileField to sensitive storage
884
1000
  except ValueError:
885
1001
  # Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
886
1002
  relative_name = str(target)
@@ -934,7 +1050,7 @@ class PdfImportService:
934
1050
  if pdf_problematic:
935
1051
  # Quarantine the file
936
1052
  logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
937
- quarantine_dir = PDF_DIR / "quarantine"
1053
+ quarantine_dir = path_utils.PDF_DIR / "quarantine"
938
1054
  os.makedirs(quarantine_dir, exist_ok=True)
939
1055
 
940
1056
  quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
@@ -950,7 +1066,7 @@ class PdfImportService:
950
1066
  else:
951
1067
  # Archive the file normally
952
1068
  logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
953
- archive_dir = PDF_DIR / "processed"
1069
+ archive_dir = path_utils.PDF_DIR / "processed"
954
1070
  os.makedirs(archive_dir, exist_ok=True)
955
1071
 
956
1072
  archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
@@ -56,7 +56,7 @@ def generate_patient_pseudonym(patient: Patient) -> Tuple[str, bool]:
56
56
  patient.patient_hash = patient_hash
57
57
  patient.save(update_fields=['patient_hash'])
58
58
 
59
- logger.info(f"Generated and persisted pseudonym for patient {patient.id}: {patient_hash[:8]}...")
59
+ logger.info(f"Generated and persisted pseudonym for patient {patient.id}")
60
60
 
61
61
  return patient_hash, True
62
62