endoreg-db: endoreg_db-0.8.1-py3-none-any.whl → endoreg_db-0.8.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of endoreg-db might be problematic. Click here for more details.

Files changed (48):
  1. endoreg_db/helpers/download_segmentation_model.py +31 -0
  2. endoreg_db/migrations/0003_add_center_display_name.py +30 -0
  3. endoreg_db/models/administration/center/center.py +7 -1
  4. endoreg_db/models/media/pdf/raw_pdf.py +31 -26
  5. endoreg_db/models/media/video/create_from_file.py +26 -4
  6. endoreg_db/models/media/video/pipe_1.py +13 -1
  7. endoreg_db/models/media/video/video_file.py +36 -13
  8. endoreg_db/models/media/video/video_file_anonymize.py +2 -1
  9. endoreg_db/models/media/video/video_file_frames/_manage_frame_range.py +12 -0
  10. endoreg_db/models/media/video/video_file_io.py +4 -2
  11. endoreg_db/models/metadata/video_meta.py +2 -2
  12. endoreg_db/serializers/anonymization.py +3 -0
  13. endoreg_db/services/pdf_import.py +131 -45
  14. endoreg_db/services/video_import.py +427 -128
  15. endoreg_db/urls/__init__.py +0 -2
  16. endoreg_db/urls/media.py +201 -4
  17. endoreg_db/urls/report.py +0 -30
  18. endoreg_db/urls/sensitive_meta.py +0 -36
  19. endoreg_db/urls/video.py +30 -88
  20. endoreg_db/utils/paths.py +2 -10
  21. endoreg_db/utils/video/ffmpeg_wrapper.py +67 -4
  22. endoreg_db/views/anonymization/validate.py +76 -32
  23. endoreg_db/views/media/__init__.py +38 -2
  24. endoreg_db/views/media/pdf_media.py +1 -1
  25. endoreg_db/views/media/segments.py +71 -0
  26. endoreg_db/views/media/sensitive_metadata.py +314 -0
  27. endoreg_db/views/media/video_segments.py +596 -0
  28. endoreg_db/views/pdf/reimport.py +18 -8
  29. endoreg_db/views/video/__init__.py +0 -8
  30. endoreg_db/views/video/correction.py +34 -32
  31. endoreg_db/views/video/reimport.py +15 -12
  32. endoreg_db/views/video/video_stream.py +168 -50
  33. {endoreg_db-0.8.1.dist-info → endoreg_db-0.8.2.1.dist-info}/METADATA +2 -2
  34. {endoreg_db-0.8.1.dist-info → endoreg_db-0.8.2.1.dist-info}/RECORD +47 -43
  35. endoreg_db/views/video/media/__init__.py +0 -23
  36. /endoreg_db/{urls/pdf.py → config/__init__.py} +0 -0
  37. /endoreg_db/views/video/{media/task_status.py → task_status.py} +0 -0
  38. /endoreg_db/views/video/{media/video_analyze.py → video_analyze.py} +0 -0
  39. /endoreg_db/views/video/{media/video_apply_mask.py → video_apply_mask.py} +0 -0
  40. /endoreg_db/views/video/{media/video_correction.py → video_correction.py} +0 -0
  41. /endoreg_db/views/video/{media/video_download_processed.py → video_download_processed.py} +0 -0
  42. /endoreg_db/views/video/{media/video_media.py → video_media.py} +0 -0
  43. /endoreg_db/views/video/{media/video_meta.py → video_meta.py} +0 -0
  44. /endoreg_db/views/video/{media/video_processing_history.py → video_processing_history.py} +0 -0
  45. /endoreg_db/views/video/{media/video_remove_frames.py → video_remove_frames.py} +0 -0
  46. /endoreg_db/views/video/{media/video_reprocess.py → video_reprocess.py} +0 -0
  47. {endoreg_db-0.8.1.dist-info → endoreg_db-0.8.2.1.dist-info}/WHEEL +0 -0
  48. {endoreg_db-0.8.1.dist-info → endoreg_db-0.8.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -5,6 +5,7 @@ Provides high-level functions for importing and anonymizing PDF files,
5
5
  combining RawPdfFile creation with text extraction and anonymization.
6
6
  """
7
7
  from datetime import date, datetime
8
+ import errno
8
9
  import logging
9
10
  import shutil
10
11
  import sys
@@ -13,12 +14,11 @@ import hashlib
13
14
  from pathlib import Path
14
15
  from typing import TYPE_CHECKING, Union
15
16
  from contextlib import contextmanager
16
- from django.conf.locale import tr
17
17
  from django.db import transaction
18
18
  from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
19
19
  from endoreg_db.models.state.raw_pdf import RawPdfState
20
20
  from endoreg_db.models import SensitiveMeta
21
- from endoreg_db.utils.paths import PDF_DIR, STORAGE_DIR
21
+ from endoreg_db.utils import paths as path_utils
22
22
  import time
23
23
 
24
24
  logger = logging.getLogger(__name__)
@@ -111,14 +111,44 @@ class PdfImportService:
111
111
  break
112
112
  h.update(b)
113
113
  return h.hexdigest()
114
+
115
+ def _get_pdf_dir(self) -> Path | None:
116
+ """Resolve the configured PDF directory to a concrete Path."""
117
+ candidate = getattr(path_utils, "PDF_DIR", None)
118
+ if isinstance(candidate, Path):
119
+ return candidate
120
+ if candidate is None:
121
+ return None
122
+ try:
123
+ derived = candidate / "."
124
+ except Exception:
125
+ derived = None
126
+
127
+ if derived is not None:
128
+ try:
129
+ return Path(derived)
130
+ except Exception:
131
+ return None
132
+
133
+ try:
134
+ return Path(str(candidate))
135
+ except Exception:
136
+ return None
114
137
 
115
138
  def _quarantine(self, source: Path) -> Path:
116
139
  """Move file to quarantine directory to prevent re-processing."""
117
- qdir = PDF_DIR / "_processing"
140
+ qdir = path_utils.PDF_DIR / "_processing"
118
141
  qdir.mkdir(parents=True, exist_ok=True)
119
142
  target = qdir / source.name
120
- # atomic rename on same filesystem
121
- source.rename(target)
143
+ try:
144
+ # Try atomic rename first (fastest when on same filesystem)
145
+ source.rename(target)
146
+ except OSError as exc:
147
+ if exc.errno == errno.EXDEV:
148
+ # Cross-device move, fall back to shutil.move which copies+removes
149
+ shutil.move(str(source), str(target))
150
+ else:
151
+ raise
122
152
  return target
123
153
 
124
154
  def _ensure_state(self, pdf_file: "RawPdfFile"):
@@ -287,6 +317,7 @@ class PdfImportService:
287
317
  """Initialize the processing context for the current PDF."""
288
318
  self.processing_context = {
289
319
  'file_path': Path(file_path),
320
+ 'original_file_path': Path(file_path),
290
321
  'center_name': center_name,
291
322
  'delete_source': delete_source,
292
323
  'retry': retry,
@@ -379,11 +410,18 @@ class PdfImportService:
379
410
 
380
411
  def _setup_processing_environment(self):
381
412
  """Setup processing environment and state."""
413
+ original_path = self.processing_context.get('file_path')
414
+
382
415
  # Create sensitive file copy
383
- self.create_sensitive_file(self.current_pdf, self.processing_context['file_path'])
416
+ self.create_sensitive_file(self.current_pdf, original_path)
384
417
 
385
418
  # Update file path to point to sensitive copy
386
419
  self.processing_context['file_path'] = self.current_pdf.file.path
420
+ self.processing_context['sensitive_copy_created'] = True
421
+ try:
422
+ self.processing_context['sensitive_file_path'] = Path(self.current_pdf.file.path)
423
+ except Exception:
424
+ self.processing_context['sensitive_file_path'] = None
387
425
 
388
426
  # Ensure state exists
389
427
  state = self.current_pdf.get_or_create_state()
@@ -415,14 +453,14 @@ class PdfImportService:
415
453
  logger.info("Starting text extraction and metadata processing with ReportReader...")
416
454
 
417
455
  # Setup output directories
418
- crops_dir = PDF_DIR / 'cropped_regions'
419
- anonymized_dir = PDF_DIR / 'anonymized'
456
+ crops_dir = path_utils.PDF_DIR / 'cropped_regions'
457
+ anonymized_dir = path_utils.PDF_DIR / 'anonymized'
420
458
  crops_dir.mkdir(parents=True, exist_ok=True)
421
459
  anonymized_dir.mkdir(parents=True, exist_ok=True)
422
460
 
423
461
  # Initialize ReportReader
424
462
  report_reader = ReportReader(
425
- report_root_path=STORAGE_DIR,
463
+ report_root_path=str(path_utils.STORAGE_DIR),
426
464
  locale="de_DE",
427
465
  text_date_format="%d.%m.%Y"
428
466
  )
@@ -603,7 +641,7 @@ class PdfImportService:
603
641
  try:
604
642
  # Prefer storing a path relative to STORAGE_DIR so Django serves it correctly
605
643
  try:
606
- relative_name = str(anonymized_path.relative_to(STORAGE_DIR))
644
+ relative_name = str(anonymized_path.relative_to(path_utils.STORAGE_DIR))
607
645
  except ValueError:
608
646
  # Fallback to absolute path if the file lives outside STORAGE_DIR
609
647
  relative_name = str(anonymized_path)
@@ -637,36 +675,6 @@ class PdfImportService:
637
675
 
638
676
  except Exception as e:
639
677
  logger.warning("Could not set anonymized file reference: %s", e)
640
-
641
- '''def _apply_anonymized_pdf(self):
642
- """Apply anonymized PDF results."""
643
- if not self.current_pdf:
644
- logger.warning("Cannot apply anonymized PDF - no PDF instance available")
645
- return
646
-
647
- anonymized_pdf_path = self.processing_context.get('anonymized_pdf_path')
648
-
649
- if not anonymized_pdf_path:
650
- return
651
-
652
- anonymized_path = Path(anonymized_pdf_path)
653
- if anonymized_path.exists():
654
- logger.info(f"Anonymized PDF created by ReportReader at: {anonymized_path}")
655
- try:
656
- from django.core.files.base import File
657
- with open(anonymized_path, 'rb') as f:
658
- django_file = File(f)
659
- self.current_pdf.anonymized_file.save(
660
- anonymized_path.name,
661
- django_file,
662
- save=False
663
- )
664
- except Exception as e:
665
- logger.warning(f"Could not set anonymized file reference: {e}")
666
- else:
667
- logger.warning(f"Anonymized PDF path returned but file does not exist: {anonymized_path}")'''
668
-
669
-
670
678
 
671
679
 
672
680
  def _finalize_processing(self):
@@ -747,18 +755,96 @@ class PdfImportService:
747
755
  except Exception as e:
748
756
  logger.warning(f"Error during cleanup: {e}")
749
757
  finally:
758
+ # Remove any sensitive copy created during this processing run
759
+ sensitive_created = self.processing_context.get('sensitive_copy_created')
760
+ if sensitive_created:
761
+ pdf_obj = self.current_pdf
762
+ try:
763
+ if pdf_obj:
764
+ file_field = getattr(pdf_obj, "file", None)
765
+ if file_field and getattr(file_field, "name", None):
766
+ storage_name = file_field.name
767
+ file_field.delete(save=False)
768
+ logger.debug("Deleted sensitive copy %s during error cleanup", storage_name)
769
+ except Exception as cleanup_exc:
770
+ logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
771
+
750
772
  # Always clean up processed files set to prevent blocks
751
773
  file_path = self.processing_context.get('file_path')
752
774
  if file_path and str(file_path) in self.processed_files:
753
775
  self.processed_files.remove(str(file_path))
754
776
  logger.debug(f"Removed {file_path} from processed files during error cleanup")
755
777
 
778
+ try:
779
+ original_path = self.processing_context.get('original_file_path')
780
+ logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
781
+ raw_dir = original_path.parent if isinstance(original_path, Path) else None
782
+ if (
783
+ isinstance(original_path, Path)
784
+ and original_path.exists()
785
+ and not self.processing_context.get('sensitive_copy_created')
786
+ ):
787
+ try:
788
+ original_path.unlink()
789
+ logger.info("Removed original file %s during error cleanup", original_path)
790
+ except Exception as remove_exc:
791
+ logger.warning("Could not remove original file %s during error cleanup: %s", original_path, remove_exc)
792
+ pdf_dir = self._get_pdf_dir()
793
+ if not pdf_dir and raw_dir:
794
+ base_dir = raw_dir.parent
795
+ dir_name = getattr(path_utils, "PDF_DIR_NAME", "pdfs")
796
+ fallback_pdf_dir = base_dir / dir_name
797
+ logger.debug(
798
+ "PDF cleanup fallback resolution - base: %s, dir_name: %s, exists: %s",
799
+ base_dir,
800
+ dir_name,
801
+ fallback_pdf_dir.exists(),
802
+ )
803
+ if fallback_pdf_dir.exists():
804
+ pdf_dir = fallback_pdf_dir
805
+
806
+ # Remove empty PDF subdirectories that might have been created during setup
807
+ if pdf_dir and pdf_dir.exists():
808
+ for subdir_name in ("sensitive", "cropped_regions", "anonymized", "_processing"):
809
+ subdir_path = pdf_dir / subdir_name
810
+ if subdir_path.exists() and subdir_path.is_dir():
811
+ try:
812
+ next(subdir_path.iterdir())
813
+ except StopIteration:
814
+ try:
815
+ subdir_path.rmdir()
816
+ logger.debug("Removed empty directory %s during error cleanup", subdir_path)
817
+ except OSError as rm_err:
818
+ logger.debug("Could not remove directory %s: %s", subdir_path, rm_err)
819
+ except Exception as iter_err:
820
+ logger.debug("Could not inspect directory %s: %s", subdir_path, iter_err)
821
+
822
+ raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
823
+ pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
824
+
825
+ sensitive_path = self.processing_context.get('sensitive_file_path')
826
+ if sensitive_path:
827
+ sensitive_parent = Path(sensitive_path).parent
828
+ sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
829
+ else:
830
+ sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
831
+ sensitive_count = len(list(sensitive_dir.glob("*"))) if sensitive_dir and sensitive_dir.exists() else None
832
+
833
+ logger.info(
834
+ "PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
835
+ raw_count,
836
+ pdf_count,
837
+ sensitive_count,
838
+ )
839
+ except Exception:
840
+ pass
841
+
756
842
  def _cleanup_processing_context(self):
757
843
  """Cleanup processing context."""
758
844
  try:
759
845
  # Clean up temporary directories
760
846
  if self.processing_context.get('text_extracted'):
761
- crops_dir = PDF_DIR / 'cropped_regions'
847
+ crops_dir = path_utils.PDF_DIR / 'cropped_regions'
762
848
  if crops_dir.exists() and not any(crops_dir.iterdir()):
763
849
  crops_dir.rmdir()
764
850
 
@@ -887,7 +973,7 @@ class PdfImportService:
887
973
  if not source_path:
888
974
  raise ValueError("No file path available for creating sensitive file")
889
975
 
890
- SENSITIVE_DIR = PDF_DIR / "sensitive"
976
+ SENSITIVE_DIR = path_utils.PDF_DIR / "sensitive"
891
977
  target = SENSITIVE_DIR / f"{pdf_file.pdf_hash}.pdf"
892
978
 
893
979
  try:
@@ -910,7 +996,7 @@ class PdfImportService:
910
996
  # Update FileField to reference the file under STORAGE_DIR
911
997
  # We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
912
998
  try:
913
- relative_name = str(target.relative_to(STORAGE_DIR)) #just point the Django FileField to the file that the anonymizer already created in data/pdfs/anonymized/.
999
+ relative_name = str(target.relative_to(path_utils.STORAGE_DIR)) # Point Django FileField to sensitive storage
914
1000
  except ValueError:
915
1001
  # Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
916
1002
  relative_name = str(target)
@@ -964,7 +1050,7 @@ class PdfImportService:
964
1050
  if pdf_problematic:
965
1051
  # Quarantine the file
966
1052
  logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
967
- quarantine_dir = PDF_DIR / "quarantine"
1053
+ quarantine_dir = path_utils.PDF_DIR / "quarantine"
968
1054
  os.makedirs(quarantine_dir, exist_ok=True)
969
1055
 
970
1056
  quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
@@ -980,7 +1066,7 @@ class PdfImportService:
980
1066
  else:
981
1067
  # Archive the file normally
982
1068
  logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
983
- archive_dir = PDF_DIR / "processed"
1069
+ archive_dir = path_utils.PDF_DIR / "processed"
984
1070
  os.makedirs(archive_dir, exist_ok=True)
985
1071
 
986
1072
  archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"