endoreg-db 0.8.1__py3-none-any.whl → 0.8.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of endoreg-db might be problematic. Click here for more details.
- endoreg_db/helpers/download_segmentation_model.py +31 -0
- endoreg_db/migrations/0003_add_center_display_name.py +30 -0
- endoreg_db/models/administration/center/center.py +7 -1
- endoreg_db/models/media/pdf/raw_pdf.py +31 -26
- endoreg_db/models/media/video/create_from_file.py +26 -4
- endoreg_db/models/media/video/pipe_1.py +13 -1
- endoreg_db/models/media/video/video_file.py +36 -13
- endoreg_db/models/media/video/video_file_anonymize.py +2 -1
- endoreg_db/models/media/video/video_file_frames/_manage_frame_range.py +12 -0
- endoreg_db/models/media/video/video_file_io.py +4 -2
- endoreg_db/models/metadata/video_meta.py +2 -2
- endoreg_db/serializers/anonymization.py +3 -0
- endoreg_db/services/pdf_import.py +131 -45
- endoreg_db/services/video_import.py +427 -128
- endoreg_db/urls/__init__.py +0 -2
- endoreg_db/urls/media.py +201 -4
- endoreg_db/urls/report.py +0 -30
- endoreg_db/urls/sensitive_meta.py +0 -36
- endoreg_db/urls/video.py +30 -88
- endoreg_db/utils/paths.py +2 -10
- endoreg_db/utils/video/ffmpeg_wrapper.py +67 -4
- endoreg_db/views/anonymization/validate.py +76 -32
- endoreg_db/views/media/__init__.py +38 -2
- endoreg_db/views/media/pdf_media.py +1 -1
- endoreg_db/views/media/segments.py +71 -0
- endoreg_db/views/media/sensitive_metadata.py +314 -0
- endoreg_db/views/media/video_segments.py +596 -0
- endoreg_db/views/pdf/reimport.py +18 -8
- endoreg_db/views/video/__init__.py +0 -8
- endoreg_db/views/video/correction.py +34 -32
- endoreg_db/views/video/reimport.py +15 -12
- endoreg_db/views/video/video_stream.py +168 -50
- {endoreg_db-0.8.1.dist-info → endoreg_db-0.8.2.1.dist-info}/METADATA +2 -2
- {endoreg_db-0.8.1.dist-info → endoreg_db-0.8.2.1.dist-info}/RECORD +47 -43
- endoreg_db/views/video/media/__init__.py +0 -23
- /endoreg_db/{urls/pdf.py → config/__init__.py} +0 -0
- /endoreg_db/views/video/{media/task_status.py → task_status.py} +0 -0
- /endoreg_db/views/video/{media/video_analyze.py → video_analyze.py} +0 -0
- /endoreg_db/views/video/{media/video_apply_mask.py → video_apply_mask.py} +0 -0
- /endoreg_db/views/video/{media/video_correction.py → video_correction.py} +0 -0
- /endoreg_db/views/video/{media/video_download_processed.py → video_download_processed.py} +0 -0
- /endoreg_db/views/video/{media/video_media.py → video_media.py} +0 -0
- /endoreg_db/views/video/{media/video_meta.py → video_meta.py} +0 -0
- /endoreg_db/views/video/{media/video_processing_history.py → video_processing_history.py} +0 -0
- /endoreg_db/views/video/{media/video_remove_frames.py → video_remove_frames.py} +0 -0
- /endoreg_db/views/video/{media/video_reprocess.py → video_reprocess.py} +0 -0
- {endoreg_db-0.8.1.dist-info → endoreg_db-0.8.2.1.dist-info}/WHEEL +0 -0
- {endoreg_db-0.8.1.dist-info → endoreg_db-0.8.2.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,6 +5,7 @@ Provides high-level functions for importing and anonymizing PDF files,
|
|
|
5
5
|
combining RawPdfFile creation with text extraction and anonymization.
|
|
6
6
|
"""
|
|
7
7
|
from datetime import date, datetime
|
|
8
|
+
import errno
|
|
8
9
|
import logging
|
|
9
10
|
import shutil
|
|
10
11
|
import sys
|
|
@@ -13,12 +14,11 @@ import hashlib
|
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from typing import TYPE_CHECKING, Union
|
|
15
16
|
from contextlib import contextmanager
|
|
16
|
-
from django.conf.locale import tr
|
|
17
17
|
from django.db import transaction
|
|
18
18
|
from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
|
|
19
19
|
from endoreg_db.models.state.raw_pdf import RawPdfState
|
|
20
20
|
from endoreg_db.models import SensitiveMeta
|
|
21
|
-
from endoreg_db.utils
|
|
21
|
+
from endoreg_db.utils import paths as path_utils
|
|
22
22
|
import time
|
|
23
23
|
|
|
24
24
|
logger = logging.getLogger(__name__)
|
|
@@ -111,14 +111,44 @@ class PdfImportService:
|
|
|
111
111
|
break
|
|
112
112
|
h.update(b)
|
|
113
113
|
return h.hexdigest()
|
|
114
|
+
|
|
115
|
+
def _get_pdf_dir(self) -> Path | None:
    """Return ``path_utils.PDF_DIR`` as a ``Path``, or ``None``.

    ``None`` is returned when the attribute is missing or ``None``, or when
    the configured value cannot be converted to a ``Path``.
    """
    configured = getattr(path_utils, "PDF_DIR", None)

    # Already usable (a real Path) or explicitly unset: hand it back unchanged.
    if configured is None or isinstance(configured, Path):
        return configured

    # Probe whether the object supports the ``/`` join operator; if it does,
    # the joined result is what gets converted below.
    try:
        joined = configured / "."
    except Exception:
        joined = None

    # Otherwise fall back to the object's string form. Any failure while
    # converting (including inside ``str()``) yields None.
    try:
        return Path(str(configured)) if joined is None else Path(joined)
    except Exception:
        return None
|
|
114
137
|
|
|
115
138
|
def _quarantine(self, source: Path) -> Path:
    """Move ``source`` into ``PDF_DIR/_processing`` and return its new path.

    The ``_processing`` directory is created if it does not exist. The file
    keeps its original name.

    Raises:
        OSError: if the rename fails for any reason other than ``EXDEV``.
    """
    processing_dir = path_utils.PDF_DIR / "_processing"
    processing_dir.mkdir(parents=True, exist_ok=True)
    destination = processing_dir / source.name

    try:
        # A plain rename is the cheap path when both locations share a filesystem.
        source.rename(destination)
    except OSError as move_err:
        if move_err.errno != errno.EXDEV:
            raise
        # EXDEV: source and destination are on different devices, so let
        # shutil.move copy the file across and remove the original.
        shutil.move(str(source), str(destination))

    return destination
|
|
123
153
|
|
|
124
154
|
def _ensure_state(self, pdf_file: "RawPdfFile"):
|
|
@@ -287,6 +317,7 @@ class PdfImportService:
|
|
|
287
317
|
"""Initialize the processing context for the current PDF."""
|
|
288
318
|
self.processing_context = {
|
|
289
319
|
'file_path': Path(file_path),
|
|
320
|
+
'original_file_path': Path(file_path),
|
|
290
321
|
'center_name': center_name,
|
|
291
322
|
'delete_source': delete_source,
|
|
292
323
|
'retry': retry,
|
|
@@ -379,11 +410,18 @@ class PdfImportService:
|
|
|
379
410
|
|
|
380
411
|
def _setup_processing_environment(self):
|
|
381
412
|
"""Setup processing environment and state."""
|
|
413
|
+
original_path = self.processing_context.get('file_path')
|
|
414
|
+
|
|
382
415
|
# Create sensitive file copy
|
|
383
|
-
self.create_sensitive_file(self.current_pdf,
|
|
416
|
+
self.create_sensitive_file(self.current_pdf, original_path)
|
|
384
417
|
|
|
385
418
|
# Update file path to point to sensitive copy
|
|
386
419
|
self.processing_context['file_path'] = self.current_pdf.file.path
|
|
420
|
+
self.processing_context['sensitive_copy_created'] = True
|
|
421
|
+
try:
|
|
422
|
+
self.processing_context['sensitive_file_path'] = Path(self.current_pdf.file.path)
|
|
423
|
+
except Exception:
|
|
424
|
+
self.processing_context['sensitive_file_path'] = None
|
|
387
425
|
|
|
388
426
|
# Ensure state exists
|
|
389
427
|
state = self.current_pdf.get_or_create_state()
|
|
@@ -415,14 +453,14 @@ class PdfImportService:
|
|
|
415
453
|
logger.info("Starting text extraction and metadata processing with ReportReader...")
|
|
416
454
|
|
|
417
455
|
# Setup output directories
|
|
418
|
-
crops_dir = PDF_DIR / 'cropped_regions'
|
|
419
|
-
anonymized_dir = PDF_DIR / 'anonymized'
|
|
456
|
+
crops_dir = path_utils.PDF_DIR / 'cropped_regions'
|
|
457
|
+
anonymized_dir = path_utils.PDF_DIR / 'anonymized'
|
|
420
458
|
crops_dir.mkdir(parents=True, exist_ok=True)
|
|
421
459
|
anonymized_dir.mkdir(parents=True, exist_ok=True)
|
|
422
460
|
|
|
423
461
|
# Initialize ReportReader
|
|
424
462
|
report_reader = ReportReader(
|
|
425
|
-
report_root_path=STORAGE_DIR,
|
|
463
|
+
report_root_path=str(path_utils.STORAGE_DIR),
|
|
426
464
|
locale="de_DE",
|
|
427
465
|
text_date_format="%d.%m.%Y"
|
|
428
466
|
)
|
|
@@ -603,7 +641,7 @@ class PdfImportService:
|
|
|
603
641
|
try:
|
|
604
642
|
# Prefer storing a path relative to STORAGE_DIR so Django serves it correctly
|
|
605
643
|
try:
|
|
606
|
-
relative_name = str(anonymized_path.relative_to(STORAGE_DIR))
|
|
644
|
+
relative_name = str(anonymized_path.relative_to(path_utils.STORAGE_DIR))
|
|
607
645
|
except ValueError:
|
|
608
646
|
# Fallback to absolute path if the file lives outside STORAGE_DIR
|
|
609
647
|
relative_name = str(anonymized_path)
|
|
@@ -637,36 +675,6 @@ class PdfImportService:
|
|
|
637
675
|
|
|
638
676
|
except Exception as e:
|
|
639
677
|
logger.warning("Could not set anonymized file reference: %s", e)
|
|
640
|
-
|
|
641
|
-
'''def _apply_anonymized_pdf(self):
|
|
642
|
-
"""Apply anonymized PDF results."""
|
|
643
|
-
if not self.current_pdf:
|
|
644
|
-
logger.warning("Cannot apply anonymized PDF - no PDF instance available")
|
|
645
|
-
return
|
|
646
|
-
|
|
647
|
-
anonymized_pdf_path = self.processing_context.get('anonymized_pdf_path')
|
|
648
|
-
|
|
649
|
-
if not anonymized_pdf_path:
|
|
650
|
-
return
|
|
651
|
-
|
|
652
|
-
anonymized_path = Path(anonymized_pdf_path)
|
|
653
|
-
if anonymized_path.exists():
|
|
654
|
-
logger.info(f"Anonymized PDF created by ReportReader at: {anonymized_path}")
|
|
655
|
-
try:
|
|
656
|
-
from django.core.files.base import File
|
|
657
|
-
with open(anonymized_path, 'rb') as f:
|
|
658
|
-
django_file = File(f)
|
|
659
|
-
self.current_pdf.anonymized_file.save(
|
|
660
|
-
anonymized_path.name,
|
|
661
|
-
django_file,
|
|
662
|
-
save=False
|
|
663
|
-
)
|
|
664
|
-
except Exception as e:
|
|
665
|
-
logger.warning(f"Could not set anonymized file reference: {e}")
|
|
666
|
-
else:
|
|
667
|
-
logger.warning(f"Anonymized PDF path returned but file does not exist: {anonymized_path}")'''
|
|
668
|
-
|
|
669
|
-
|
|
670
678
|
|
|
671
679
|
|
|
672
680
|
def _finalize_processing(self):
|
|
@@ -747,18 +755,96 @@ class PdfImportService:
|
|
|
747
755
|
except Exception as e:
|
|
748
756
|
logger.warning(f"Error during cleanup: {e}")
|
|
749
757
|
finally:
|
|
758
|
+
# Remove any sensitive copy created during this processing run
|
|
759
|
+
sensitive_created = self.processing_context.get('sensitive_copy_created')
|
|
760
|
+
if sensitive_created:
|
|
761
|
+
pdf_obj = self.current_pdf
|
|
762
|
+
try:
|
|
763
|
+
if pdf_obj:
|
|
764
|
+
file_field = getattr(pdf_obj, "file", None)
|
|
765
|
+
if file_field and getattr(file_field, "name", None):
|
|
766
|
+
storage_name = file_field.name
|
|
767
|
+
file_field.delete(save=False)
|
|
768
|
+
logger.debug("Deleted sensitive copy %s during error cleanup", storage_name)
|
|
769
|
+
except Exception as cleanup_exc:
|
|
770
|
+
logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
|
|
771
|
+
|
|
750
772
|
# Always clean up processed files set to prevent blocks
|
|
751
773
|
file_path = self.processing_context.get('file_path')
|
|
752
774
|
if file_path and str(file_path) in self.processed_files:
|
|
753
775
|
self.processed_files.remove(str(file_path))
|
|
754
776
|
logger.debug(f"Removed {file_path} from processed files during error cleanup")
|
|
755
777
|
|
|
778
|
+
try:
|
|
779
|
+
original_path = self.processing_context.get('original_file_path')
|
|
780
|
+
logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
|
|
781
|
+
raw_dir = original_path.parent if isinstance(original_path, Path) else None
|
|
782
|
+
if (
|
|
783
|
+
isinstance(original_path, Path)
|
|
784
|
+
and original_path.exists()
|
|
785
|
+
and not self.processing_context.get('sensitive_copy_created')
|
|
786
|
+
):
|
|
787
|
+
try:
|
|
788
|
+
original_path.unlink()
|
|
789
|
+
logger.info("Removed original file %s during error cleanup", original_path)
|
|
790
|
+
except Exception as remove_exc:
|
|
791
|
+
logger.warning("Could not remove original file %s during error cleanup: %s", original_path, remove_exc)
|
|
792
|
+
pdf_dir = self._get_pdf_dir()
|
|
793
|
+
if not pdf_dir and raw_dir:
|
|
794
|
+
base_dir = raw_dir.parent
|
|
795
|
+
dir_name = getattr(path_utils, "PDF_DIR_NAME", "pdfs")
|
|
796
|
+
fallback_pdf_dir = base_dir / dir_name
|
|
797
|
+
logger.debug(
|
|
798
|
+
"PDF cleanup fallback resolution - base: %s, dir_name: %s, exists: %s",
|
|
799
|
+
base_dir,
|
|
800
|
+
dir_name,
|
|
801
|
+
fallback_pdf_dir.exists(),
|
|
802
|
+
)
|
|
803
|
+
if fallback_pdf_dir.exists():
|
|
804
|
+
pdf_dir = fallback_pdf_dir
|
|
805
|
+
|
|
806
|
+
# Remove empty PDF subdirectories that might have been created during setup
|
|
807
|
+
if pdf_dir and pdf_dir.exists():
|
|
808
|
+
for subdir_name in ("sensitive", "cropped_regions", "anonymized", "_processing"):
|
|
809
|
+
subdir_path = pdf_dir / subdir_name
|
|
810
|
+
if subdir_path.exists() and subdir_path.is_dir():
|
|
811
|
+
try:
|
|
812
|
+
next(subdir_path.iterdir())
|
|
813
|
+
except StopIteration:
|
|
814
|
+
try:
|
|
815
|
+
subdir_path.rmdir()
|
|
816
|
+
logger.debug("Removed empty directory %s during error cleanup", subdir_path)
|
|
817
|
+
except OSError as rm_err:
|
|
818
|
+
logger.debug("Could not remove directory %s: %s", subdir_path, rm_err)
|
|
819
|
+
except Exception as iter_err:
|
|
820
|
+
logger.debug("Could not inspect directory %s: %s", subdir_path, iter_err)
|
|
821
|
+
|
|
822
|
+
raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
|
|
823
|
+
pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
|
|
824
|
+
|
|
825
|
+
sensitive_path = self.processing_context.get('sensitive_file_path')
|
|
826
|
+
if sensitive_path:
|
|
827
|
+
sensitive_parent = Path(sensitive_path).parent
|
|
828
|
+
sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
|
|
829
|
+
else:
|
|
830
|
+
sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
|
|
831
|
+
sensitive_count = len(list(sensitive_dir.glob("*"))) if sensitive_dir and sensitive_dir.exists() else None
|
|
832
|
+
|
|
833
|
+
logger.info(
|
|
834
|
+
"PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
|
|
835
|
+
raw_count,
|
|
836
|
+
pdf_count,
|
|
837
|
+
sensitive_count,
|
|
838
|
+
)
|
|
839
|
+
except Exception:
|
|
840
|
+
pass
|
|
841
|
+
|
|
756
842
|
def _cleanup_processing_context(self):
|
|
757
843
|
"""Cleanup processing context."""
|
|
758
844
|
try:
|
|
759
845
|
# Clean up temporary directories
|
|
760
846
|
if self.processing_context.get('text_extracted'):
|
|
761
|
-
crops_dir = PDF_DIR / 'cropped_regions'
|
|
847
|
+
crops_dir = path_utils.PDF_DIR / 'cropped_regions'
|
|
762
848
|
if crops_dir.exists() and not any(crops_dir.iterdir()):
|
|
763
849
|
crops_dir.rmdir()
|
|
764
850
|
|
|
@@ -887,7 +973,7 @@ class PdfImportService:
|
|
|
887
973
|
if not source_path:
|
|
888
974
|
raise ValueError("No file path available for creating sensitive file")
|
|
889
975
|
|
|
890
|
-
SENSITIVE_DIR = PDF_DIR / "sensitive"
|
|
976
|
+
SENSITIVE_DIR = path_utils.PDF_DIR / "sensitive"
|
|
891
977
|
target = SENSITIVE_DIR / f"{pdf_file.pdf_hash}.pdf"
|
|
892
978
|
|
|
893
979
|
try:
|
|
@@ -910,7 +996,7 @@ class PdfImportService:
|
|
|
910
996
|
# Update FileField to reference the file under STORAGE_DIR
|
|
911
997
|
# We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
|
|
912
998
|
try:
|
|
913
|
-
relative_name = str(target.relative_to(STORAGE_DIR))
|
|
999
|
+
relative_name = str(target.relative_to(path_utils.STORAGE_DIR)) # Point Django FileField to sensitive storage
|
|
914
1000
|
except ValueError:
|
|
915
1001
|
# Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
|
|
916
1002
|
relative_name = str(target)
|
|
@@ -964,7 +1050,7 @@ class PdfImportService:
|
|
|
964
1050
|
if pdf_problematic:
|
|
965
1051
|
# Quarantine the file
|
|
966
1052
|
logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
|
|
967
|
-
quarantine_dir = PDF_DIR / "quarantine"
|
|
1053
|
+
quarantine_dir = path_utils.PDF_DIR / "quarantine"
|
|
968
1054
|
os.makedirs(quarantine_dir, exist_ok=True)
|
|
969
1055
|
|
|
970
1056
|
quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
|
|
@@ -980,7 +1066,7 @@ class PdfImportService:
|
|
|
980
1066
|
else:
|
|
981
1067
|
# Archive the file normally
|
|
982
1068
|
logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
|
|
983
|
-
archive_dir = PDF_DIR / "processed"
|
|
1069
|
+
archive_dir = path_utils.PDF_DIR / "processed"
|
|
984
1070
|
os.makedirs(archive_dir, exist_ok=True)
|
|
985
1071
|
|
|
986
1072
|
archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
|