endoreg-db: endoreg_db-0.8.2-py3-none-any.whl → endoreg_db-0.8.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of endoreg-db has been flagged as potentially problematic. More details are linked from the original diff page.
- endoreg_db/config/__init__.py +0 -0
- endoreg_db/helpers/default_objects.py +48 -29
- endoreg_db/management/commands/import_video.py +5 -3
- endoreg_db/migrations/0003_add_center_display_name.py +30 -0
- endoreg_db/models/administration/center/center.py +7 -1
- endoreg_db/models/media/pdf/raw_pdf.py +31 -26
- endoreg_db/models/media/video/create_from_file.py +26 -4
- endoreg_db/models/media/video/video_file.py +89 -57
- endoreg_db/models/media/video/video_file_anonymize.py +2 -1
- endoreg_db/models/media/video/video_file_frames/_manage_frame_range.py +12 -0
- endoreg_db/models/media/video/video_file_io.py +4 -2
- endoreg_db/models/metadata/sensitive_meta.py +6 -6
- endoreg_db/models/metadata/video_meta.py +2 -2
- endoreg_db/services/pdf_import.py +131 -15
- endoreg_db/services/pseudonym_service.py +1 -1
- endoreg_db/services/video_import.py +400 -387
- endoreg_db/urls/sensitive_meta.py +0 -0
- endoreg_db/utils/paths.py +2 -10
- endoreg_db/utils/video/ffmpeg_wrapper.py +67 -4
- endoreg_db/views/anonymization/validate.py +75 -34
- endoreg_db/views/video/correction.py +8 -6
- {endoreg_db-0.8.2.dist-info → endoreg_db-0.8.2.2.dist-info}/METADATA +2 -2
- {endoreg_db-0.8.2.dist-info → endoreg_db-0.8.2.2.dist-info}/RECORD +25 -23
- endoreg_db/services/ollama_api_docs.py +0 -1528
- {endoreg_db-0.8.2.dist-info → endoreg_db-0.8.2.2.dist-info}/WHEEL +0 -0
- {endoreg_db-0.8.2.dist-info → endoreg_db-0.8.2.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -37,7 +37,7 @@ class SensitiveMeta(models.Model):
|
|
|
37
37
|
blank=True,
|
|
38
38
|
null=True,
|
|
39
39
|
help_text="FK to the pseudo-anonymized Patient record."
|
|
40
|
-
)
|
|
40
|
+
) # type: ignore
|
|
41
41
|
patient_first_name = models.CharField(max_length=255, blank=True, null=True)
|
|
42
42
|
patient_last_name = models.CharField(max_length=255, blank=True, null=True)
|
|
43
43
|
patient_dob = models.DateTimeField(
|
|
@@ -51,24 +51,24 @@ class SensitiveMeta(models.Model):
|
|
|
51
51
|
blank=True,
|
|
52
52
|
null=True,
|
|
53
53
|
help_text="FK to the pseudo-anonymized PatientExamination record."
|
|
54
|
-
)
|
|
54
|
+
) # type: ignore
|
|
55
55
|
patient_gender = models.ForeignKey(
|
|
56
56
|
"Gender",
|
|
57
57
|
on_delete=models.CASCADE,
|
|
58
58
|
blank=True,
|
|
59
59
|
null=True,
|
|
60
|
-
)
|
|
60
|
+
) # type: ignore
|
|
61
61
|
examiners = models.ManyToManyField(
|
|
62
62
|
"Examiner",
|
|
63
63
|
blank=True,
|
|
64
64
|
help_text="Pseudo-anonymized examiner(s) associated with the examination."
|
|
65
|
-
)
|
|
65
|
+
) # type: ignore
|
|
66
66
|
center = models.ForeignKey(
|
|
67
67
|
"Center",
|
|
68
68
|
on_delete=models.CASCADE,
|
|
69
69
|
blank=True, # Should ideally be False if always required before save
|
|
70
70
|
null=True, # Should ideally be False
|
|
71
|
-
)
|
|
71
|
+
) # type: ignore
|
|
72
72
|
|
|
73
73
|
# Raw examiner names stored temporarily until pseudo-examiner is created/linked
|
|
74
74
|
examiner_first_name = models.CharField(max_length=255, blank=True, null=True, editable=False)
|
|
@@ -258,7 +258,7 @@ class SensitiveMeta(models.Model):
|
|
|
258
258
|
|
|
259
259
|
# 4. Handle ManyToMany linking (examiners) *after* the instance has a PK.
|
|
260
260
|
if examiner_to_link and self.pk and not self.examiners.filter(pk=examiner_to_link.pk).exists():
|
|
261
|
-
self.examiners.add(examiner_to_link)
|
|
261
|
+
self.examiners.add(examiner_to_link) # type: ignore
|
|
262
262
|
# Adding to M2M handles its own DB interaction, no second super().save() needed.
|
|
263
263
|
|
|
264
264
|
def mark_dob_verified(self):
|
|
@@ -13,7 +13,7 @@ else:
|
|
|
13
13
|
ENDOREG_CENTER_ID = settings.ENDOREG_CENTER_ID
|
|
14
14
|
|
|
15
15
|
# Import the new utility function
|
|
16
|
-
from ...utils.video
|
|
16
|
+
from ...utils.video import ffmpeg_wrapper
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
@@ -214,7 +214,7 @@ class FFMpegMeta(models.Model):
|
|
|
214
214
|
"""
|
|
215
215
|
logger.info("Running ffprobe on %s", file_path)
|
|
216
216
|
try:
|
|
217
|
-
probe_data = get_stream_info(file_path) # Use the new utility
|
|
217
|
+
probe_data = ffmpeg_wrapper.get_stream_info(file_path) # Use the new utility
|
|
218
218
|
except Exception as probe_err:
|
|
219
219
|
logger.error("ffprobe execution failed for %s: %s", file_path, probe_err, exc_info=True)
|
|
220
220
|
raise RuntimeError(f"ffprobe execution failed for {file_path}") from probe_err
|
|
@@ -5,6 +5,7 @@ Provides high-level functions for importing and anonymizing PDF files,
|
|
|
5
5
|
combining RawPdfFile creation with text extraction and anonymization.
|
|
6
6
|
"""
|
|
7
7
|
from datetime import date, datetime
|
|
8
|
+
import errno
|
|
8
9
|
import logging
|
|
9
10
|
import shutil
|
|
10
11
|
import sys
|
|
@@ -13,12 +14,11 @@ import hashlib
|
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from typing import TYPE_CHECKING, Union
|
|
15
16
|
from contextlib import contextmanager
|
|
16
|
-
from django.conf.locale import tr
|
|
17
17
|
from django.db import transaction
|
|
18
18
|
from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
|
|
19
19
|
from endoreg_db.models.state.raw_pdf import RawPdfState
|
|
20
20
|
from endoreg_db.models import SensitiveMeta
|
|
21
|
-
from endoreg_db.utils
|
|
21
|
+
from endoreg_db.utils import paths as path_utils
|
|
22
22
|
import time
|
|
23
23
|
|
|
24
24
|
logger = logging.getLogger(__name__)
|
|
@@ -111,14 +111,44 @@ class PdfImportService:
|
|
|
111
111
|
break
|
|
112
112
|
h.update(b)
|
|
113
113
|
return h.hexdigest()
|
|
114
|
+
|
|
115
|
+
def _get_pdf_dir(self) -> Path | None:
|
|
116
|
+
"""Resolve the configured PDF directory to a concrete Path."""
|
|
117
|
+
candidate = getattr(path_utils, "PDF_DIR", None)
|
|
118
|
+
if isinstance(candidate, Path):
|
|
119
|
+
return candidate
|
|
120
|
+
if candidate is None:
|
|
121
|
+
return None
|
|
122
|
+
try:
|
|
123
|
+
derived = candidate / "."
|
|
124
|
+
except Exception:
|
|
125
|
+
derived = None
|
|
126
|
+
|
|
127
|
+
if derived is not None:
|
|
128
|
+
try:
|
|
129
|
+
return Path(derived)
|
|
130
|
+
except Exception:
|
|
131
|
+
return None
|
|
132
|
+
|
|
133
|
+
try:
|
|
134
|
+
return Path(str(candidate))
|
|
135
|
+
except Exception:
|
|
136
|
+
return None
|
|
114
137
|
|
|
115
138
|
def _quarantine(self, source: Path) -> Path:
|
|
116
139
|
"""Move file to quarantine directory to prevent re-processing."""
|
|
117
|
-
qdir = PDF_DIR / "_processing"
|
|
140
|
+
qdir = path_utils.PDF_DIR / "_processing"
|
|
118
141
|
qdir.mkdir(parents=True, exist_ok=True)
|
|
119
142
|
target = qdir / source.name
|
|
120
|
-
|
|
121
|
-
|
|
143
|
+
try:
|
|
144
|
+
# Try atomic rename first (fastest when on same filesystem)
|
|
145
|
+
source.rename(target)
|
|
146
|
+
except OSError as exc:
|
|
147
|
+
if exc.errno == errno.EXDEV:
|
|
148
|
+
# Cross-device move, fall back to shutil.move which copies+removes
|
|
149
|
+
shutil.move(str(source), str(target))
|
|
150
|
+
else:
|
|
151
|
+
raise
|
|
122
152
|
return target
|
|
123
153
|
|
|
124
154
|
def _ensure_state(self, pdf_file: "RawPdfFile"):
|
|
@@ -287,6 +317,7 @@ class PdfImportService:
|
|
|
287
317
|
"""Initialize the processing context for the current PDF."""
|
|
288
318
|
self.processing_context = {
|
|
289
319
|
'file_path': Path(file_path),
|
|
320
|
+
'original_file_path': Path(file_path),
|
|
290
321
|
'center_name': center_name,
|
|
291
322
|
'delete_source': delete_source,
|
|
292
323
|
'retry': retry,
|
|
@@ -379,11 +410,18 @@ class PdfImportService:
|
|
|
379
410
|
|
|
380
411
|
def _setup_processing_environment(self):
|
|
381
412
|
"""Setup processing environment and state."""
|
|
413
|
+
original_path = self.processing_context.get('file_path')
|
|
414
|
+
|
|
382
415
|
# Create sensitive file copy
|
|
383
|
-
self.create_sensitive_file(self.current_pdf,
|
|
416
|
+
self.create_sensitive_file(self.current_pdf, original_path)
|
|
384
417
|
|
|
385
418
|
# Update file path to point to sensitive copy
|
|
386
419
|
self.processing_context['file_path'] = self.current_pdf.file.path
|
|
420
|
+
self.processing_context['sensitive_copy_created'] = True
|
|
421
|
+
try:
|
|
422
|
+
self.processing_context['sensitive_file_path'] = Path(self.current_pdf.file.path)
|
|
423
|
+
except Exception:
|
|
424
|
+
self.processing_context['sensitive_file_path'] = None
|
|
387
425
|
|
|
388
426
|
# Ensure state exists
|
|
389
427
|
state = self.current_pdf.get_or_create_state()
|
|
@@ -415,14 +453,14 @@ class PdfImportService:
|
|
|
415
453
|
logger.info("Starting text extraction and metadata processing with ReportReader...")
|
|
416
454
|
|
|
417
455
|
# Setup output directories
|
|
418
|
-
crops_dir = PDF_DIR / 'cropped_regions'
|
|
419
|
-
anonymized_dir = PDF_DIR / 'anonymized'
|
|
456
|
+
crops_dir = path_utils.PDF_DIR / 'cropped_regions'
|
|
457
|
+
anonymized_dir = path_utils.PDF_DIR / 'anonymized'
|
|
420
458
|
crops_dir.mkdir(parents=True, exist_ok=True)
|
|
421
459
|
anonymized_dir.mkdir(parents=True, exist_ok=True)
|
|
422
460
|
|
|
423
461
|
# Initialize ReportReader
|
|
424
462
|
report_reader = ReportReader(
|
|
425
|
-
report_root_path=STORAGE_DIR,
|
|
463
|
+
report_root_path=str(path_utils.STORAGE_DIR),
|
|
426
464
|
locale="de_DE",
|
|
427
465
|
text_date_format="%d.%m.%Y"
|
|
428
466
|
)
|
|
@@ -603,7 +641,7 @@ class PdfImportService:
|
|
|
603
641
|
try:
|
|
604
642
|
# Prefer storing a path relative to STORAGE_DIR so Django serves it correctly
|
|
605
643
|
try:
|
|
606
|
-
relative_name = str(anonymized_path.relative_to(STORAGE_DIR))
|
|
644
|
+
relative_name = str(anonymized_path.relative_to(path_utils.STORAGE_DIR))
|
|
607
645
|
except ValueError:
|
|
608
646
|
# Fallback to absolute path if the file lives outside STORAGE_DIR
|
|
609
647
|
relative_name = str(anonymized_path)
|
|
@@ -717,18 +755,96 @@ class PdfImportService:
|
|
|
717
755
|
except Exception as e:
|
|
718
756
|
logger.warning(f"Error during cleanup: {e}")
|
|
719
757
|
finally:
|
|
758
|
+
# Remove any sensitive copy created during this processing run
|
|
759
|
+
sensitive_created = self.processing_context.get('sensitive_copy_created')
|
|
760
|
+
if sensitive_created:
|
|
761
|
+
pdf_obj = self.current_pdf
|
|
762
|
+
try:
|
|
763
|
+
if pdf_obj:
|
|
764
|
+
file_field = getattr(pdf_obj, "file", None)
|
|
765
|
+
if file_field and getattr(file_field, "name", None):
|
|
766
|
+
storage_name = file_field.name
|
|
767
|
+
file_field.delete(save=False)
|
|
768
|
+
logger.debug("Deleted sensitive copy %s during error cleanup", storage_name)
|
|
769
|
+
except Exception as cleanup_exc:
|
|
770
|
+
logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
|
|
771
|
+
|
|
720
772
|
# Always clean up processed files set to prevent blocks
|
|
721
773
|
file_path = self.processing_context.get('file_path')
|
|
722
774
|
if file_path and str(file_path) in self.processed_files:
|
|
723
775
|
self.processed_files.remove(str(file_path))
|
|
724
776
|
logger.debug(f"Removed {file_path} from processed files during error cleanup")
|
|
725
777
|
|
|
778
|
+
try:
|
|
779
|
+
original_path = self.processing_context.get('original_file_path')
|
|
780
|
+
logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
|
|
781
|
+
raw_dir = original_path.parent if isinstance(original_path, Path) else None
|
|
782
|
+
if (
|
|
783
|
+
isinstance(original_path, Path)
|
|
784
|
+
and original_path.exists()
|
|
785
|
+
and not self.processing_context.get('sensitive_copy_created')
|
|
786
|
+
):
|
|
787
|
+
try:
|
|
788
|
+
original_path.unlink()
|
|
789
|
+
logger.info("Removed original file %s during error cleanup", original_path)
|
|
790
|
+
except Exception as remove_exc:
|
|
791
|
+
logger.warning("Could not remove original file %s during error cleanup: %s", original_path, remove_exc)
|
|
792
|
+
pdf_dir = self._get_pdf_dir()
|
|
793
|
+
if not pdf_dir and raw_dir:
|
|
794
|
+
base_dir = raw_dir.parent
|
|
795
|
+
dir_name = getattr(path_utils, "PDF_DIR_NAME", "pdfs")
|
|
796
|
+
fallback_pdf_dir = base_dir / dir_name
|
|
797
|
+
logger.debug(
|
|
798
|
+
"PDF cleanup fallback resolution - base: %s, dir_name: %s, exists: %s",
|
|
799
|
+
base_dir,
|
|
800
|
+
dir_name,
|
|
801
|
+
fallback_pdf_dir.exists(),
|
|
802
|
+
)
|
|
803
|
+
if fallback_pdf_dir.exists():
|
|
804
|
+
pdf_dir = fallback_pdf_dir
|
|
805
|
+
|
|
806
|
+
# Remove empty PDF subdirectories that might have been created during setup
|
|
807
|
+
if pdf_dir and pdf_dir.exists():
|
|
808
|
+
for subdir_name in ("sensitive", "cropped_regions", "anonymized", "_processing"):
|
|
809
|
+
subdir_path = pdf_dir / subdir_name
|
|
810
|
+
if subdir_path.exists() and subdir_path.is_dir():
|
|
811
|
+
try:
|
|
812
|
+
next(subdir_path.iterdir())
|
|
813
|
+
except StopIteration:
|
|
814
|
+
try:
|
|
815
|
+
subdir_path.rmdir()
|
|
816
|
+
logger.debug("Removed empty directory %s during error cleanup", subdir_path)
|
|
817
|
+
except OSError as rm_err:
|
|
818
|
+
logger.debug("Could not remove directory %s: %s", subdir_path, rm_err)
|
|
819
|
+
except Exception as iter_err:
|
|
820
|
+
logger.debug("Could not inspect directory %s: %s", subdir_path, iter_err)
|
|
821
|
+
|
|
822
|
+
raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
|
|
823
|
+
pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
|
|
824
|
+
|
|
825
|
+
sensitive_path = self.processing_context.get('sensitive_file_path')
|
|
826
|
+
if sensitive_path:
|
|
827
|
+
sensitive_parent = Path(sensitive_path).parent
|
|
828
|
+
sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
|
|
829
|
+
else:
|
|
830
|
+
sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
|
|
831
|
+
sensitive_count = len(list(sensitive_dir.glob("*"))) if sensitive_dir and sensitive_dir.exists() else None
|
|
832
|
+
|
|
833
|
+
logger.info(
|
|
834
|
+
"PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
|
|
835
|
+
raw_count,
|
|
836
|
+
pdf_count,
|
|
837
|
+
sensitive_count,
|
|
838
|
+
)
|
|
839
|
+
except Exception:
|
|
840
|
+
pass
|
|
841
|
+
|
|
726
842
|
def _cleanup_processing_context(self):
|
|
727
843
|
"""Cleanup processing context."""
|
|
728
844
|
try:
|
|
729
845
|
# Clean up temporary directories
|
|
730
846
|
if self.processing_context.get('text_extracted'):
|
|
731
|
-
crops_dir = PDF_DIR / 'cropped_regions'
|
|
847
|
+
crops_dir = path_utils.PDF_DIR / 'cropped_regions'
|
|
732
848
|
if crops_dir.exists() and not any(crops_dir.iterdir()):
|
|
733
849
|
crops_dir.rmdir()
|
|
734
850
|
|
|
@@ -857,7 +973,7 @@ class PdfImportService:
|
|
|
857
973
|
if not source_path:
|
|
858
974
|
raise ValueError("No file path available for creating sensitive file")
|
|
859
975
|
|
|
860
|
-
SENSITIVE_DIR = PDF_DIR / "sensitive"
|
|
976
|
+
SENSITIVE_DIR = path_utils.PDF_DIR / "sensitive"
|
|
861
977
|
target = SENSITIVE_DIR / f"{pdf_file.pdf_hash}.pdf"
|
|
862
978
|
|
|
863
979
|
try:
|
|
@@ -880,7 +996,7 @@ class PdfImportService:
|
|
|
880
996
|
# Update FileField to reference the file under STORAGE_DIR
|
|
881
997
|
# We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
|
|
882
998
|
try:
|
|
883
|
-
relative_name = str(target.relative_to(STORAGE_DIR))
|
|
999
|
+
relative_name = str(target.relative_to(path_utils.STORAGE_DIR)) # Point Django FileField to sensitive storage
|
|
884
1000
|
except ValueError:
|
|
885
1001
|
# Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
|
|
886
1002
|
relative_name = str(target)
|
|
@@ -934,7 +1050,7 @@ class PdfImportService:
|
|
|
934
1050
|
if pdf_problematic:
|
|
935
1051
|
# Quarantine the file
|
|
936
1052
|
logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
|
|
937
|
-
quarantine_dir = PDF_DIR / "quarantine"
|
|
1053
|
+
quarantine_dir = path_utils.PDF_DIR / "quarantine"
|
|
938
1054
|
os.makedirs(quarantine_dir, exist_ok=True)
|
|
939
1055
|
|
|
940
1056
|
quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
|
|
@@ -950,7 +1066,7 @@ class PdfImportService:
|
|
|
950
1066
|
else:
|
|
951
1067
|
# Archive the file normally
|
|
952
1068
|
logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
|
|
953
|
-
archive_dir = PDF_DIR / "processed"
|
|
1069
|
+
archive_dir = path_utils.PDF_DIR / "processed"
|
|
954
1070
|
os.makedirs(archive_dir, exist_ok=True)
|
|
955
1071
|
|
|
956
1072
|
archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
|
|
@@ -56,7 +56,7 @@ def generate_patient_pseudonym(patient: Patient) -> Tuple[str, bool]:
|
|
|
56
56
|
patient.patient_hash = patient_hash
|
|
57
57
|
patient.save(update_fields=['patient_hash'])
|
|
58
58
|
|
|
59
|
-
logger.info(f"Generated and persisted pseudonym for patient {patient.id}
|
|
59
|
+
logger.info(f"Generated and persisted pseudonym for patient {patient.id}")
|
|
60
60
|
|
|
61
61
|
return patient_hash, True
|
|
62
62
|
|