endoreg-db 0.8.2__py3-none-any.whl → 0.8.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of endoreg-db might be problematic. Click here for more details.
- endoreg_db/config/__init__.py +0 -0
- endoreg_db/migrations/0003_add_center_display_name.py +30 -0
- endoreg_db/models/administration/center/center.py +7 -1
- endoreg_db/models/media/pdf/raw_pdf.py +31 -26
- endoreg_db/models/media/video/create_from_file.py +26 -4
- endoreg_db/models/media/video/video_file.py +36 -13
- endoreg_db/models/media/video/video_file_anonymize.py +2 -1
- endoreg_db/models/media/video/video_file_frames/_manage_frame_range.py +12 -0
- endoreg_db/models/media/video/video_file_io.py +4 -2
- endoreg_db/models/metadata/video_meta.py +2 -2
- endoreg_db/services/pdf_import.py +131 -15
- endoreg_db/services/video_import.py +158 -62
- endoreg_db/urls/sensitive_meta.py +0 -0
- endoreg_db/utils/paths.py +2 -10
- endoreg_db/utils/video/ffmpeg_wrapper.py +67 -4
- endoreg_db/views/anonymization/validate.py +75 -34
- endoreg_db/views/video/correction.py +8 -6
- {endoreg_db-0.8.2.dist-info → endoreg_db-0.8.2.1.dist-info}/METADATA +2 -2
- {endoreg_db-0.8.2.dist-info → endoreg_db-0.8.2.1.dist-info}/RECORD +21 -18
- {endoreg_db-0.8.2.dist-info → endoreg_db-0.8.2.1.dist-info}/WHEEL +0 -0
- {endoreg_db-0.8.2.dist-info → endoreg_db-0.8.2.1.dist-info}/licenses/LICENSE +0 -0
|
File without changes
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from django.db import migrations, models
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def populate_display_name(apps, schema_editor):
|
|
5
|
+
Center = apps.get_model('endoreg_db', 'Center')
|
|
6
|
+
for center in Center.objects.all():
|
|
7
|
+
if not center.display_name:
|
|
8
|
+
center.display_name = center.name
|
|
9
|
+
center.save(update_fields=['display_name'])
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def reset_display_name(apps, schema_editor):
|
|
13
|
+
Center = apps.get_model('endoreg_db', 'Center')
|
|
14
|
+
Center.objects.update(display_name='')
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Migration(migrations.Migration):
|
|
18
|
+
|
|
19
|
+
dependencies = [
|
|
20
|
+
('endoreg_db', '0002_add_video_correction_models'),
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
operations = [
|
|
24
|
+
migrations.AddField(
|
|
25
|
+
model_name='center',
|
|
26
|
+
name='display_name',
|
|
27
|
+
field=models.CharField(blank=True, default='', max_length=255),
|
|
28
|
+
),
|
|
29
|
+
migrations.RunPython(populate_display_name, reset_display_name),
|
|
30
|
+
]
|
|
@@ -19,6 +19,7 @@ class Center(models.Model):
|
|
|
19
19
|
|
|
20
20
|
# import_id = models.IntegerField(primary_key=True)
|
|
21
21
|
name = models.CharField(max_length=255)
|
|
22
|
+
display_name = models.CharField(max_length=255, blank=True, default="")
|
|
22
23
|
|
|
23
24
|
first_names = models.ManyToManyField(
|
|
24
25
|
to="FirstName",
|
|
@@ -45,8 +46,13 @@ class Center(models.Model):
|
|
|
45
46
|
def natural_key(self) -> tuple[str]:
|
|
46
47
|
return (self.name,)
|
|
47
48
|
|
|
49
|
+
def save(self, *args, **kwargs):
|
|
50
|
+
if not self.display_name:
|
|
51
|
+
self.display_name = self.name
|
|
52
|
+
super().save(*args, **kwargs)
|
|
53
|
+
|
|
48
54
|
def __str__(self) -> str:
|
|
49
|
-
return str(object=self.name)
|
|
55
|
+
return str(object=self.display_name or self.name)
|
|
50
56
|
|
|
51
57
|
def get_first_names(self):
|
|
52
58
|
return self.first_names.all()
|
|
@@ -383,37 +383,42 @@ class RawPdfFile(models.Model):
|
|
|
383
383
|
new_file_name, _uuid = get_uuid_filename(file_path)
|
|
384
384
|
logger.info(f"Generated new filename: {new_file_name}")
|
|
385
385
|
|
|
386
|
-
# Create model instance
|
|
387
|
-
raw_pdf = cls(
|
|
388
|
-
pdf_hash=pdf_hash,
|
|
389
|
-
center=center,
|
|
390
|
-
)
|
|
391
|
-
|
|
392
|
-
# Assign file using Django's File wrapper and save
|
|
386
|
+
# Create model instance via manager so creation can be intercepted/mocked during tests
|
|
393
387
|
try:
|
|
394
388
|
with file_path.open("rb") as f:
|
|
395
389
|
django_file = File(f, name=new_file_name)
|
|
396
|
-
raw_pdf
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
390
|
+
raw_pdf = cls.objects.create(
|
|
391
|
+
pdf_hash=pdf_hash,
|
|
392
|
+
center=center,
|
|
393
|
+
file=django_file,
|
|
394
|
+
)
|
|
395
|
+
|
|
396
|
+
_file = raw_pdf.file
|
|
397
|
+
assert _file is not None
|
|
398
|
+
logger.info(
|
|
399
|
+
"Created and saved new RawPdfFile %s with file %s", raw_pdf.pk, _file.name
|
|
400
|
+
)
|
|
401
|
+
|
|
402
|
+
if not _file.storage.exists(_file.name):
|
|
403
|
+
logger.error(
|
|
404
|
+
"File was not saved correctly to storage path %s after model save.",
|
|
405
|
+
_file.name,
|
|
406
|
+
)
|
|
407
|
+
raise IOError(
|
|
408
|
+
f"File not found at expected storage path after save: {_file.name}"
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
try:
|
|
412
|
+
logger.info("File saved to absolute path: %s", _file.path)
|
|
413
|
+
except NotImplementedError:
|
|
414
|
+
logger.info(
|
|
415
|
+
"File saved to storage path: %s (Absolute path not available from storage)",
|
|
416
|
+
_file.name,
|
|
417
|
+
)
|
|
412
418
|
|
|
413
419
|
except Exception as e:
|
|
414
|
-
logger.error(
|
|
415
|
-
|
|
416
|
-
raise # Re-raise the exception
|
|
420
|
+
logger.error("Error processing or saving file %s for new record: %s", file_path, e)
|
|
421
|
+
raise
|
|
417
422
|
|
|
418
423
|
# Delete source file *after* successful save and verification
|
|
419
424
|
if delete_source:
|
|
@@ -6,7 +6,8 @@ from typing import TYPE_CHECKING, Optional, Type
|
|
|
6
6
|
|
|
7
7
|
# Import the new exceptions from the correct path
|
|
8
8
|
from endoreg_db.exceptions import InsufficientStorageError, TranscodingError
|
|
9
|
-
from ...utils import VIDEO_DIR, TMP_VIDEO_DIR
|
|
9
|
+
from ...utils import VIDEO_DIR, TMP_VIDEO_DIR
|
|
10
|
+
from importlib import import_module
|
|
10
11
|
|
|
11
12
|
if TYPE_CHECKING:
|
|
12
13
|
from endoreg_db.models import VideoFile
|
|
@@ -170,6 +171,22 @@ def atomic_move_with_fallback(src_path: Path, dst_path: Path) -> bool:
|
|
|
170
171
|
raise
|
|
171
172
|
|
|
172
173
|
|
|
174
|
+
def _get_data_paths():
|
|
175
|
+
"""Return the current data_paths mapping (supports patched instances in tests)."""
|
|
176
|
+
utils_module = import_module("endoreg_db.utils")
|
|
177
|
+
return getattr(utils_module, "data_paths")
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _get_path(mapping, key, default):
|
|
181
|
+
"""Access mapping by key using __getitem__ so MagicMocks with side effects work."""
|
|
182
|
+
if mapping is None:
|
|
183
|
+
return default
|
|
184
|
+
try:
|
|
185
|
+
return mapping[key]
|
|
186
|
+
except (KeyError, TypeError):
|
|
187
|
+
return default
|
|
188
|
+
|
|
189
|
+
|
|
173
190
|
def _create_from_file(
|
|
174
191
|
cls_model: Type["VideoFile"],
|
|
175
192
|
file_path: Path,
|
|
@@ -199,8 +216,12 @@ def _create_from_file(
|
|
|
199
216
|
|
|
200
217
|
try:
|
|
201
218
|
# Ensure we operate under the canonical video path root
|
|
202
|
-
|
|
203
|
-
|
|
219
|
+
data_paths = _get_data_paths()
|
|
220
|
+
resolved_video_dir = _get_path(data_paths, "video", video_dir)
|
|
221
|
+
video_dir = Path(resolved_video_dir)
|
|
222
|
+
storage_root_default = Path(video_dir).parent
|
|
223
|
+
resolved_storage_root = _get_path(data_paths, "storage", storage_root_default)
|
|
224
|
+
storage_root = Path(resolved_storage_root)
|
|
204
225
|
storage_root.mkdir(parents=True, exist_ok=True)
|
|
205
226
|
|
|
206
227
|
# Check storage capacity before starting any work
|
|
@@ -300,7 +321,8 @@ def _create_from_file(
|
|
|
300
321
|
# 8. Create the VideoFile instance
|
|
301
322
|
logger.info("Creating new VideoFile instance with UUID: %s", uuid_val)
|
|
302
323
|
# Store FileField path relative to storage root including the videos prefix
|
|
303
|
-
|
|
324
|
+
storage_base = Path(_get_path(data_paths, "storage", final_storage_path.parent))
|
|
325
|
+
relative_name = (final_storage_path.relative_to(storage_base)).as_posix()
|
|
304
326
|
video = cls_model(
|
|
305
327
|
uuid=uuid_val,
|
|
306
328
|
raw_file=relative_name,
|
|
@@ -126,7 +126,7 @@ class VideoFile(models.Model):
|
|
|
126
126
|
|
|
127
127
|
sensitive_meta = models.OneToOneField(
|
|
128
128
|
"SensitiveMeta", on_delete=models.SET_NULL, null=True, blank=True, related_name="video_file"
|
|
129
|
-
)
|
|
129
|
+
) # type: ignore
|
|
130
130
|
center = models.ForeignKey("Center", on_delete=models.PROTECT)
|
|
131
131
|
processor = models.ForeignKey(
|
|
132
132
|
"EndoscopyProcessor", on_delete=models.PROTECT, blank=True, null=True
|
|
@@ -465,7 +465,18 @@ class VideoFile(models.Model):
|
|
|
465
465
|
# Use proper database connection
|
|
466
466
|
if using is None:
|
|
467
467
|
using = 'default'
|
|
468
|
-
|
|
468
|
+
|
|
469
|
+
raw_file_path = self.get_raw_file_path()
|
|
470
|
+
if raw_file_path:
|
|
471
|
+
raw_file_path = Path(raw_file_path)
|
|
472
|
+
lock_path = raw_file_path.with_suffix(raw_file_path.suffix + ".lock")
|
|
473
|
+
if lock_path.exists():
|
|
474
|
+
try:
|
|
475
|
+
lock_path.unlink()
|
|
476
|
+
logger.info(f"Removed processing lock: {lock_path}")
|
|
477
|
+
except Exception as e:
|
|
478
|
+
logger.warning(f"Could not remove processing lock {lock_path}: {e}")
|
|
479
|
+
|
|
469
480
|
try:
|
|
470
481
|
# Call parent delete with proper parameters
|
|
471
482
|
super().delete(using=using, keep_parents=keep_parents)
|
|
@@ -572,15 +583,28 @@ class VideoFile(models.Model):
|
|
|
572
583
|
super().save(*args, **kwargs)
|
|
573
584
|
|
|
574
585
|
def get_or_create_state(self) -> "VideoState":
|
|
575
|
-
"""
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
586
|
+
"""Ensure this video has a persisted ``VideoState`` and return it."""
|
|
587
|
+
|
|
588
|
+
state = self.state
|
|
589
|
+
|
|
590
|
+
# When tests reuse cached instances across database flushes, ``state`` may reference
|
|
591
|
+
# a row that no longer exists. Guard against that by validating persistence.
|
|
592
|
+
state_pk = getattr(state, "pk", None)
|
|
593
|
+
if state is not None and state_pk is not None:
|
|
594
|
+
if not VideoState.objects.filter(pk=state_pk).exists():
|
|
595
|
+
state = None
|
|
596
|
+
|
|
597
|
+
if state is None:
|
|
598
|
+
# Create a fresh state to avoid refresh_from_db() failures on unsaved instances.
|
|
599
|
+
state = VideoState.objects.create()
|
|
600
|
+
self.state = state
|
|
601
|
+
|
|
602
|
+
# Persist the relation immediately if the VideoFile already exists in the DB so
|
|
603
|
+
# later refreshes see the association without requiring additional saves.
|
|
604
|
+
if self.pk:
|
|
605
|
+
self.save(update_fields=["state"])
|
|
606
|
+
|
|
607
|
+
return state
|
|
584
608
|
|
|
585
609
|
def get_or_create_sensitive_meta(self) -> "SensitiveMeta":
|
|
586
610
|
"""
|
|
@@ -592,8 +616,7 @@ class VideoFile(models.Model):
|
|
|
592
616
|
from endoreg_db.models import SensitiveMeta
|
|
593
617
|
if self.sensitive_meta is None:
|
|
594
618
|
self.sensitive_meta = SensitiveMeta.objects.create(center = self.center)
|
|
595
|
-
#
|
|
596
|
-
self.get_or_create_state().mark_sensitive_meta_processed(save=True)
|
|
619
|
+
# Do not mark processed here; it will be set after extraction/validation steps
|
|
597
620
|
return self.sensitive_meta
|
|
598
621
|
|
|
599
622
|
def get_outside_segments(self, only_validated: bool = False) -> models.QuerySet["LabelVideoSegment"]:
|
|
@@ -12,6 +12,7 @@ from django.conf import settings
|
|
|
12
12
|
|
|
13
13
|
from endoreg_db.utils.hashs import get_video_hash
|
|
14
14
|
from endoreg_db.utils.validate_endo_roi import validate_endo_roi
|
|
15
|
+
from endoreg_db.utils.paths import STORAGE_DIR
|
|
15
16
|
from ....utils.video.ffmpeg_wrapper import assemble_video_from_frames
|
|
16
17
|
from ...utils import anonymize_frame # Import from models.utils
|
|
17
18
|
from .video_file_segments import _get_outside_frames, _get_outside_frame_numbers
|
|
@@ -268,7 +269,7 @@ def _anonymize(video: "VideoFile", delete_original_raw: bool = True) -> bool:
|
|
|
268
269
|
raise ValueError(f"Processed video hash {new_processed_hash} already exists for another video (Video: {video.uuid}).")
|
|
269
270
|
|
|
270
271
|
video.processed_video_hash = new_processed_hash
|
|
271
|
-
video.processed_file.name = video.get_target_anonymized_video_path().relative_to(
|
|
272
|
+
video.processed_file.name = video.get_target_anonymized_video_path().relative_to(STORAGE_DIR).as_posix()
|
|
272
273
|
|
|
273
274
|
update_fields = [
|
|
274
275
|
"processed_video_hash",
|
|
@@ -97,6 +97,7 @@ def _extract_frame_range(
|
|
|
97
97
|
return True # Indicate success as frames are considered present
|
|
98
98
|
|
|
99
99
|
frame_dir.mkdir(parents=True, exist_ok=True)
|
|
100
|
+
extracted_paths = []
|
|
100
101
|
|
|
101
102
|
try:
|
|
102
103
|
logger.info("Starting frame range extraction [%d, %d) for video %s to %s", start_frame, end_frame, video.uuid, frame_dir)
|
|
@@ -111,6 +112,17 @@ def _extract_frame_range(
|
|
|
111
112
|
|
|
112
113
|
return True
|
|
113
114
|
|
|
115
|
+
except FileNotFoundError as err:
|
|
116
|
+
logger.error(
|
|
117
|
+
"Frame range extraction [%d, %d) failed for video %s: %s",
|
|
118
|
+
start_frame,
|
|
119
|
+
end_frame,
|
|
120
|
+
video.uuid,
|
|
121
|
+
err,
|
|
122
|
+
exc_info=True,
|
|
123
|
+
)
|
|
124
|
+
raise
|
|
125
|
+
|
|
114
126
|
except Exception as e:
|
|
115
127
|
logger.error("Frame range extraction [%d, %d) or DB update failed for video %s: %s", start_frame, end_frame, video.uuid, e, exc_info=True)
|
|
116
128
|
|
|
@@ -32,13 +32,15 @@ def _get_raw_file_path(video: "VideoFile") -> Optional[Path]:
|
|
|
32
32
|
if sensitive_path.exists():
|
|
33
33
|
return sensitive_path.resolve()
|
|
34
34
|
|
|
35
|
+
# Check direct raw_file.path if available
|
|
35
36
|
# Check direct raw_file.path if available
|
|
36
37
|
try:
|
|
37
38
|
direct_path = Path(video.raw_file.path)
|
|
38
39
|
if direct_path.exists():
|
|
39
40
|
return direct_path.resolve()
|
|
40
|
-
except Exception:
|
|
41
|
-
|
|
41
|
+
except Exception as e:
|
|
42
|
+
logger.debug("Could not access direct raw_file.path for video %s: %s", video.uuid, e)
|
|
43
|
+
# Fallback to checking alternative paths
|
|
42
44
|
|
|
43
45
|
# Check common alternative paths
|
|
44
46
|
alternative_paths = [
|
|
@@ -13,7 +13,7 @@ else:
|
|
|
13
13
|
ENDOREG_CENTER_ID = settings.ENDOREG_CENTER_ID
|
|
14
14
|
|
|
15
15
|
# Import the new utility function
|
|
16
|
-
from ...utils.video
|
|
16
|
+
from ...utils.video import ffmpeg_wrapper
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
19
19
|
|
|
@@ -214,7 +214,7 @@ class FFMpegMeta(models.Model):
|
|
|
214
214
|
"""
|
|
215
215
|
logger.info("Running ffprobe on %s", file_path)
|
|
216
216
|
try:
|
|
217
|
-
probe_data = get_stream_info(file_path) # Use the new utility
|
|
217
|
+
probe_data = ffmpeg_wrapper.get_stream_info(file_path) # Use the new utility
|
|
218
218
|
except Exception as probe_err:
|
|
219
219
|
logger.error("ffprobe execution failed for %s: %s", file_path, probe_err, exc_info=True)
|
|
220
220
|
raise RuntimeError(f"ffprobe execution failed for {file_path}") from probe_err
|
|
@@ -5,6 +5,7 @@ Provides high-level functions for importing and anonymizing PDF files,
|
|
|
5
5
|
combining RawPdfFile creation with text extraction and anonymization.
|
|
6
6
|
"""
|
|
7
7
|
from datetime import date, datetime
|
|
8
|
+
import errno
|
|
8
9
|
import logging
|
|
9
10
|
import shutil
|
|
10
11
|
import sys
|
|
@@ -13,12 +14,11 @@ import hashlib
|
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from typing import TYPE_CHECKING, Union
|
|
15
16
|
from contextlib import contextmanager
|
|
16
|
-
from django.conf.locale import tr
|
|
17
17
|
from django.db import transaction
|
|
18
18
|
from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
|
|
19
19
|
from endoreg_db.models.state.raw_pdf import RawPdfState
|
|
20
20
|
from endoreg_db.models import SensitiveMeta
|
|
21
|
-
from endoreg_db.utils
|
|
21
|
+
from endoreg_db.utils import paths as path_utils
|
|
22
22
|
import time
|
|
23
23
|
|
|
24
24
|
logger = logging.getLogger(__name__)
|
|
@@ -111,14 +111,44 @@ class PdfImportService:
|
|
|
111
111
|
break
|
|
112
112
|
h.update(b)
|
|
113
113
|
return h.hexdigest()
|
|
114
|
+
|
|
115
|
+
def _get_pdf_dir(self) -> Path | None:
|
|
116
|
+
"""Resolve the configured PDF directory to a concrete Path."""
|
|
117
|
+
candidate = getattr(path_utils, "PDF_DIR", None)
|
|
118
|
+
if isinstance(candidate, Path):
|
|
119
|
+
return candidate
|
|
120
|
+
if candidate is None:
|
|
121
|
+
return None
|
|
122
|
+
try:
|
|
123
|
+
derived = candidate / "."
|
|
124
|
+
except Exception:
|
|
125
|
+
derived = None
|
|
126
|
+
|
|
127
|
+
if derived is not None:
|
|
128
|
+
try:
|
|
129
|
+
return Path(derived)
|
|
130
|
+
except Exception:
|
|
131
|
+
return None
|
|
132
|
+
|
|
133
|
+
try:
|
|
134
|
+
return Path(str(candidate))
|
|
135
|
+
except Exception:
|
|
136
|
+
return None
|
|
114
137
|
|
|
115
138
|
def _quarantine(self, source: Path) -> Path:
|
|
116
139
|
"""Move file to quarantine directory to prevent re-processing."""
|
|
117
|
-
qdir = PDF_DIR / "_processing"
|
|
140
|
+
qdir = path_utils.PDF_DIR / "_processing"
|
|
118
141
|
qdir.mkdir(parents=True, exist_ok=True)
|
|
119
142
|
target = qdir / source.name
|
|
120
|
-
|
|
121
|
-
|
|
143
|
+
try:
|
|
144
|
+
# Try atomic rename first (fastest when on same filesystem)
|
|
145
|
+
source.rename(target)
|
|
146
|
+
except OSError as exc:
|
|
147
|
+
if exc.errno == errno.EXDEV:
|
|
148
|
+
# Cross-device move, fall back to shutil.move which copies+removes
|
|
149
|
+
shutil.move(str(source), str(target))
|
|
150
|
+
else:
|
|
151
|
+
raise
|
|
122
152
|
return target
|
|
123
153
|
|
|
124
154
|
def _ensure_state(self, pdf_file: "RawPdfFile"):
|
|
@@ -287,6 +317,7 @@ class PdfImportService:
|
|
|
287
317
|
"""Initialize the processing context for the current PDF."""
|
|
288
318
|
self.processing_context = {
|
|
289
319
|
'file_path': Path(file_path),
|
|
320
|
+
'original_file_path': Path(file_path),
|
|
290
321
|
'center_name': center_name,
|
|
291
322
|
'delete_source': delete_source,
|
|
292
323
|
'retry': retry,
|
|
@@ -379,11 +410,18 @@ class PdfImportService:
|
|
|
379
410
|
|
|
380
411
|
def _setup_processing_environment(self):
|
|
381
412
|
"""Setup processing environment and state."""
|
|
413
|
+
original_path = self.processing_context.get('file_path')
|
|
414
|
+
|
|
382
415
|
# Create sensitive file copy
|
|
383
|
-
self.create_sensitive_file(self.current_pdf,
|
|
416
|
+
self.create_sensitive_file(self.current_pdf, original_path)
|
|
384
417
|
|
|
385
418
|
# Update file path to point to sensitive copy
|
|
386
419
|
self.processing_context['file_path'] = self.current_pdf.file.path
|
|
420
|
+
self.processing_context['sensitive_copy_created'] = True
|
|
421
|
+
try:
|
|
422
|
+
self.processing_context['sensitive_file_path'] = Path(self.current_pdf.file.path)
|
|
423
|
+
except Exception:
|
|
424
|
+
self.processing_context['sensitive_file_path'] = None
|
|
387
425
|
|
|
388
426
|
# Ensure state exists
|
|
389
427
|
state = self.current_pdf.get_or_create_state()
|
|
@@ -415,14 +453,14 @@ class PdfImportService:
|
|
|
415
453
|
logger.info("Starting text extraction and metadata processing with ReportReader...")
|
|
416
454
|
|
|
417
455
|
# Setup output directories
|
|
418
|
-
crops_dir = PDF_DIR / 'cropped_regions'
|
|
419
|
-
anonymized_dir = PDF_DIR / 'anonymized'
|
|
456
|
+
crops_dir = path_utils.PDF_DIR / 'cropped_regions'
|
|
457
|
+
anonymized_dir = path_utils.PDF_DIR / 'anonymized'
|
|
420
458
|
crops_dir.mkdir(parents=True, exist_ok=True)
|
|
421
459
|
anonymized_dir.mkdir(parents=True, exist_ok=True)
|
|
422
460
|
|
|
423
461
|
# Initialize ReportReader
|
|
424
462
|
report_reader = ReportReader(
|
|
425
|
-
report_root_path=STORAGE_DIR,
|
|
463
|
+
report_root_path=str(path_utils.STORAGE_DIR),
|
|
426
464
|
locale="de_DE",
|
|
427
465
|
text_date_format="%d.%m.%Y"
|
|
428
466
|
)
|
|
@@ -603,7 +641,7 @@ class PdfImportService:
|
|
|
603
641
|
try:
|
|
604
642
|
# Prefer storing a path relative to STORAGE_DIR so Django serves it correctly
|
|
605
643
|
try:
|
|
606
|
-
relative_name = str(anonymized_path.relative_to(STORAGE_DIR))
|
|
644
|
+
relative_name = str(anonymized_path.relative_to(path_utils.STORAGE_DIR))
|
|
607
645
|
except ValueError:
|
|
608
646
|
# Fallback to absolute path if the file lives outside STORAGE_DIR
|
|
609
647
|
relative_name = str(anonymized_path)
|
|
@@ -717,18 +755,96 @@ class PdfImportService:
|
|
|
717
755
|
except Exception as e:
|
|
718
756
|
logger.warning(f"Error during cleanup: {e}")
|
|
719
757
|
finally:
|
|
758
|
+
# Remove any sensitive copy created during this processing run
|
|
759
|
+
sensitive_created = self.processing_context.get('sensitive_copy_created')
|
|
760
|
+
if sensitive_created:
|
|
761
|
+
pdf_obj = self.current_pdf
|
|
762
|
+
try:
|
|
763
|
+
if pdf_obj:
|
|
764
|
+
file_field = getattr(pdf_obj, "file", None)
|
|
765
|
+
if file_field and getattr(file_field, "name", None):
|
|
766
|
+
storage_name = file_field.name
|
|
767
|
+
file_field.delete(save=False)
|
|
768
|
+
logger.debug("Deleted sensitive copy %s during error cleanup", storage_name)
|
|
769
|
+
except Exception as cleanup_exc:
|
|
770
|
+
logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
|
|
771
|
+
|
|
720
772
|
# Always clean up processed files set to prevent blocks
|
|
721
773
|
file_path = self.processing_context.get('file_path')
|
|
722
774
|
if file_path and str(file_path) in self.processed_files:
|
|
723
775
|
self.processed_files.remove(str(file_path))
|
|
724
776
|
logger.debug(f"Removed {file_path} from processed files during error cleanup")
|
|
725
777
|
|
|
778
|
+
try:
|
|
779
|
+
original_path = self.processing_context.get('original_file_path')
|
|
780
|
+
logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
|
|
781
|
+
raw_dir = original_path.parent if isinstance(original_path, Path) else None
|
|
782
|
+
if (
|
|
783
|
+
isinstance(original_path, Path)
|
|
784
|
+
and original_path.exists()
|
|
785
|
+
and not self.processing_context.get('sensitive_copy_created')
|
|
786
|
+
):
|
|
787
|
+
try:
|
|
788
|
+
original_path.unlink()
|
|
789
|
+
logger.info("Removed original file %s during error cleanup", original_path)
|
|
790
|
+
except Exception as remove_exc:
|
|
791
|
+
logger.warning("Could not remove original file %s during error cleanup: %s", original_path, remove_exc)
|
|
792
|
+
pdf_dir = self._get_pdf_dir()
|
|
793
|
+
if not pdf_dir and raw_dir:
|
|
794
|
+
base_dir = raw_dir.parent
|
|
795
|
+
dir_name = getattr(path_utils, "PDF_DIR_NAME", "pdfs")
|
|
796
|
+
fallback_pdf_dir = base_dir / dir_name
|
|
797
|
+
logger.debug(
|
|
798
|
+
"PDF cleanup fallback resolution - base: %s, dir_name: %s, exists: %s",
|
|
799
|
+
base_dir,
|
|
800
|
+
dir_name,
|
|
801
|
+
fallback_pdf_dir.exists(),
|
|
802
|
+
)
|
|
803
|
+
if fallback_pdf_dir.exists():
|
|
804
|
+
pdf_dir = fallback_pdf_dir
|
|
805
|
+
|
|
806
|
+
# Remove empty PDF subdirectories that might have been created during setup
|
|
807
|
+
if pdf_dir and pdf_dir.exists():
|
|
808
|
+
for subdir_name in ("sensitive", "cropped_regions", "anonymized", "_processing"):
|
|
809
|
+
subdir_path = pdf_dir / subdir_name
|
|
810
|
+
if subdir_path.exists() and subdir_path.is_dir():
|
|
811
|
+
try:
|
|
812
|
+
next(subdir_path.iterdir())
|
|
813
|
+
except StopIteration:
|
|
814
|
+
try:
|
|
815
|
+
subdir_path.rmdir()
|
|
816
|
+
logger.debug("Removed empty directory %s during error cleanup", subdir_path)
|
|
817
|
+
except OSError as rm_err:
|
|
818
|
+
logger.debug("Could not remove directory %s: %s", subdir_path, rm_err)
|
|
819
|
+
except Exception as iter_err:
|
|
820
|
+
logger.debug("Could not inspect directory %s: %s", subdir_path, iter_err)
|
|
821
|
+
|
|
822
|
+
raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
|
|
823
|
+
pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
|
|
824
|
+
|
|
825
|
+
sensitive_path = self.processing_context.get('sensitive_file_path')
|
|
826
|
+
if sensitive_path:
|
|
827
|
+
sensitive_parent = Path(sensitive_path).parent
|
|
828
|
+
sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
|
|
829
|
+
else:
|
|
830
|
+
sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
|
|
831
|
+
sensitive_count = len(list(sensitive_dir.glob("*"))) if sensitive_dir and sensitive_dir.exists() else None
|
|
832
|
+
|
|
833
|
+
logger.info(
|
|
834
|
+
"PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
|
|
835
|
+
raw_count,
|
|
836
|
+
pdf_count,
|
|
837
|
+
sensitive_count,
|
|
838
|
+
)
|
|
839
|
+
except Exception:
|
|
840
|
+
pass
|
|
841
|
+
|
|
726
842
|
def _cleanup_processing_context(self):
|
|
727
843
|
"""Cleanup processing context."""
|
|
728
844
|
try:
|
|
729
845
|
# Clean up temporary directories
|
|
730
846
|
if self.processing_context.get('text_extracted'):
|
|
731
|
-
crops_dir = PDF_DIR / 'cropped_regions'
|
|
847
|
+
crops_dir = path_utils.PDF_DIR / 'cropped_regions'
|
|
732
848
|
if crops_dir.exists() and not any(crops_dir.iterdir()):
|
|
733
849
|
crops_dir.rmdir()
|
|
734
850
|
|
|
@@ -857,7 +973,7 @@ class PdfImportService:
|
|
|
857
973
|
if not source_path:
|
|
858
974
|
raise ValueError("No file path available for creating sensitive file")
|
|
859
975
|
|
|
860
|
-
SENSITIVE_DIR = PDF_DIR / "sensitive"
|
|
976
|
+
SENSITIVE_DIR = path_utils.PDF_DIR / "sensitive"
|
|
861
977
|
target = SENSITIVE_DIR / f"{pdf_file.pdf_hash}.pdf"
|
|
862
978
|
|
|
863
979
|
try:
|
|
@@ -880,7 +996,7 @@ class PdfImportService:
|
|
|
880
996
|
# Update FileField to reference the file under STORAGE_DIR
|
|
881
997
|
# We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
|
|
882
998
|
try:
|
|
883
|
-
relative_name = str(target.relative_to(STORAGE_DIR))
|
|
999
|
+
relative_name = str(target.relative_to(path_utils.STORAGE_DIR)) # Point Django FileField to sensitive storage
|
|
884
1000
|
except ValueError:
|
|
885
1001
|
# Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
|
|
886
1002
|
relative_name = str(target)
|
|
@@ -934,7 +1050,7 @@ class PdfImportService:
|
|
|
934
1050
|
if pdf_problematic:
|
|
935
1051
|
# Quarantine the file
|
|
936
1052
|
logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
|
|
937
|
-
quarantine_dir = PDF_DIR / "quarantine"
|
|
1053
|
+
quarantine_dir = path_utils.PDF_DIR / "quarantine"
|
|
938
1054
|
os.makedirs(quarantine_dir, exist_ok=True)
|
|
939
1055
|
|
|
940
1056
|
quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
|
|
@@ -950,7 +1066,7 @@ class PdfImportService:
|
|
|
950
1066
|
else:
|
|
951
1067
|
# Archive the file normally
|
|
952
1068
|
logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
|
|
953
|
-
archive_dir = PDF_DIR / "processed"
|
|
1069
|
+
archive_dir = path_utils.PDF_DIR / "processed"
|
|
954
1070
|
os.makedirs(archive_dir, exist_ok=True)
|
|
955
1071
|
|
|
956
1072
|
archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
|