endoreg-db 0.8.4.4__py3-none-any.whl → 0.8.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of endoreg-db might be problematic.
- endoreg_db/management/commands/load_ai_model_data.py +2 -1
- endoreg_db/management/commands/setup_endoreg_db.py +11 -7
- endoreg_db/models/media/pdf/raw_pdf.py +241 -97
- endoreg_db/models/media/video/pipe_1.py +30 -33
- endoreg_db/models/media/video/video_file.py +300 -187
- endoreg_db/models/metadata/model_meta_logic.py +15 -1
- endoreg_db/models/metadata/sensitive_meta_logic.py +391 -70
- endoreg_db/serializers/__init__.py +26 -55
- endoreg_db/serializers/misc/__init__.py +1 -1
- endoreg_db/serializers/misc/file_overview.py +65 -35
- endoreg_db/serializers/misc/{vop_patient_data.py → sensitive_patient_data.py} +1 -1
- endoreg_db/serializers/video_examination.py +198 -0
- endoreg_db/services/lookup_service.py +228 -58
- endoreg_db/services/lookup_store.py +174 -30
- endoreg_db/services/pdf_import.py +585 -282
- endoreg_db/services/video_import.py +340 -101
- endoreg_db/urls/__init__.py +36 -23
- endoreg_db/urls/label_video_segments.py +2 -0
- endoreg_db/urls/media.py +3 -2
- endoreg_db/views/__init__.py +6 -3
- endoreg_db/views/media/pdf_media.py +3 -1
- endoreg_db/views/media/video_media.py +1 -1
- endoreg_db/views/media/video_segments.py +187 -259
- endoreg_db/views/pdf/__init__.py +5 -8
- endoreg_db/views/pdf/pdf_stream.py +187 -0
- endoreg_db/views/pdf/reimport.py +110 -94
- endoreg_db/views/requirement/lookup.py +171 -287
- endoreg_db/views/video/__init__.py +0 -2
- endoreg_db/views/video/video_examination_viewset.py +202 -289
- {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.6.1.dist-info}/METADATA +1 -1
- {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.6.1.dist-info}/RECORD +33 -34
- endoreg_db/views/pdf/pdf_media.py +0 -239
- endoreg_db/views/pdf/pdf_stream_views.py +0 -127
- endoreg_db/views/video/video_media.py +0 -158
- {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.6.1.dist-info}/WHEEL +0 -0
- {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.6.1.dist-info}/licenses/LICENSE +0 -0
endoreg_db/services/pdf_import.py

@@ -4,22 +4,25 @@ PDF import service module.
 Provides high-level functions for importing and anonymizing PDF files,
 combining RawPdfFile creation with text extraction and anonymization.
 """
-
+
 import errno
+import hashlib
 import logging
+import os
 import shutil
 import sys
-import
-import
+import time
+from contextlib import contextmanager
+from datetime import date, datetime
 from pathlib import Path
 from typing import TYPE_CHECKING, Union
-
+
 from django.db import transaction
+
+from endoreg_db.models import SensitiveMeta
 from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
 from endoreg_db.models.state.raw_pdf import RawPdfState
-from endoreg_db.models import SensitiveMeta
 from endoreg_db.utils import paths as path_utils
-import time

 logger = logging.getLogger(__name__)

@@ -34,24 +37,69 @@ class PdfImportService:
     """
     Service class for importing and processing PDF files with text extraction and anonymization.
     Uses a central PDF instance pattern for cleaner state management.
+
+    Supports two processing modes:
+    - 'blackening': Simple PDF masking with black rectangles over sensitive areas
+    - 'cropping': Advanced mode that crops sensitive regions to separate images
     """
-
-    def __init__(
+
+    def __init__(
+        self, allow_meta_overwrite: bool = False, processing_mode: str = "blackening"
+    ):
         """
         Initialize the PDF import service.
-
+
         Args:
             allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
+            processing_mode: Processing mode - 'blackening' for simple masking, 'cropping' for advanced cropping
         """
         self.processed_files = set()
         self._report_reader_available = None
         self._report_reader_class = None
         self.allow_meta_overwrite = allow_meta_overwrite
-
+
+        # Validate and set processing mode
+        valid_modes = ["blackening", "cropping"]
+        if processing_mode not in valid_modes:
+            raise ValueError(
+                f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}"
+            )
+        self.processing_mode = processing_mode
+
         # Central PDF instance management
         self.current_pdf = None
         self.processing_context = {}
-
+
+    @classmethod
+    def with_blackening(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
+        """
+        Create a PdfImportService configured for simple PDF blackening mode.
+
+        Args:
+            allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
+
+        Returns:
+            PdfImportService instance configured for blackening mode
+        """
+        return cls(
+            allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening"
+        )
+
+    @classmethod
+    def with_cropping(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
+        """
+        Create a PdfImportService configured for advanced cropping mode.
+
+        Args:
+            allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
+
+        Returns:
+            PdfImportService instance configured for cropping mode
+        """
+        return cls(
+            allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping"
+        )
+
     @contextmanager
     def _file_lock(self, path: Path):
         """Create a file lock to prevent duplicate processing.
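The two classmethods above are thin factories over __init__. A minimal usage sketch under assumed inputs (the file path and center name below are hypothetical; import_and_anonymize is defined later in this diff):

from endoreg_db.services.pdf_import import PdfImportService

# Blackening mode: masks sensitive regions with black rectangles.
service = PdfImportService.with_blackening(allow_meta_overwrite=False)

# Cropping mode would be: service = PdfImportService.with_cropping()

pdf = service.import_and_anonymize(
    file_path="/data/raw_pdfs/report_0001.pdf",  # hypothetical path
    center_name="university_hospital_wuerzburg",
)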
@@ -77,11 +125,14 @@ class PdfImportService:
             try:
                 logger.warning(
                     "Stale lock detected for %s (age %.0fs). Reclaiming lock...",
-                    path,
+                    path,
+                    age,
                 )
                 lock_path.unlink()
             except Exception as e:
-                logger.warning(
+                logger.warning(
+                    "Failed to remove stale lock %s: %s", lock_path, e
+                )
             # retry acquire
             fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
         else:
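The lock relies on os.open with O_CREAT | O_EXCL, which fails atomically when the lock file already exists; a lock older than some threshold is unlinked and the acquisition retried. A self-contained sketch of that pattern, assuming a 600-second staleness threshold (the service's actual threshold is not visible in this hunk):

import os
import time
from contextlib import contextmanager
from pathlib import Path

STALE_AFTER = 600  # seconds; assumed threshold, not taken from the diff

@contextmanager
def file_lock(path: Path):
    lock_path = path.with_suffix(path.suffix + ".lock")
    try:
        # O_CREAT | O_EXCL makes creation atomic: exactly one process wins.
        fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
    except FileExistsError:
        age = time.time() - lock_path.stat().st_mtime
        if age <= STALE_AFTER:
            raise  # another worker is actively processing this file
        lock_path.unlink()  # reclaim the stale lock and retry once
        fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
    try:
        yield
    finally:
        os.close(fd)
        lock_path.unlink(missing_ok=True)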
@@ -100,7 +151,7 @@ class PdfImportService:
                 lock_path.unlink()
             except OSError:
                 pass
-
+
     def _sha256(self, path: Path, chunk: int = 1024 * 1024) -> str:
         """Compute SHA256 hash of a file."""
         h = hashlib.sha256()
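_sha256 hashes the file in 1 MiB chunks so large PDFs never need to fit in memory. A minimal equivalent of the signature shown above:

import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk: int = 1024 * 1024) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        # Read fixed-size chunks until EOF; keeps memory flat for large files.
        for block in iter(lambda: f.read(chunk), b""):
            h.update(block)
    return h.hexdigest()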
@@ -134,7 +185,7 @@ class PdfImportService:
             return Path(str(candidate))
         except Exception:
             return None
-
+
     def _quarantine(self, source: Path) -> Path:
         """Move file to quarantine directory to prevent re-processing."""
         qdir = path_utils.PDF_DIR / "_processing"
@@ -150,7 +201,7 @@ class PdfImportService:
             else:
                 raise
         return target
-
+
     def _ensure_state(self, pdf_file: "RawPdfFile"):
         """Ensure PDF file has a state object."""
         if getattr(pdf_file, "state", None):
@@ -166,134 +217,156 @@ class PdfImportService:
             return state
         except Exception:
             return None
-
+
     def _ensure_report_reading_available(self):
         """
         Ensure report reading modules are available by adding lx-anonymizer to path.
-
+
         Returns:
             Tuple of (availability_flag, ReportReader_class)
         """
         if self._report_reader_available is not None:
             return self._report_reader_available, self._report_reader_class
-
+
         try:
             # Try direct import first
             from lx_anonymizer import ReportReader
-
+
             logger.info("Successfully imported lx_anonymizer ReportReader module")
             self._report_reader_available = True
             self._report_reader_class = ReportReader
             return True, ReportReader
-
+
         except ImportError:
             # Optional: honor LX_ANONYMIZER_PATH=/abs/path/to/src
             import importlib
+
             extra = os.getenv("LX_ANONYMIZER_PATH")
             if extra and extra not in sys.path and Path(extra).exists():
                 sys.path.insert(0, extra)
                 try:
                     mod = importlib.import_module("lx_anonymizer")
                     ReportReader = getattr(mod, "ReportReader")
-                    logger.info(
+                    logger.info(
+                        "Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH"
+                    )
                     self._report_reader_available = True
                     self._report_reader_class = ReportReader
                     return True, ReportReader
                 except Exception as e:
-                    logger.warning(
+                    logger.warning(
+                        "Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e
+                    )
                 finally:
                     # Keep path for future imports if it worked; otherwise remove.
                     if "ReportReader" not in locals() and extra in sys.path:
                         sys.path.remove(extra)
-
+
             self._report_reader_available = False
             self._report_reader_class = None
             return False, None

-
     def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile" = None) -> None:
         """
         Ensure PDF has minimum required patient data in SensitiveMeta.
         Creates default values if data is missing after text processing.
         Uses the central PDF instance if no specific instance provided.
-
+
         Args:
             pdf_instance: Optional specific PDF instance, defaults to self.current_pdf
         """
         pdf_file = pdf_instance or self.current_pdf
         if not pdf_file:
-            logger.warning(
+            logger.warning(
+                "No PDF instance available for ensuring default patient data"
+            )
             return
-
+
         if not pdf_file.sensitive_meta:
-            logger.info(
-
+            logger.info(
+                f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default"
+            )
+
             # Create default SensitiveMeta with placeholder data
             default_data = {
                 "patient_first_name": "Patient",
-                "patient_last_name": "Unknown",
+                "patient_last_name": "Unknown",
                 "patient_dob": date(1990, 1, 1),  # Default DOB
                 "examination_date": date.today(),
-                "center_name": pdf_file.center.name
+                "center_name": pdf_file.center.name
+                if pdf_file.center
+                else "university_hospital_wuerzburg",
             }
-
+
             try:
                 sensitive_meta = SensitiveMeta.create_from_dict(default_data)
                 pdf_file.sensitive_meta = sensitive_meta
-                pdf_file.save(update_fields=[
-                logger.info(
+                pdf_file.save(update_fields=["sensitive_meta"])
+                logger.info(
+                    f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}"
+                )
             except Exception as e:
-                logger.error(
+                logger.error(
+                    f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}"
+                )

     def import_and_anonymize(
-        self,
-        file_path: Union[Path, str],
-        center_name: str,
+        self,
+        file_path: Union[Path, str],
+        center_name: str,
         delete_source: bool = False,
         retry: bool = False,
     ) -> "RawPdfFile":
         """
         Import a PDF file and anonymize it using ReportReader.
         Uses centralized PDF instance management pattern.
-
+
+        The processing mode is determined by the service initialization:
+        - 'blackening': Creates an anonymized PDF with black rectangles over sensitive regions
+        - 'cropping': Advanced mode that crops sensitive regions to separate images
+
         Args:
             file_path: Path to the PDF file to import
             center_name: Name of the center to associate with PDF
             delete_source: Whether to delete the source file after import
             retry: Whether this is a retry attempt
-
+
         Returns:
             RawPdfFile instance after import and processing
-
+
         Raises:
             Exception: On any failure during import or processing
         """
         try:
             # Initialize processing context
-            self._initialize_processing_context(
-
+            self._initialize_processing_context(
+                file_path, center_name, delete_source, retry
+            )
+
             # Step 1: Validate and prepare file
             self._validate_and_prepare_file()
-
+
             # Step 2: Create or retrieve PDF instance
             self._create_or_retrieve_pdf_instance()
-
+
             # Early return check - if no PDF instance was created, return None
             if not self.current_pdf:
-                logger.warning(
+                logger.warning(
+                    f"No PDF instance created for {file_path}, returning None"
+                )
                 return None
-
+
             # Step 3: Setup processing environment
             self._setup_processing_environment()
-
+
             # Step 4: Process text and metadata
             self._process_text_and_metadata()
-
+
             # Step 5: Finalize processing
             self._finalize_processing()
-
+
             return self.current_pdf
-
+
         except ValueError as e:
             # Handle "File already being processed" case specifically
             if "already being processed" in str(e):
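_ensure_report_reading_available tries a direct import first, falls back to a path named by the LX_ANONYMIZER_PATH environment variable, and caches the outcome so the probe runs once. A stripped-down sketch of the same probe (error handling and logging reduced for brevity):

import importlib
import os
import sys
from pathlib import Path

_cached = None  # (available, cls) after the first probe

def ensure_report_reader():
    global _cached
    if _cached is not None:
        return _cached
    try:
        from lx_anonymizer import ReportReader  # direct import first
        _cached = (True, ReportReader)
    except ImportError:
        extra = os.getenv("LX_ANONYMIZER_PATH")
        if extra and extra not in sys.path and Path(extra).exists():
            sys.path.insert(0, extra)  # honor an explicit source checkout
            try:
                mod = importlib.import_module("lx_anonymizer")
                _cached = (True, getattr(mod, "ReportReader"))
                return _cached
            except Exception:
                sys.path.remove(extra)  # probe failed; undo the path edit
        _cached = (False, None)
    return _cached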
@@ -312,50 +385,57 @@ class PdfImportService:
             # Always cleanup context
             self._cleanup_processing_context()

-    def _initialize_processing_context(
-
+    def _initialize_processing_context(
+        self,
+        file_path: Union[Path, str],
+        center_name: str,
+        delete_source: bool,
+        retry: bool,
+    ):
         """Initialize the processing context for the current PDF."""
         self.processing_context = {
-
-
-
-
-
-
-
-
-
-
+            "file_path": Path(file_path),
+            "original_file_path": Path(file_path),
+            "center_name": center_name,
+            "delete_source": delete_source,
+            "retry": retry,
+            "file_hash": None,
+            "processing_started": False,
+            "text_extracted": False,
+            "metadata_processed": False,
+            "anonymization_completed": False,
         }
-
+
         # Check if already processed (only during current session to prevent race conditions)
         if str(file_path) in self.processed_files:
-            logger.info(
+            logger.info(
+                f"File {file_path} already being processed in current session, skipping"
+            )
             raise ValueError("File already being processed")
-
+
         logger.info(f"Starting import and processing for: {file_path}")

     def _validate_and_prepare_file(self):
         """Validate file existence and calculate hash."""
-        file_path = self.processing_context[
-
+        file_path = self.processing_context["file_path"]
+
         if not file_path.exists():
             raise FileNotFoundError(f"PDF file not found: {file_path}")
-
+
         try:
-            self.processing_context[
+            self.processing_context["file_hash"] = self._sha256(file_path)
         except Exception as e:
             logger.warning(f"Could not calculate file hash: {e}")
-            self.processing_context[
+            self.processing_context["file_hash"] = None

     def _create_or_retrieve_pdf_instance(self):
         """Create new or retrieve existing PDF instance."""
-        file_path = self.processing_context[
-        center_name = self.processing_context[
-        delete_source = self.processing_context[
-        retry = self.processing_context[
-        file_hash = self.processing_context[
-
+        file_path = self.processing_context["file_path"]
+        center_name = self.processing_context["center_name"]
+        delete_source = self.processing_context["delete_source"]
+        retry = self.processing_context["retry"]
+        file_hash = self.processing_context["file_hash"]
+
         if not retry:
             # Check for existing PDF and handle duplicates
             with self._file_lock(file_path):
@@ -366,18 +446,20 @@ class PdfImportService:
                 if existing:
                     logger.info(f"Found existing RawPdfFile {existing.pdf_hash}")
                     if existing.text:
-                        logger.info(
+                        logger.info(
+                            f"Existing PDF {existing.pdf_hash} already processed - returning"
+                        )
                         self.current_pdf = existing
                         return
                     else:
                         # Retry processing
                         logger.info(f"Reprocessing existing PDF {existing.pdf_hash}")
                         return self._retry_existing_pdf(existing)
-
+
         # Create new PDF instance
         logger.info("Creating new RawPdfFile instance...")
         from django.db import IntegrityError
-
+
         try:
             if not retry:
                 self.current_pdf = RawPdfFile.create_from_file_initialized(
@@ -388,18 +470,22 @@ class PdfImportService:
             else:
                 # Retrieve existing for retry
                 self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
-                logger.info(
-
+                logger.info(
+                    f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}"
+                )
+
                 # Check if retry is actually needed
                 if self.current_pdf.text:
-                    logger.info(
+                    logger.info(
+                        f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning"
+                    )
                     return
-
+
             if not self.current_pdf:
                 raise RuntimeError("Failed to create RawPdfFile instance")
-
+
             logger.info(f"PDF instance ready: {self.current_pdf.pdf_hash}")
-
+
         except IntegrityError:
             # Race condition - another worker created it
             if file_hash:
@@ -410,27 +496,29 @@ class PdfImportService:

     def _setup_processing_environment(self):
         """Setup processing environment and state."""
-        original_path = self.processing_context.get(
+        original_path = self.processing_context.get("file_path")

         # Create sensitive file copy
         self.create_sensitive_file(self.current_pdf, original_path)
-
+
         # Update file path to point to sensitive copy
-        self.processing_context[
-        self.processing_context[
+        self.processing_context["file_path"] = self.current_pdf.file.path
+        self.processing_context["sensitive_copy_created"] = True
         try:
-            self.processing_context[
+            self.processing_context["sensitive_file_path"] = Path(
+                self.current_pdf.file.path
+            )
         except Exception:
-            self.processing_context[
-
+            self.processing_context["sensitive_file_path"] = None
+
         # Ensure state exists
         state = self.current_pdf.get_or_create_state()
         state.mark_processing_started()
-        self.processing_context[
-
+        self.processing_context["processing_started"] = True
+
         # Mark as processed to prevent duplicates
-        self.processed_files.add(str(self.processing_context[
-
+        self.processed_files.add(str(self.processing_context["file_path"]))
+
         # Ensure default patient data
         logger.info("Ensuring default patient data...")
         self._ensure_default_patient_data(self.current_pdf)
@@ -438,83 +526,154 @@ class PdfImportService:
     def _process_text_and_metadata(self):
         """Process text extraction and metadata using ReportReader."""
         report_reading_available, ReportReader = self._ensure_report_reading_available()
-
+
         if not report_reading_available:
             logger.warning("Report reading not available (lx_anonymizer not found)")
             self._mark_processing_incomplete("no_report_reader")
             return
-
+
         if not self.current_pdf.file:
             logger.warning("No file available for text processing")
             self._mark_processing_incomplete("no_file")
             return
-
+
         try:
-            logger.info(
-
-
-            crops_dir = path_utils.PDF_DIR / 'cropped_regions'
-            anonymized_dir = path_utils.PDF_DIR / 'anonymized'
-            crops_dir.mkdir(parents=True, exist_ok=True)
-            anonymized_dir.mkdir(parents=True, exist_ok=True)
+            logger.info(
+                f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})..."
+            )

             # Initialize ReportReader
             report_reader = ReportReader(
                 report_root_path=str(path_utils.STORAGE_DIR),
                 locale="de_DE",
-                text_date_format="%d.%m.%Y"
+                text_date_format="%d.%m.%Y",
             )

-
-
-
-
-
-
-
-
-            # Store results in context
-            self.processing_context.update({
-                'original_text': original_text,
-                'anonymized_text': anonymized_text,
-                'extracted_metadata': extracted_metadata,
-                'cropped_regions': cropped_regions,
-                'anonymized_pdf_path': anonymized_pdf_path
-            })
-
-            if original_text:
-                self._apply_text_results()
-                self.processing_context['text_extracted'] = True
-
-            if extracted_metadata:
-                self._apply_metadata_results()
-                self.processing_context['metadata_processed'] = True
-
-            if anonymized_pdf_path:
-                self._apply_anonymized_pdf()
-                self.processing_context['anonymization_completed'] = True
-
+            if self.processing_mode == "cropping":
+                # Use advanced cropping method (existing implementation)
+                self._process_with_cropping(report_reader)
+            else:  # blackening mode
+                # Use enhanced process_report with PDF masking
+                self._process_with_blackening(report_reader)

         except Exception as e:
             logger.warning(f"Text processing failed: {e}")
             self._mark_processing_incomplete("text_processing_failed")

+    def _process_with_blackening(self, report_reader):
+        """Process PDF using simple blackening/masking mode."""
+        logger.info("Using simple PDF blackening mode...")
+
+        # Setup anonymized directory
+        anonymized_dir = path_utils.PDF_DIR / "anonymized"
+        anonymized_dir.mkdir(parents=True, exist_ok=True)
+
+        # Generate output path for anonymized PDF
+        pdf_hash = self.current_pdf.pdf_hash
+        anonymized_output_path = anonymized_dir / f"{pdf_hash}_anonymized.pdf"
+
+        # Process with enhanced process_report method (returns 4-tuple now)
+        original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = (
+            report_reader.process_report(
+                pdf_path=self.processing_context["file_path"],
+                create_anonymized_pdf=True,
+                anonymized_pdf_output_path=str(anonymized_output_path),
+            )
+        )
+
+        # Store results in context
+        self.processing_context.update(
+            {
+                "original_text": original_text,
+                "anonymized_text": anonymized_text,
+                "extracted_metadata": extracted_metadata,
+                "cropped_regions": None,  # Not available in blackening mode
+                "anonymized_pdf_path": anonymized_pdf_path,
+            }
+        )
+
+        # Apply results
+        if original_text:
+            self._apply_text_results()
+            self.processing_context["text_extracted"] = True
+
+        if extracted_metadata:
+            self._apply_metadata_results()
+            self.processing_context["metadata_processed"] = True
+
+        if anonymized_pdf_path:
+            self._apply_anonymized_pdf()
+            self.processing_context["anonymization_completed"] = True
+
+        logger.info("PDF blackening processing completed")
+
+    def _process_with_cropping(self, report_reader):
+        """Process PDF using advanced cropping mode (existing implementation)."""
+        logger.info("Using advanced cropping mode...")
+
+        # Setup output directories
+        crops_dir = path_utils.PDF_DIR / "cropped_regions"
+        anonymized_dir = path_utils.PDF_DIR / "anonymized"
+        crops_dir.mkdir(parents=True, exist_ok=True)
+        anonymized_dir.mkdir(parents=True, exist_ok=True)
+
+        # Process with cropping (returns 5-tuple)
+        (
+            original_text,
+            anonymized_text,
+            extracted_metadata,
+            cropped_regions,
+            anonymized_pdf_path,
+        ) = report_reader.process_report_with_cropping(
+            pdf_path=self.processing_context["file_path"],
+            crop_sensitive_regions=True,
+            crop_output_dir=str(crops_dir),
+            anonymization_output_dir=str(anonymized_dir),
+        )
+
+        # Store results in context
+        self.processing_context.update(
+            {
+                "original_text": original_text,
+                "anonymized_text": anonymized_text,
+                "extracted_metadata": extracted_metadata,
+                "cropped_regions": cropped_regions,
+                "anonymized_pdf_path": anonymized_pdf_path,
+            }
+        )
+
+        # Apply results
+        if original_text:
+            self._apply_text_results()
+            self.processing_context["text_extracted"] = True
+
+        if extracted_metadata:
+            self._apply_metadata_results()
+            self.processing_context["metadata_processed"] = True
+
+        if anonymized_pdf_path:
+            self._apply_anonymized_pdf()
+            self.processing_context["anonymization_completed"] = True
+
+        logger.info("PDF cropping processing completed")
+
     def _apply_text_results(self):
         """Apply text extraction results to the PDF instance."""
         if not self.current_pdf:
             logger.warning("Cannot apply text results - no PDF instance available")
             return
-
-        original_text = self.processing_context.get(
-        anonymized_text = self.processing_context.get(
-
+
+        original_text = self.processing_context.get("original_text")
+        anonymized_text = self.processing_context.get("anonymized_text")
+
         if not original_text:
             logger.warning("No original text available to apply")
             return
-
+
         # Store extracted text
         self.current_pdf.text = original_text
         logger.info(f"Extracted {len(original_text)} characters of text from PDF")
-
+
         # Handle anonymized text
         if anonymized_text and anonymized_text != original_text:
             self.current_pdf.anonymized = True
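Per the comments in the two handlers above, process_report returns a 4-tuple and process_report_with_cropping a 5-tuple, but both funnel into the same processing_context keys. A simplified illustration of that normalization, with stand-in result values (the real tuples come from lx_anonymizer's ReportReader):

def normalize_results(mode: str, results: tuple) -> dict:
    # Both shapes share the first three fields; cropping adds a regions list.
    if mode == "cropping":
        original, anonymized, metadata, regions, pdf_path = results
    else:  # blackening: process_report returns no cropped regions
        original, anonymized, metadata, pdf_path = results
        regions = None
    return {
        "original_text": original,
        "anonymized_text": anonymized,
        "extracted_metadata": metadata,
        "cropped_regions": regions,
        "anonymized_pdf_path": pdf_path,
    }

# Stand-in values only, to show the two tuple shapes mapping to one dict.
ctx = normalize_results("blackening", ("text", "text [REDACTED]", {}, "/tmp/out.pdf"))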
@@ -525,56 +684,56 @@ class PdfImportService:
         if not self.current_pdf:
             logger.warning("Cannot apply metadata results - no PDF instance available")
             return
-
-        extracted_metadata = self.processing_context.get(
-
+
+        extracted_metadata = self.processing_context.get("extracted_metadata")
+
         if not self.current_pdf.sensitive_meta or not extracted_metadata:
             logger.debug("No sensitive meta or extracted metadata available")
             return
-
+
         sm = self.current_pdf.sensitive_meta
-
+
         # Map ReportReader metadata to SensitiveMeta fields
         metadata_mapping = {
-
-
-
-
-
-
-
-
+            "patient_first_name": "patient_first_name",
+            "patient_last_name": "patient_last_name",
+            "patient_dob": "patient_dob",
+            "examination_date": "examination_date",
+            "examiner_first_name": "examiner_first_name",
+            "examiner_last_name": "examiner_last_name",
+            "endoscope_type": "endoscope_type",
+            "casenumber": "case_number",
         }
-
+
         # Update fields with extracted information
         updated_fields = []
         for meta_key, sm_field in metadata_mapping.items():
             if extracted_metadata.get(meta_key) and hasattr(sm, sm_field):
                 old_value = getattr(sm, sm_field)
                 raw_value = extracted_metadata[meta_key]
-
+
                 # Skip if we just got the field name as a string (indicates no actual data)
                 if isinstance(raw_value, str) and raw_value == meta_key:
                     continue
-
+
                 # Handle date fields specially
-                if sm_field in [
+                if sm_field in ["patient_dob", "examination_date"]:
                     new_value = self._parse_date_field(raw_value, meta_key, sm_field)
                     if new_value is None:
                         continue
                 else:
                     new_value = raw_value
-
+
                 # Configurable overwrite policy
                 should_overwrite = (
                     self.allow_meta_overwrite
                     or not old_value
-                    or old_value in [
+                    or old_value in ["Patient", "Unknown"]
                 )
                 if new_value and should_overwrite:
                     setattr(sm, sm_field, new_value)
                     updated_fields.append(sm_field)
-
+
         if updated_fields:
             sm.save()
             logger.info(f"Updated SensitiveMeta fields: {updated_fields}")
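The mapping table drives a generic update loop: each extracted key is copied onto the SensitiveMeta instance only when the overwrite policy allows it. A minimal standalone sketch of that policy, using a plain object in place of the Django model:

from types import SimpleNamespace

PLACEHOLDERS = ("Patient", "Unknown")  # seeded defaults from the service

def apply_metadata(sm, extracted: dict, mapping: dict, allow_overwrite: bool) -> list:
    updated = []
    for meta_key, field in mapping.items():
        new_value = extracted.get(meta_key)
        if not new_value or not hasattr(sm, field):
            continue
        old_value = getattr(sm, field)
        # Overwrite only when allowed, empty, or still a seeded placeholder.
        if allow_overwrite or not old_value or old_value in PLACEHOLDERS:
            setattr(sm, field, new_value)
            updated.append(field)
    return updated

sm = SimpleNamespace(patient_first_name="Patient", patient_last_name="Unknown")
mapping = {"patient_first_name": "patient_first_name",
           "patient_last_name": "patient_last_name"}
print(apply_metadata(sm, {"patient_last_name": "Doe"}, mapping, False))
# ['patient_last_name'] - placeholders yield to extracted values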
@@ -587,26 +746,29 @@ class PdfImportService:
                 if raw_value == meta_key:
                     logger.warning(
                         "Skipping date field %s - got field name '%s' instead of actual date",
-                        sm_field,
+                        sm_field,
+                        raw_value,
                     )
                     return None
-
+
                 # Try common date formats
-                date_formats = [
+                date_formats = ["%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y"]
                 for fmt in date_formats:
                     try:
                         return datetime.strptime(raw_value, fmt).date()
                     except ValueError:
                         continue
-
-                logger.warning(
+
+                logger.warning(
+                    "Could not parse date '%s' for field %s", raw_value, sm_field
+                )
                 return None
-
-            elif hasattr(raw_value,
+
+            elif hasattr(raw_value, "date"):
                 return raw_value.date()
             else:
                 return raw_value
-
+
         except (ValueError, AttributeError) as e:
             logger.warning("Date parsing failed for %s: %s", sm_field, e)
             return None
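A condensed sketch of the same multi-format date parsing, mirroring _parse_date_field above minus logging:

from datetime import date, datetime
from typing import Optional

DATE_FORMATS = ("%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y")

def parse_date(raw) -> Optional[date]:
    if isinstance(raw, str):
        for fmt in DATE_FORMATS:  # first matching format wins
            try:
                return datetime.strptime(raw, fmt).date()
            except ValueError:
                continue
        return None
    if hasattr(raw, "date"):  # datetime-like objects
        return raw.date()
    return raw  # already a date (or None)

print(parse_date("24.12.2023"))  # 2023-12-24

Note that because the first matching format wins, ambiguous strings like "01/02/2023" resolve as day/month here, since "%d/%m/%Y" precedes "%m/%d/%Y" in the list.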
@@ -626,14 +788,17 @@ class PdfImportService:
             logger.warning("Cannot apply anonymized PDF - no PDF instance available")
             return

-        anonymized_pdf_path = self.processing_context.get(
+        anonymized_pdf_path = self.processing_context.get("anonymized_pdf_path")
         if not anonymized_pdf_path:
             logger.debug("No anonymized_pdf_path present in processing context")
             return

         anonymized_path = Path(anonymized_pdf_path)
         if not anonymized_path.exists():
-            logger.warning(
+            logger.warning(
+                "Anonymized PDF path returned but file does not exist: %s",
+                anonymized_path,
+            )
             return

         logger.info("Anonymized PDF created by ReportReader at: %s", anonymized_path)
@@ -647,7 +812,7 @@ class PdfImportService:
             relative_name = str(anonymized_path)

         # Only update if something actually changed
-        if getattr(self.current_pdf.anonymized_file,
+        if getattr(self.current_pdf.anonymized_file, "name", None) != relative_name:
             self.current_pdf.anonymized_file.name = relative_name

         # Ensure model/state reflect anonymization even if text didn't differ
@@ -656,14 +821,16 @@ class PdfImportService:

         # Persist cropped regions info somewhere useful (optional & non-breaking)
         # If your model has a field for this, persist there; otherwise we just log.
-        cropped_regions = self.processing_context.get(
+        cropped_regions = self.processing_context.get("cropped_regions")
         if cropped_regions:
-            logger.debug(
+            logger.debug(
+                "Cropped regions recorded (%d regions).", len(cropped_regions)
+            )

         # Save model changes
-        update_fields = [
-        if
-        update_fields.append(
+        update_fields = ["anonymized_file"]
+        if "anonymized" in self.current_pdf.__dict__:
+            update_fields.append("anonymized")
         self.current_pdf.save(update_fields=update_fields)

         # Mark state as anonymized immediately; this keeps downstream flows working
@@ -671,31 +838,41 @@ class PdfImportService:
             if state and not state.anonymized:
                 state.mark_anonymized(save=True)

-            logger.info(
+            logger.info(
+                "Updated anonymized_file reference to: %s",
+                self.current_pdf.anonymized_file.name,
+            )

         except Exception as e:
             logger.warning("Could not set anonymized file reference: %s", e)

-
     def _finalize_processing(self):
         """Finalize processing and update state."""
         if not self.current_pdf:
             logger.warning("Cannot finalize processing - no PDF instance available")
             return
-
+
         try:
             # Update state based on processing results
             state = self._ensure_state(self.current_pdf)
-
-            if self.processing_context.get(
+
+            if self.processing_context.get("text_extracted") and state:
                 state.mark_anonymized()
-
+
+            # Mark as ready for validation after successful anonymization
+            if self.processing_context.get("anonymization_completed") and state:
+                state.mark_sensitive_meta_processed()
+                logger.info(
+                    f"PDF {self.current_pdf.pdf_hash} processing completed - "
+                    f"ready for validation (status: {state.anonymization_status})"
+                )
+
             # Save all changes
             with transaction.atomic():
                 self.current_pdf.save()
                 if state:
                     state.save()
-
+
             logger.info("PDF processing completed successfully")
         except Exception as e:
             logger.warning(f"Failed to finalize processing: {e}")
@@ -703,9 +880,11 @@ class PdfImportService:
     def _mark_processing_incomplete(self, reason: str):
         """Mark processing as incomplete with reason."""
         if not self.current_pdf:
-            logger.warning(
+            logger.warning(
+                f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}"
+            )
             return
-
+
         try:
             state = self._ensure_state(self.current_pdf)
             if state:
@@ -714,7 +893,7 @@ class PdfImportService:
                 state.sensitive_meta_processed = False
                 state.save()
                 logger.info(f"Set PDF state: processed=False due to {reason}")
-
+
             # Save changes
             with transaction.atomic():
                 self.current_pdf.save()
@@ -722,31 +901,53 @@ class PdfImportService:
             logger.warning(f"Failed to mark processing incomplete: {e}")

     def _retry_existing_pdf(self, existing_pdf):
-        """
+        """
+        Retry processing for existing PDF.
+
+        Uses get_raw_file_path() to find the original raw file instead of
+        relying on the file field which may point to a deleted sensitive file.
+        """
         try:
+            # ✅ FIX: Use get_raw_file_path() to find original file
+            raw_file_path = existing_pdf.get_raw_file_path()
+
+            if not raw_file_path or not raw_file_path.exists():
+                logger.error(
+                    f"Cannot retry PDF {existing_pdf.pdf_hash}: Raw file not found. "
+                    f"Please re-upload the original PDF file."
+                )
+                self.current_pdf = existing_pdf
+                return existing_pdf
+
+            logger.info(f"Found raw file for retry at: {raw_file_path}")
+
             # Remove from processed files to allow retry
-            file_path_str = str(
-            if file_path_str
+            file_path_str = str(raw_file_path)
+            if file_path_str in self.processed_files:
                 self.processed_files.remove(file_path_str)
                 logger.debug(f"Removed {file_path_str} from processed files for retry")
-
+
             return self.import_and_anonymize(
-                file_path=
-                center_name=existing_pdf.center.name
-
-
+                file_path=raw_file_path,  # ✅ Use raw file path, not sensitive path
+                center_name=existing_pdf.center.name
+                if existing_pdf.center
+                else "unknown_center",
+                delete_source=False,  # Never delete during retry
+                retry=True,
             )
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}"
+            )
             self.current_pdf = existing_pdf
             return existing_pdf

     def _cleanup_on_error(self):
         """Cleanup processing context on error."""
         try:
-            if self.current_pdf and hasattr(self.current_pdf,
+            if self.current_pdf and hasattr(self.current_pdf, "state"):
                 state = self._ensure_state(self.current_pdf)
-                if state and self.processing_context.get(
+                if state and self.processing_context.get("processing_started"):
                     state.text_meta_extracted = False
                     state.pdf_meta_extracted = False
                     state.sensitive_meta_processed = False
@@ -756,7 +957,7 @@ class PdfImportService:
             logger.warning(f"Error during cleanup: {e}")
         finally:
             # Remove any sensitive copy created during this processing run
-            sensitive_created = self.processing_context.get(
+            sensitive_created = self.processing_context.get("sensitive_copy_created")
             if sensitive_created:
                 pdf_obj = self.current_pdf
                 try:
@@ -765,30 +966,51 @@ class PdfImportService:
                     if file_field and getattr(file_field, "name", None):
                         storage_name = file_field.name
                         file_field.delete(save=False)
-                        logger.debug(
+                        logger.debug(
+                            "Deleted sensitive copy %s during error cleanup",
+                            storage_name,
+                        )
                 except Exception as cleanup_exc:
-                    logger.warning(
+                    logger.warning(
+                        "Failed to remove sensitive copy during error cleanup: %s",
+                        cleanup_exc,
+                    )

             # Always clean up processed files set to prevent blocks
-            file_path = self.processing_context.get(
+            file_path = self.processing_context.get("file_path")
             if file_path and str(file_path) in self.processed_files:
                 self.processed_files.remove(str(file_path))
-                logger.debug(
+                logger.debug(
+                    f"Removed {file_path} from processed files during error cleanup"
+                )

             try:
-                original_path = self.processing_context.get(
-                logger.debug(
-
+                original_path = self.processing_context.get("original_file_path")
+                logger.debug(
+                    "PDF cleanup original path: %s (%s)",
+                    original_path,
+                    type(original_path),
+                )
+                raw_dir = (
+                    original_path.parent if isinstance(original_path, Path) else None
+                )
                 if (
                     isinstance(original_path, Path)
                     and original_path.exists()
-                    and not self.processing_context.get(
+                    and not self.processing_context.get("sensitive_copy_created")
                 ):
                     try:
                         original_path.unlink()
-                        logger.info(
+                        logger.info(
+                            "Removed original file %s during error cleanup",
+                            original_path,
+                        )
                     except Exception as remove_exc:
-                        logger.warning(
+                        logger.warning(
+                            "Could not remove original file %s during error cleanup: %s",
+                            original_path,
+                            remove_exc,
+                        )
                 pdf_dir = self._get_pdf_dir()
                 if not pdf_dir and raw_dir:
                     base_dir = raw_dir.parent
@@ -805,7 +1027,12 @@ class PdfImportService:

                 # Remove empty PDF subdirectories that might have been created during setup
                 if pdf_dir and pdf_dir.exists():
-                    for subdir_name in (
+                    for subdir_name in (
+                        "sensitive",
+                        "cropped_regions",
+                        "anonymized",
+                        "_processing",
+                    ):
                         subdir_path = pdf_dir / subdir_name
                         if subdir_path.exists() and subdir_path.is_dir():
                             try:
@@ -813,22 +1040,49 @@ class PdfImportService:
                             except StopIteration:
                                 try:
                                     subdir_path.rmdir()
-                                    logger.debug(
+                                    logger.debug(
+                                        "Removed empty directory %s during error cleanup",
+                                        subdir_path,
+                                    )
                                 except OSError as rm_err:
-                                    logger.debug(
+                                    logger.debug(
+                                        "Could not remove directory %s: %s",
+                                        subdir_path,
+                                        rm_err,
+                                    )
                             except Exception as iter_err:
-                                logger.debug(
-
-
-
+                                logger.debug(
+                                    "Could not inspect directory %s: %s",
+                                    subdir_path,
+                                    iter_err,
+                                )
+
+                raw_count = (
+                    len(list(raw_dir.glob("*")))
+                    if raw_dir and raw_dir.exists()
+                    else None
+                )
+                pdf_count = (
+                    len(list(pdf_dir.glob("*")))
+                    if pdf_dir and pdf_dir.exists()
+                    else None
+                )

-                sensitive_path = self.processing_context.get(
+                sensitive_path = self.processing_context.get("sensitive_file_path")
                 if sensitive_path:
                     sensitive_parent = Path(sensitive_path).parent
-                    sensitive_count =
+                    sensitive_count = (
+                        len(list(sensitive_parent.glob("*")))
+                        if sensitive_parent.exists()
+                        else None
+                    )
                 else:
                     sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
-                    sensitive_count =
+                    sensitive_count = (
+                        len(list(sensitive_dir.glob("*")))
+                        if sensitive_dir and sensitive_dir.exists()
+                        else None
+                    )

                 logger.info(
                     "PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
@@ -843,17 +1097,17 @@ class PdfImportService:
         """Cleanup processing context."""
         try:
             # Clean up temporary directories
-            if self.processing_context.get(
-                crops_dir = path_utils.PDF_DIR /
+            if self.processing_context.get("text_extracted"):
+                crops_dir = path_utils.PDF_DIR / "cropped_regions"
                 if crops_dir.exists() and not any(crops_dir.iterdir()):
                     crops_dir.rmdir()
-
+
             # Always remove from processed files set after processing attempt
-            file_path = self.processing_context.get(
+            file_path = self.processing_context.get("file_path")
             if file_path and str(file_path) in self.processed_files:
                 self.processed_files.remove(str(file_path))
                 logger.debug(f"Removed {file_path} from processed files set")
-
+
         except Exception as e:
             logger.warning(f"Error during context cleanup: {e}")
         finally:
@@ -862,44 +1116,43 @@ class PdfImportService:
             self.processing_context = {}

     def import_simple(
-        self,
-        file_path: Union[Path, str],
-        center_name: str,
-        delete_source: bool = False
+        self, file_path: Union[Path, str], center_name: str, delete_source: bool = False
     ) -> "RawPdfFile":
         """
         Simple PDF import without text processing or anonymization.
         Uses centralized PDF instance management pattern.
-
+
         Args:
             file_path: Path to the PDF file to import
             center_name: Name of the center to associate with PDF
             delete_source: Whether to delete the source file after import
-
+
         Returns:
             RawPdfFile instance after basic import
         """
         try:
             # Initialize simple processing context
-            self._initialize_processing_context(
-
+            self._initialize_processing_context(
+                file_path, center_name, delete_source, False
+            )
+
             # Validate file
             self._validate_and_prepare_file()
-
+
             # Create PDF instance
             logger.info("Starting simple import - creating RawPdfFile instance...")
             self.current_pdf = RawPdfFile.create_from_file_initialized(
-                file_path=self.processing_context[
+                file_path=self.processing_context["file_path"],
                 center_name=center_name,
                 delete_source=delete_source,
             )
-
+
             if not self.current_pdf:
                 raise RuntimeError("Failed to create RawPdfFile instance")
-
+
             # Mark as processed
-            self.processed_files.add(str(self.processing_context[
-
+            self.processed_files.add(str(self.processing_context["file_path"]))
+
             # Set basic state for simple import
             state = self._ensure_state(self.current_pdf)
             if state:
@@ -908,56 +1161,68 @@ class PdfImportService:
                 state.sensitive_meta_processed = False
                 state.save()
                 logger.info("Set PDF state: processed=False for simple import")
-
+
             # Save changes
             with transaction.atomic():
                 self.current_pdf.save()
-
-            logger.info(
+
+            logger.info(
+                "Simple import completed for RawPdfFile hash: %s",
+                self.current_pdf.pdf_hash,
+            )
             return self.current_pdf
-
+
         except Exception as e:
             logger.error(f"Simple PDF import failed for {file_path}: {e}")
             self._cleanup_on_error()
             raise
         finally:
             self._cleanup_processing_context()
-
-    def check_storage_capacity(
+
+    def check_storage_capacity(
+        self, file_path: Union[Path, str], storage_root, min_required_space
+    ) -> None:
         """
         Check if there is sufficient storage capacity for the PDF file.
-
+
         Args:
             file_path: Path to the PDF file to check
-
+
         Raises:
             InsufficientStorageError: If there is not enough space
         """
         import shutil
+
         from endoreg_db.exceptions import InsufficientStorageError
-
+
         file_path = Path(file_path)
         if not file_path.exists():
             raise FileNotFoundError(f"File not found for storage check: {file_path}")
-
+
         # Get the size of the file
         file_size = file_path.stat().st_size
-
+
         # Get available space in the storage directory

         total, used, free = shutil.disk_usage(storage_root)
-
+
         if file_size:
             min_required_space = file_size if isinstance(min_required_space, int) else 0

         # Check if there is enough space
         if file_size > free:
-            raise InsufficientStorageError(
-
-
+            raise InsufficientStorageError(
+                f"Not enough space to store PDF file: {file_path}"
+            )
+        logger.info(
+            f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available"
+        )
+
         return True
-
-    def create_sensitive_file(
+
+    def create_sensitive_file(
+        self, pdf_instance: "RawPdfFile" = None, file_path: Union[Path, str] = None
+    ) -> None:
         """
         Create a copy of the PDF file in the sensitive directory and update the file reference.
         Delete the source path to avoid duplicates.
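The capacity check rests on shutil.disk_usage, which reports total, used, and free bytes for the filesystem containing a path. A condensed sketch of the underlying check, without the service's exception and logging plumbing:

import shutil
from pathlib import Path

def has_capacity(file_path: Path, storage_root: Path) -> bool:
    """Return True when the free space at storage_root can hold file_path."""
    size = file_path.stat().st_size
    total, used, free = shutil.disk_usage(storage_root)
    return size <= free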
@@ -966,7 +1231,9 @@ class PdfImportService:
         Ensures the FileField points to the file under STORAGE_DIR/pdfs/sensitive and never back to raw_pdfs.
         """
         pdf_file = pdf_instance or self.current_pdf
-        source_path =
+        source_path = (
+            Path(file_path) if file_path else self.processing_context.get("file_path")
+        )

         if not pdf_file:
             raise ValueError("No PDF instance available for creating sensitive file")
@@ -989,25 +1256,37 @@ class PdfImportService:
                 try:
                     target.unlink()
                 except Exception as e:
-                    logger.warning(
+                    logger.warning(
+                        "Could not remove existing sensitive target %s: %s",
+                        target,
+                        e,
+                    )
             shutil.move(str(source_path), str(target))
             logger.info(f"Moved PDF to sensitive directory: {target}")

             # Update FileField to reference the file under STORAGE_DIR
             # We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
             try:
-                relative_name = str(
+                relative_name = str(
+                    target.relative_to(path_utils.STORAGE_DIR)
+                )  # Point Django FileField to sensitive storage
             except ValueError:
                 # Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
                 relative_name = str(target)

             # Only update when changed
-            if getattr(pdf_file.file,
+            if getattr(pdf_file.file, "name", None) != relative_name:
                 pdf_file.file.name = relative_name
-                pdf_file.save(update_fields=[
-                logger.info(
+                pdf_file.save(update_fields=["file"])
+                logger.info(
+                    "Updated PDF FileField reference to sensitive path: %s",
+                    pdf_file.file.path,
+                )
             else:
-                logger.debug(
+                logger.debug(
+                    "PDF FileField already points to sensitive path: %s",
+                    pdf_file.file.path,
+                )

             # Best-effort: if original source still exists (e.g., copy), remove it to avoid re-triggers
             try:
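Path.relative_to raises ValueError when the target is not under the given root, which is exactly what the try/except above leans on. A minimal sketch of that idiom with assumed paths:

from pathlib import Path

def storage_relative_name(target: Path, storage_dir: Path) -> str:
    # Django FileField names are stored relative to the storage root;
    # fall back to the absolute path when the target lives elsewhere.
    try:
        return str(target.relative_to(storage_dir))
    except ValueError:
        return str(target)

print(storage_relative_name(Path("/storage/pdfs/sensitive/a.pdf"), Path("/storage")))
# pdfs/sensitive/a.pdf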
@@ -1018,57 +1297,81 @@ class PdfImportService:
             logger.warning(f"Could not delete original PDF file {source_path}: {e}")

         except Exception as e:
-            logger.warning(
+            logger.warning(
+                f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}",
+                exc_info=True,
+            )

-    def archive_or_quarantine_file(
-
+    def archive_or_quarantine_file(
+        self,
+        pdf_instance: "RawPdfFile" = None,
+        source_file_path: Union[Path, str] = None,
+        quarantine_reason: str = None,
+        is_pdf_problematic: bool = None,
+    ) -> bool:
         """
         Archive or quarantine file based on the state of the PDF processing.
         Uses the central PDF instance and processing context if parameters not provided.
-
+
         Args:
             pdf_instance: Optional PDF instance, defaults to self.current_pdf
             source_file_path: Optional source file path, defaults to processing_context['file_path']
             quarantine_reason: Optional quarantine reason, defaults to processing_context['error_reason']
             is_pdf_problematic: Optional override for problematic state
-
+
         Returns:
             bool: True if file was quarantined, False if archived successfully
         """
         pdf_file = pdf_instance or self.current_pdf
-        file_path =
-
-
+        file_path = (
+            Path(source_file_path)
+            if source_file_path
+            else self.processing_context.get("file_path")
+        )
+        quarantine_reason = quarantine_reason or self.processing_context.get(
+            "error_reason"
+        )
+
         if not pdf_file:
             raise ValueError("No PDF instance available for archiving/quarantine")
         if not file_path:
             raise ValueError("No file path available for archiving/quarantine")
-
+
         # Determine if the PDF is problematic
-        pdf_problematic =
-
+        pdf_problematic = (
+            is_pdf_problematic
+            if is_pdf_problematic is not None
+            else pdf_file.is_problematic
+        )
+
         if pdf_problematic:
             # Quarantine the file
-            logger.warning(
+            logger.warning(
+                f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}"
+            )
             quarantine_dir = path_utils.PDF_DIR / "quarantine"
             os.makedirs(quarantine_dir, exist_ok=True)
-
+
             quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
             try:
                 shutil.move(file_path, quarantine_path)
-                pdf_file.quarantine_reason =
-
+                pdf_file.quarantine_reason = (
+                    quarantine_reason or "File processing failed"
+                )
+                pdf_file.save(update_fields=["quarantine_reason"])
                 logger.info(f"Moved problematic PDF to quarantine: {quarantine_path}")
                 return True
             except Exception as e:
                 logger.error(f"Failed to quarantine PDF {pdf_file.pdf_hash}: {e}")
-                return
+                return (
+                    True  # Still consider as quarantined to prevent further processing
+                )
         else:
             # Archive the file normally
             logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
             archive_dir = path_utils.PDF_DIR / "processed"
             os.makedirs(archive_dir, exist_ok=True)
-
+
             archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
             try:
                 shutil.move(file_path, archive_path)