endoreg-db 0.8.4.7__py3-none-any.whl → 0.8.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of endoreg-db might be problematic. Click here for more details.
- endoreg_db/models/media/video/video_file.py +24 -38
- endoreg_db/services/pdf_import.py +326 -246
- endoreg_db/services/video_import.py +29 -16
- endoreg_db/views/media/video_segments.py +187 -259
- {endoreg_db-0.8.4.7.dist-info → endoreg_db-0.8.4.9.dist-info}/METADATA +1 -1
- {endoreg_db-0.8.4.7.dist-info → endoreg_db-0.8.4.9.dist-info}/RECORD +8 -8
- {endoreg_db-0.8.4.7.dist-info → endoreg_db-0.8.4.9.dist-info}/WHEEL +0 -0
- {endoreg_db-0.8.4.7.dist-info → endoreg_db-0.8.4.9.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,22 +4,25 @@ PDF import service module.
|
|
|
4
4
|
Provides high-level functions for importing and anonymizing PDF files,
|
|
5
5
|
combining RawPdfFile creation with text extraction and anonymization.
|
|
6
6
|
"""
|
|
7
|
-
|
|
7
|
+
|
|
8
8
|
import errno
|
|
9
|
+
import hashlib
|
|
9
10
|
import logging
|
|
11
|
+
import os
|
|
10
12
|
import shutil
|
|
11
13
|
import sys
|
|
12
|
-
import
|
|
13
|
-
import
|
|
14
|
+
import time
|
|
15
|
+
from contextlib import contextmanager
|
|
16
|
+
from datetime import date, datetime
|
|
14
17
|
from pathlib import Path
|
|
15
18
|
from typing import TYPE_CHECKING, Union
|
|
16
|
-
|
|
19
|
+
|
|
17
20
|
from django.db import transaction
|
|
21
|
+
|
|
22
|
+
from endoreg_db.models import SensitiveMeta
|
|
18
23
|
from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
|
|
19
24
|
from endoreg_db.models.state.raw_pdf import RawPdfState
|
|
20
|
-
from endoreg_db.models import SensitiveMeta
|
|
21
25
|
from endoreg_db.utils import paths as path_utils
|
|
22
|
-
import time
|
|
23
26
|
|
|
24
27
|
logger = logging.getLogger(__name__)
|
|
25
28
|
|
|
@@ -34,24 +37,61 @@ class PdfImportService:
|
|
|
34
37
|
"""
|
|
35
38
|
Service class for importing and processing PDF files with text extraction and anonymization.
|
|
36
39
|
Uses a central PDF instance pattern for cleaner state management.
|
|
40
|
+
|
|
41
|
+
Supports two processing modes:
|
|
42
|
+
- 'blackening': Simple PDF masking with black rectangles over sensitive areas
|
|
43
|
+
- 'cropping': Advanced mode that crops sensitive regions to separate images
|
|
37
44
|
"""
|
|
38
|
-
|
|
39
|
-
def __init__(self, allow_meta_overwrite: bool = False):
|
|
45
|
+
|
|
46
|
+
def __init__(self, allow_meta_overwrite: bool = False, processing_mode: str = "blackening"):
|
|
40
47
|
"""
|
|
41
48
|
Initialize the PDF import service.
|
|
42
|
-
|
|
49
|
+
|
|
43
50
|
Args:
|
|
44
51
|
allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
|
|
52
|
+
processing_mode: Processing mode - 'blackening' for simple masking, 'cropping' for advanced cropping
|
|
45
53
|
"""
|
|
46
54
|
self.processed_files = set()
|
|
47
55
|
self._report_reader_available = None
|
|
48
56
|
self._report_reader_class = None
|
|
49
57
|
self.allow_meta_overwrite = allow_meta_overwrite
|
|
50
|
-
|
|
58
|
+
|
|
59
|
+
# Validate and set processing mode
|
|
60
|
+
valid_modes = ["blackening", "cropping"]
|
|
61
|
+
if processing_mode not in valid_modes:
|
|
62
|
+
raise ValueError(f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}")
|
|
63
|
+
self.processing_mode = processing_mode
|
|
64
|
+
|
|
51
65
|
# Central PDF instance management
|
|
52
66
|
self.current_pdf = None
|
|
53
67
|
self.processing_context = {}
|
|
54
|
-
|
|
68
|
+
|
|
69
|
+
@classmethod
|
|
70
|
+
def with_blackening(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
|
|
71
|
+
"""
|
|
72
|
+
Create a PdfImportService configured for simple PDF blackening mode.
|
|
73
|
+
|
|
74
|
+
Args:
|
|
75
|
+
allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
PdfImportService instance configured for blackening mode
|
|
79
|
+
"""
|
|
80
|
+
return cls(allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening")
|
|
81
|
+
|
|
82
|
+
@classmethod
|
|
83
|
+
def with_cropping(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
|
|
84
|
+
"""
|
|
85
|
+
Create a PdfImportService configured for advanced cropping mode.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
|
|
89
|
+
|
|
90
|
+
Returns:
|
|
91
|
+
PdfImportService instance configured for cropping mode
|
|
92
|
+
"""
|
|
93
|
+
return cls(allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping")
|
|
94
|
+
|
|
55
95
|
@contextmanager
|
|
56
96
|
def _file_lock(self, path: Path):
|
|
57
97
|
"""Create a file lock to prevent duplicate processing.
|
|
@@ -75,10 +115,7 @@ class PdfImportService:
|
|
|
75
115
|
|
|
76
116
|
if age is not None and age > STALE_LOCK_SECONDS:
|
|
77
117
|
try:
|
|
78
|
-
logger.warning(
|
|
79
|
-
"Stale lock detected for %s (age %.0fs). Reclaiming lock...",
|
|
80
|
-
path, age
|
|
81
|
-
)
|
|
118
|
+
logger.warning("Stale lock detected for %s (age %.0fs). Reclaiming lock...", path, age)
|
|
82
119
|
lock_path.unlink()
|
|
83
120
|
except Exception as e:
|
|
84
121
|
logger.warning("Failed to remove stale lock %s: %s", lock_path, e)
|
|
@@ -100,7 +137,7 @@ class PdfImportService:
|
|
|
100
137
|
lock_path.unlink()
|
|
101
138
|
except OSError:
|
|
102
139
|
pass
|
|
103
|
-
|
|
140
|
+
|
|
104
141
|
def _sha256(self, path: Path, chunk: int = 1024 * 1024) -> str:
|
|
105
142
|
"""Compute SHA256 hash of a file."""
|
|
106
143
|
h = hashlib.sha256()
|
|
@@ -134,7 +171,7 @@ class PdfImportService:
|
|
|
134
171
|
return Path(str(candidate))
|
|
135
172
|
except Exception:
|
|
136
173
|
return None
|
|
137
|
-
|
|
174
|
+
|
|
138
175
|
def _quarantine(self, source: Path) -> Path:
|
|
139
176
|
"""Move file to quarantine directory to prevent re-processing."""
|
|
140
177
|
qdir = path_utils.PDF_DIR / "_processing"
|
|
@@ -150,7 +187,7 @@ class PdfImportService:
|
|
|
150
187
|
else:
|
|
151
188
|
raise
|
|
152
189
|
return target
|
|
153
|
-
|
|
190
|
+
|
|
154
191
|
def _ensure_state(self, pdf_file: "RawPdfFile"):
|
|
155
192
|
"""Ensure PDF file has a state object."""
|
|
156
193
|
if getattr(pdf_file, "state", None):
|
|
@@ -166,29 +203,30 @@ class PdfImportService:
|
|
|
166
203
|
return state
|
|
167
204
|
except Exception:
|
|
168
205
|
return None
|
|
169
|
-
|
|
206
|
+
|
|
170
207
|
def _ensure_report_reading_available(self):
|
|
171
208
|
"""
|
|
172
209
|
Ensure report reading modules are available by adding lx-anonymizer to path.
|
|
173
|
-
|
|
210
|
+
|
|
174
211
|
Returns:
|
|
175
212
|
Tuple of (availability_flag, ReportReader_class)
|
|
176
213
|
"""
|
|
177
214
|
if self._report_reader_available is not None:
|
|
178
215
|
return self._report_reader_available, self._report_reader_class
|
|
179
|
-
|
|
216
|
+
|
|
180
217
|
try:
|
|
181
218
|
# Try direct import first
|
|
182
219
|
from lx_anonymizer import ReportReader
|
|
183
|
-
|
|
220
|
+
|
|
184
221
|
logger.info("Successfully imported lx_anonymizer ReportReader module")
|
|
185
222
|
self._report_reader_available = True
|
|
186
223
|
self._report_reader_class = ReportReader
|
|
187
224
|
return True, ReportReader
|
|
188
|
-
|
|
225
|
+
|
|
189
226
|
except ImportError:
|
|
190
227
|
# Optional: honor LX_ANONYMIZER_PATH=/abs/path/to/src
|
|
191
228
|
import importlib
|
|
229
|
+
|
|
192
230
|
extra = os.getenv("LX_ANONYMIZER_PATH")
|
|
193
231
|
if extra and extra not in sys.path and Path(extra).exists():
|
|
194
232
|
sys.path.insert(0, extra)
|
|
@@ -205,18 +243,17 @@ class PdfImportService:
|
|
|
205
243
|
# Keep path for future imports if it worked; otherwise remove.
|
|
206
244
|
if "ReportReader" not in locals() and extra in sys.path:
|
|
207
245
|
sys.path.remove(extra)
|
|
208
|
-
|
|
246
|
+
|
|
209
247
|
self._report_reader_available = False
|
|
210
248
|
self._report_reader_class = None
|
|
211
249
|
return False, None
|
|
212
250
|
|
|
213
|
-
|
|
214
251
|
def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile" = None) -> None:
|
|
215
252
|
"""
|
|
216
253
|
Ensure PDF has minimum required patient data in SensitiveMeta.
|
|
217
254
|
Creates default values if data is missing after text processing.
|
|
218
255
|
Uses the central PDF instance if no specific instance provided.
|
|
219
|
-
|
|
256
|
+
|
|
220
257
|
Args:
|
|
221
258
|
pdf_instance: Optional specific PDF instance, defaults to self.current_pdf
|
|
222
259
|
"""
|
|
@@ -224,76 +261,80 @@ class PdfImportService:
|
|
|
224
261
|
if not pdf_file:
|
|
225
262
|
logger.warning("No PDF instance available for ensuring default patient data")
|
|
226
263
|
return
|
|
227
|
-
|
|
264
|
+
|
|
228
265
|
if not pdf_file.sensitive_meta:
|
|
229
266
|
logger.info(f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default")
|
|
230
|
-
|
|
267
|
+
|
|
231
268
|
# Create default SensitiveMeta with placeholder data
|
|
232
269
|
default_data = {
|
|
233
270
|
"patient_first_name": "Patient",
|
|
234
|
-
"patient_last_name": "Unknown",
|
|
271
|
+
"patient_last_name": "Unknown",
|
|
235
272
|
"patient_dob": date(1990, 1, 1), # Default DOB
|
|
236
273
|
"examination_date": date.today(),
|
|
237
|
-
"center_name": pdf_file.center.name if pdf_file.center else "university_hospital_wuerzburg"
|
|
274
|
+
"center_name": pdf_file.center.name if pdf_file.center else "university_hospital_wuerzburg",
|
|
238
275
|
}
|
|
239
|
-
|
|
276
|
+
|
|
240
277
|
try:
|
|
241
278
|
sensitive_meta = SensitiveMeta.create_from_dict(default_data)
|
|
242
279
|
pdf_file.sensitive_meta = sensitive_meta
|
|
243
|
-
pdf_file.save(update_fields=[
|
|
280
|
+
pdf_file.save(update_fields=["sensitive_meta"])
|
|
244
281
|
logger.info(f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}")
|
|
245
282
|
except Exception as e:
|
|
246
283
|
logger.error(f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}")
|
|
247
284
|
|
|
248
285
|
def import_and_anonymize(
|
|
249
|
-
self,
|
|
250
|
-
file_path: Union[Path, str],
|
|
251
|
-
center_name: str,
|
|
286
|
+
self,
|
|
287
|
+
file_path: Union[Path, str],
|
|
288
|
+
center_name: str,
|
|
252
289
|
delete_source: bool = False,
|
|
253
290
|
retry: bool = False,
|
|
254
291
|
) -> "RawPdfFile":
|
|
255
292
|
"""
|
|
256
293
|
Import a PDF file and anonymize it using ReportReader.
|
|
257
294
|
Uses centralized PDF instance management pattern.
|
|
258
|
-
|
|
295
|
+
|
|
296
|
+
The processing mode is determined by the service initialization:
|
|
297
|
+
- 'blackening': Creates an anonymized PDF with black rectangles over sensitive regions
|
|
298
|
+
- 'cropping': Advanced mode that crops sensitive regions to separate images
|
|
299
|
+
|
|
259
300
|
Args:
|
|
260
301
|
file_path: Path to the PDF file to import
|
|
261
302
|
center_name: Name of the center to associate with PDF
|
|
262
303
|
delete_source: Whether to delete the source file after import
|
|
263
304
|
retry: Whether this is a retry attempt
|
|
264
|
-
|
|
305
|
+
|
|
265
306
|
Returns:
|
|
266
307
|
RawPdfFile instance after import and processing
|
|
267
|
-
|
|
308
|
+
|
|
268
309
|
Raises:
|
|
269
310
|
Exception: On any failure during import or processing
|
|
270
311
|
"""
|
|
271
312
|
try:
|
|
272
313
|
# Initialize processing context
|
|
273
314
|
self._initialize_processing_context(file_path, center_name, delete_source, retry)
|
|
274
|
-
|
|
315
|
+
|
|
275
316
|
# Step 1: Validate and prepare file
|
|
276
317
|
self._validate_and_prepare_file()
|
|
277
|
-
|
|
318
|
+
|
|
278
319
|
# Step 2: Create or retrieve PDF instance
|
|
279
320
|
self._create_or_retrieve_pdf_instance()
|
|
280
|
-
|
|
321
|
+
|
|
281
322
|
# Early return check - if no PDF instance was created, return None
|
|
282
323
|
if not self.current_pdf:
|
|
283
324
|
logger.warning(f"No PDF instance created for {file_path}, returning None")
|
|
284
325
|
return None
|
|
285
|
-
|
|
326
|
+
|
|
286
327
|
# Step 3: Setup processing environment
|
|
287
328
|
self._setup_processing_environment()
|
|
288
|
-
|
|
329
|
+
|
|
289
330
|
# Step 4: Process text and metadata
|
|
290
331
|
self._process_text_and_metadata()
|
|
291
|
-
|
|
332
|
+
|
|
292
333
|
# Step 5: Finalize processing
|
|
293
334
|
self._finalize_processing()
|
|
294
|
-
|
|
335
|
+
|
|
295
336
|
return self.current_pdf
|
|
296
|
-
|
|
337
|
+
|
|
297
338
|
except ValueError as e:
|
|
298
339
|
# Handle "File already being processed" case specifically
|
|
299
340
|
if "already being processed" in str(e):
|
|
@@ -312,50 +353,49 @@ class PdfImportService:
|
|
|
312
353
|
# Always cleanup context
|
|
313
354
|
self._cleanup_processing_context()
|
|
314
355
|
|
|
315
|
-
def _initialize_processing_context(self, file_path: Union[Path, str], center_name: str,
|
|
316
|
-
delete_source: bool, retry: bool):
|
|
356
|
+
def _initialize_processing_context(self, file_path: Union[Path, str], center_name: str, delete_source: bool, retry: bool):
|
|
317
357
|
"""Initialize the processing context for the current PDF."""
|
|
318
358
|
self.processing_context = {
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
359
|
+
"file_path": Path(file_path),
|
|
360
|
+
"original_file_path": Path(file_path),
|
|
361
|
+
"center_name": center_name,
|
|
362
|
+
"delete_source": delete_source,
|
|
363
|
+
"retry": retry,
|
|
364
|
+
"file_hash": None,
|
|
365
|
+
"processing_started": False,
|
|
366
|
+
"text_extracted": False,
|
|
367
|
+
"metadata_processed": False,
|
|
368
|
+
"anonymization_completed": False,
|
|
329
369
|
}
|
|
330
|
-
|
|
370
|
+
|
|
331
371
|
# Check if already processed (only during current session to prevent race conditions)
|
|
332
372
|
if str(file_path) in self.processed_files:
|
|
333
373
|
logger.info(f"File {file_path} already being processed in current session, skipping")
|
|
334
374
|
raise ValueError("File already being processed")
|
|
335
|
-
|
|
375
|
+
|
|
336
376
|
logger.info(f"Starting import and processing for: {file_path}")
|
|
337
377
|
|
|
338
378
|
def _validate_and_prepare_file(self):
|
|
339
379
|
"""Validate file existence and calculate hash."""
|
|
340
|
-
file_path = self.processing_context[
|
|
341
|
-
|
|
380
|
+
file_path = self.processing_context["file_path"]
|
|
381
|
+
|
|
342
382
|
if not file_path.exists():
|
|
343
383
|
raise FileNotFoundError(f"PDF file not found: {file_path}")
|
|
344
|
-
|
|
384
|
+
|
|
345
385
|
try:
|
|
346
|
-
self.processing_context[
|
|
386
|
+
self.processing_context["file_hash"] = self._sha256(file_path)
|
|
347
387
|
except Exception as e:
|
|
348
388
|
logger.warning(f"Could not calculate file hash: {e}")
|
|
349
|
-
self.processing_context[
|
|
389
|
+
self.processing_context["file_hash"] = None
|
|
350
390
|
|
|
351
391
|
def _create_or_retrieve_pdf_instance(self):
|
|
352
392
|
"""Create new or retrieve existing PDF instance."""
|
|
353
|
-
file_path = self.processing_context[
|
|
354
|
-
center_name = self.processing_context[
|
|
355
|
-
delete_source = self.processing_context[
|
|
356
|
-
retry = self.processing_context[
|
|
357
|
-
file_hash = self.processing_context[
|
|
358
|
-
|
|
393
|
+
file_path = self.processing_context["file_path"]
|
|
394
|
+
center_name = self.processing_context["center_name"]
|
|
395
|
+
delete_source = self.processing_context["delete_source"]
|
|
396
|
+
retry = self.processing_context["retry"]
|
|
397
|
+
file_hash = self.processing_context["file_hash"]
|
|
398
|
+
|
|
359
399
|
if not retry:
|
|
360
400
|
# Check for existing PDF and handle duplicates
|
|
361
401
|
with self._file_lock(file_path):
|
|
@@ -373,11 +413,11 @@ class PdfImportService:
|
|
|
373
413
|
# Retry processing
|
|
374
414
|
logger.info(f"Reprocessing existing PDF {existing.pdf_hash}")
|
|
375
415
|
return self._retry_existing_pdf(existing)
|
|
376
|
-
|
|
416
|
+
|
|
377
417
|
# Create new PDF instance
|
|
378
418
|
logger.info("Creating new RawPdfFile instance...")
|
|
379
419
|
from django.db import IntegrityError
|
|
380
|
-
|
|
420
|
+
|
|
381
421
|
try:
|
|
382
422
|
if not retry:
|
|
383
423
|
self.current_pdf = RawPdfFile.create_from_file_initialized(
|
|
@@ -389,17 +429,17 @@ class PdfImportService:
|
|
|
389
429
|
# Retrieve existing for retry
|
|
390
430
|
self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
|
|
391
431
|
logger.info(f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}")
|
|
392
|
-
|
|
432
|
+
|
|
393
433
|
# Check if retry is actually needed
|
|
394
434
|
if self.current_pdf.text:
|
|
395
435
|
logger.info(f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning")
|
|
396
436
|
return
|
|
397
|
-
|
|
437
|
+
|
|
398
438
|
if not self.current_pdf:
|
|
399
439
|
raise RuntimeError("Failed to create RawPdfFile instance")
|
|
400
|
-
|
|
440
|
+
|
|
401
441
|
logger.info(f"PDF instance ready: {self.current_pdf.pdf_hash}")
|
|
402
|
-
|
|
442
|
+
|
|
403
443
|
except IntegrityError:
|
|
404
444
|
# Race condition - another worker created it
|
|
405
445
|
if file_hash:
|
|
@@ -410,27 +450,27 @@ class PdfImportService:
|
|
|
410
450
|
|
|
411
451
|
def _setup_processing_environment(self):
|
|
412
452
|
"""Setup processing environment and state."""
|
|
413
|
-
original_path = self.processing_context.get(
|
|
453
|
+
original_path = self.processing_context.get("file_path")
|
|
414
454
|
|
|
415
455
|
# Create sensitive file copy
|
|
416
456
|
self.create_sensitive_file(self.current_pdf, original_path)
|
|
417
|
-
|
|
457
|
+
|
|
418
458
|
# Update file path to point to sensitive copy
|
|
419
|
-
self.processing_context[
|
|
420
|
-
self.processing_context[
|
|
459
|
+
self.processing_context["file_path"] = self.current_pdf.file.path
|
|
460
|
+
self.processing_context["sensitive_copy_created"] = True
|
|
421
461
|
try:
|
|
422
|
-
self.processing_context[
|
|
462
|
+
self.processing_context["sensitive_file_path"] = Path(self.current_pdf.file.path)
|
|
423
463
|
except Exception:
|
|
424
|
-
self.processing_context[
|
|
425
|
-
|
|
464
|
+
self.processing_context["sensitive_file_path"] = None
|
|
465
|
+
|
|
426
466
|
# Ensure state exists
|
|
427
467
|
state = self.current_pdf.get_or_create_state()
|
|
428
468
|
state.mark_processing_started()
|
|
429
|
-
self.processing_context[
|
|
430
|
-
|
|
469
|
+
self.processing_context["processing_started"] = True
|
|
470
|
+
|
|
431
471
|
# Mark as processed to prevent duplicates
|
|
432
|
-
self.processed_files.add(str(self.processing_context[
|
|
433
|
-
|
|
472
|
+
self.processed_files.add(str(self.processing_context["file_path"]))
|
|
473
|
+
|
|
434
474
|
# Ensure default patient data
|
|
435
475
|
logger.info("Ensuring default patient data...")
|
|
436
476
|
self._ensure_default_patient_data(self.current_pdf)
|
|
@@ -438,83 +478,138 @@ class PdfImportService:
|
|
|
438
478
|
def _process_text_and_metadata(self):
|
|
439
479
|
"""Process text extraction and metadata using ReportReader."""
|
|
440
480
|
report_reading_available, ReportReader = self._ensure_report_reading_available()
|
|
441
|
-
|
|
481
|
+
|
|
442
482
|
if not report_reading_available:
|
|
443
483
|
logger.warning("Report reading not available (lx_anonymizer not found)")
|
|
444
484
|
self._mark_processing_incomplete("no_report_reader")
|
|
445
485
|
return
|
|
446
|
-
|
|
486
|
+
|
|
447
487
|
if not self.current_pdf.file:
|
|
448
488
|
logger.warning("No file available for text processing")
|
|
449
489
|
self._mark_processing_incomplete("no_file")
|
|
450
490
|
return
|
|
451
|
-
|
|
491
|
+
|
|
452
492
|
try:
|
|
453
|
-
logger.info("Starting text extraction and metadata processing with ReportReader...")
|
|
454
|
-
|
|
455
|
-
# Setup output directories
|
|
456
|
-
crops_dir = path_utils.PDF_DIR / 'cropped_regions'
|
|
457
|
-
anonymized_dir = path_utils.PDF_DIR / 'anonymized'
|
|
458
|
-
crops_dir.mkdir(parents=True, exist_ok=True)
|
|
459
|
-
anonymized_dir.mkdir(parents=True, exist_ok=True)
|
|
493
|
+
logger.info(f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})...")
|
|
460
494
|
|
|
461
495
|
# Initialize ReportReader
|
|
462
|
-
report_reader = ReportReader(
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
496
|
+
report_reader = ReportReader(report_root_path=str(path_utils.STORAGE_DIR), locale="de_DE", text_date_format="%d.%m.%Y")
|
|
497
|
+
|
|
498
|
+
if self.processing_mode == "cropping":
|
|
499
|
+
# Use advanced cropping method (existing implementation)
|
|
500
|
+
self._process_with_cropping(report_reader)
|
|
501
|
+
else: # blackening mode
|
|
502
|
+
# Use enhanced process_report with PDF masking
|
|
503
|
+
self._process_with_blackening(report_reader)
|
|
467
504
|
|
|
468
|
-
# Process with cropping
|
|
469
|
-
original_text, anonymized_text, extracted_metadata, cropped_regions, anonymized_pdf_path = report_reader.process_report_with_cropping(
|
|
470
|
-
pdf_path=self.processing_context['file_path'],
|
|
471
|
-
crop_sensitive_regions=True,
|
|
472
|
-
crop_output_dir=str(crops_dir),
|
|
473
|
-
anonymization_output_dir=str(anonymized_dir)
|
|
474
|
-
)
|
|
475
|
-
|
|
476
|
-
# Store results in context
|
|
477
|
-
self.processing_context.update({
|
|
478
|
-
'original_text': original_text,
|
|
479
|
-
'anonymized_text': anonymized_text,
|
|
480
|
-
'extracted_metadata': extracted_metadata,
|
|
481
|
-
'cropped_regions': cropped_regions,
|
|
482
|
-
'anonymized_pdf_path': anonymized_pdf_path
|
|
483
|
-
})
|
|
484
|
-
|
|
485
|
-
if original_text:
|
|
486
|
-
self._apply_text_results()
|
|
487
|
-
self.processing_context['text_extracted'] = True
|
|
488
|
-
|
|
489
|
-
if extracted_metadata:
|
|
490
|
-
self._apply_metadata_results()
|
|
491
|
-
self.processing_context['metadata_processed'] = True
|
|
492
|
-
|
|
493
|
-
if anonymized_pdf_path:
|
|
494
|
-
self._apply_anonymized_pdf()
|
|
495
|
-
self.processing_context['anonymization_completed'] = True
|
|
496
|
-
|
|
497
505
|
except Exception as e:
|
|
498
506
|
logger.warning(f"Text processing failed: {e}")
|
|
499
507
|
self._mark_processing_incomplete("text_processing_failed")
|
|
500
508
|
|
|
509
|
+
def _process_with_blackening(self, report_reader):
|
|
510
|
+
"""Process PDF using simple blackening/masking mode."""
|
|
511
|
+
logger.info("Using simple PDF blackening mode...")
|
|
512
|
+
|
|
513
|
+
# Setup anonymized directory
|
|
514
|
+
anonymized_dir = path_utils.PDF_DIR / "anonymized"
|
|
515
|
+
anonymized_dir.mkdir(parents=True, exist_ok=True)
|
|
516
|
+
|
|
517
|
+
# Generate output path for anonymized PDF
|
|
518
|
+
pdf_hash = self.current_pdf.pdf_hash
|
|
519
|
+
anonymized_output_path = anonymized_dir / f"{pdf_hash}_anonymized.pdf"
|
|
520
|
+
|
|
521
|
+
# Process with enhanced process_report method (returns 4-tuple now)
|
|
522
|
+
original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = report_reader.process_report(
|
|
523
|
+
pdf_path=self.processing_context["file_path"], create_anonymized_pdf=True, anonymized_pdf_output_path=str(anonymized_output_path)
|
|
524
|
+
)
|
|
525
|
+
|
|
526
|
+
# Store results in context
|
|
527
|
+
self.processing_context.update(
|
|
528
|
+
{
|
|
529
|
+
"original_text": original_text,
|
|
530
|
+
"anonymized_text": anonymized_text,
|
|
531
|
+
"extracted_metadata": extracted_metadata,
|
|
532
|
+
"cropped_regions": None, # Not available in blackening mode
|
|
533
|
+
"anonymized_pdf_path": anonymized_pdf_path,
|
|
534
|
+
}
|
|
535
|
+
)
|
|
536
|
+
|
|
537
|
+
# Apply results
|
|
538
|
+
if original_text:
|
|
539
|
+
self._apply_text_results()
|
|
540
|
+
self.processing_context["text_extracted"] = True
|
|
541
|
+
|
|
542
|
+
if extracted_metadata:
|
|
543
|
+
self._apply_metadata_results()
|
|
544
|
+
self.processing_context["metadata_processed"] = True
|
|
545
|
+
|
|
546
|
+
if anonymized_pdf_path:
|
|
547
|
+
self._apply_anonymized_pdf()
|
|
548
|
+
self.processing_context["anonymization_completed"] = True
|
|
549
|
+
|
|
550
|
+
logger.info("PDF blackening processing completed")
|
|
551
|
+
|
|
552
|
+
def _process_with_cropping(self, report_reader):
|
|
553
|
+
"""Process PDF using advanced cropping mode (existing implementation)."""
|
|
554
|
+
logger.info("Using advanced cropping mode...")
|
|
555
|
+
|
|
556
|
+
# Setup output directories
|
|
557
|
+
crops_dir = path_utils.PDF_DIR / "cropped_regions"
|
|
558
|
+
anonymized_dir = path_utils.PDF_DIR / "anonymized"
|
|
559
|
+
crops_dir.mkdir(parents=True, exist_ok=True)
|
|
560
|
+
anonymized_dir.mkdir(parents=True, exist_ok=True)
|
|
561
|
+
|
|
562
|
+
# Process with cropping (returns 5-tuple)
|
|
563
|
+
original_text, anonymized_text, extracted_metadata, cropped_regions, anonymized_pdf_path = report_reader.process_report_with_cropping(
|
|
564
|
+
pdf_path=self.processing_context["file_path"],
|
|
565
|
+
crop_sensitive_regions=True,
|
|
566
|
+
crop_output_dir=str(crops_dir),
|
|
567
|
+
anonymization_output_dir=str(anonymized_dir),
|
|
568
|
+
)
|
|
569
|
+
|
|
570
|
+
# Store results in context
|
|
571
|
+
self.processing_context.update(
|
|
572
|
+
{
|
|
573
|
+
"original_text": original_text,
|
|
574
|
+
"anonymized_text": anonymized_text,
|
|
575
|
+
"extracted_metadata": extracted_metadata,
|
|
576
|
+
"cropped_regions": cropped_regions,
|
|
577
|
+
"anonymized_pdf_path": anonymized_pdf_path,
|
|
578
|
+
}
|
|
579
|
+
)
|
|
580
|
+
|
|
581
|
+
# Apply results
|
|
582
|
+
if original_text:
|
|
583
|
+
self._apply_text_results()
|
|
584
|
+
self.processing_context["text_extracted"] = True
|
|
585
|
+
|
|
586
|
+
if extracted_metadata:
|
|
587
|
+
self._apply_metadata_results()
|
|
588
|
+
self.processing_context["metadata_processed"] = True
|
|
589
|
+
|
|
590
|
+
if anonymized_pdf_path:
|
|
591
|
+
self._apply_anonymized_pdf()
|
|
592
|
+
self.processing_context["anonymization_completed"] = True
|
|
593
|
+
|
|
594
|
+
logger.info("PDF cropping processing completed")
|
|
595
|
+
|
|
501
596
|
def _apply_text_results(self):
|
|
502
597
|
"""Apply text extraction results to the PDF instance."""
|
|
503
598
|
if not self.current_pdf:
|
|
504
599
|
logger.warning("Cannot apply text results - no PDF instance available")
|
|
505
600
|
return
|
|
506
|
-
|
|
507
|
-
original_text = self.processing_context.get(
|
|
508
|
-
anonymized_text = self.processing_context.get(
|
|
509
|
-
|
|
601
|
+
|
|
602
|
+
original_text = self.processing_context.get("original_text")
|
|
603
|
+
anonymized_text = self.processing_context.get("anonymized_text")
|
|
604
|
+
|
|
510
605
|
if not original_text:
|
|
511
606
|
logger.warning("No original text available to apply")
|
|
512
607
|
return
|
|
513
|
-
|
|
608
|
+
|
|
514
609
|
# Store extracted text
|
|
515
610
|
self.current_pdf.text = original_text
|
|
516
611
|
logger.info(f"Extracted {len(original_text)} characters of text from PDF")
|
|
517
|
-
|
|
612
|
+
|
|
518
613
|
# Handle anonymized text
|
|
519
614
|
if anonymized_text and anonymized_text != original_text:
|
|
520
615
|
self.current_pdf.anonymized = True
|
|
@@ -525,56 +620,52 @@ class PdfImportService:
|
|
|
525
620
|
if not self.current_pdf:
|
|
526
621
|
logger.warning("Cannot apply metadata results - no PDF instance available")
|
|
527
622
|
return
|
|
528
|
-
|
|
529
|
-
extracted_metadata = self.processing_context.get(
|
|
530
|
-
|
|
623
|
+
|
|
624
|
+
extracted_metadata = self.processing_context.get("extracted_metadata")
|
|
625
|
+
|
|
531
626
|
if not self.current_pdf.sensitive_meta or not extracted_metadata:
|
|
532
627
|
logger.debug("No sensitive meta or extracted metadata available")
|
|
533
628
|
return
|
|
534
|
-
|
|
629
|
+
|
|
535
630
|
sm = self.current_pdf.sensitive_meta
|
|
536
|
-
|
|
631
|
+
|
|
537
632
|
# Map ReportReader metadata to SensitiveMeta fields
|
|
538
633
|
metadata_mapping = {
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
634
|
+
"patient_first_name": "patient_first_name",
|
|
635
|
+
"patient_last_name": "patient_last_name",
|
|
636
|
+
"patient_dob": "patient_dob",
|
|
637
|
+
"examination_date": "examination_date",
|
|
638
|
+
"examiner_first_name": "examiner_first_name",
|
|
639
|
+
"examiner_last_name": "examiner_last_name",
|
|
640
|
+
"endoscope_type": "endoscope_type",
|
|
641
|
+
"casenumber": "case_number",
|
|
547
642
|
}
|
|
548
|
-
|
|
643
|
+
|
|
549
644
|
# Update fields with extracted information
|
|
550
645
|
updated_fields = []
|
|
551
646
|
for meta_key, sm_field in metadata_mapping.items():
|
|
552
647
|
if extracted_metadata.get(meta_key) and hasattr(sm, sm_field):
|
|
553
648
|
old_value = getattr(sm, sm_field)
|
|
554
649
|
raw_value = extracted_metadata[meta_key]
|
|
555
|
-
|
|
650
|
+
|
|
556
651
|
# Skip if we just got the field name as a string (indicates no actual data)
|
|
557
652
|
if isinstance(raw_value, str) and raw_value == meta_key:
|
|
558
653
|
continue
|
|
559
|
-
|
|
654
|
+
|
|
560
655
|
# Handle date fields specially
|
|
561
|
-
if sm_field in [
|
|
656
|
+
if sm_field in ["patient_dob", "examination_date"]:
|
|
562
657
|
new_value = self._parse_date_field(raw_value, meta_key, sm_field)
|
|
563
658
|
if new_value is None:
|
|
564
659
|
continue
|
|
565
660
|
else:
|
|
566
661
|
new_value = raw_value
|
|
567
|
-
|
|
662
|
+
|
|
568
663
|
# Configurable overwrite policy
|
|
569
|
-
should_overwrite =
|
|
570
|
-
self.allow_meta_overwrite
|
|
571
|
-
or not old_value
|
|
572
|
-
or old_value in ['Patient', 'Unknown']
|
|
573
|
-
)
|
|
664
|
+
should_overwrite = self.allow_meta_overwrite or not old_value or old_value in ["Patient", "Unknown"]
|
|
574
665
|
if new_value and should_overwrite:
|
|
575
666
|
setattr(sm, sm_field, new_value)
|
|
576
667
|
updated_fields.append(sm_field)
|
|
577
|
-
|
|
668
|
+
|
|
578
669
|
if updated_fields:
|
|
579
670
|
sm.save()
|
|
580
671
|
logger.info(f"Updated SensitiveMeta fields: {updated_fields}")
|
|
@@ -585,28 +676,25 @@ class PdfImportService:
|
|
|
585
676
|
if isinstance(raw_value, str):
|
|
586
677
|
# Skip if the value is just the field name itself
|
|
587
678
|
if raw_value == meta_key:
|
|
588
|
-
logger.warning(
|
|
589
|
-
"Skipping date field %s - got field name '%s' instead of actual date",
|
|
590
|
-
sm_field, raw_value
|
|
591
|
-
)
|
|
679
|
+
logger.warning("Skipping date field %s - got field name '%s' instead of actual date", sm_field, raw_value)
|
|
592
680
|
return None
|
|
593
|
-
|
|
681
|
+
|
|
594
682
|
# Try common date formats
|
|
595
|
-
date_formats = [
|
|
683
|
+
date_formats = ["%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y"]
|
|
596
684
|
for fmt in date_formats:
|
|
597
685
|
try:
|
|
598
686
|
return datetime.strptime(raw_value, fmt).date()
|
|
599
687
|
except ValueError:
|
|
600
688
|
continue
|
|
601
|
-
|
|
689
|
+
|
|
602
690
|
logger.warning("Could not parse date '%s' for field %s", raw_value, sm_field)
|
|
603
691
|
return None
|
|
604
|
-
|
|
605
|
-
elif hasattr(raw_value,
|
|
692
|
+
|
|
693
|
+
elif hasattr(raw_value, "date"):
|
|
606
694
|
return raw_value.date()
|
|
607
695
|
else:
|
|
608
696
|
return raw_value
|
|
609
|
-
|
|
697
|
+
|
|
610
698
|
except (ValueError, AttributeError) as e:
|
|
611
699
|
logger.warning("Date parsing failed for %s: %s", sm_field, e)
|
|
612
700
|
return None
|
|
@@ -626,7 +714,7 @@ class PdfImportService:
|
|
|
626
714
|
logger.warning("Cannot apply anonymized PDF - no PDF instance available")
|
|
627
715
|
return
|
|
628
716
|
|
|
629
|
-
anonymized_pdf_path = self.processing_context.get(
|
|
717
|
+
anonymized_pdf_path = self.processing_context.get("anonymized_pdf_path")
|
|
630
718
|
if not anonymized_pdf_path:
|
|
631
719
|
logger.debug("No anonymized_pdf_path present in processing context")
|
|
632
720
|
return
|
|
@@ -647,7 +735,7 @@ class PdfImportService:
|
|
|
647
735
|
relative_name = str(anonymized_path)
|
|
648
736
|
|
|
649
737
|
# Only update if something actually changed
|
|
650
|
-
if getattr(self.current_pdf.anonymized_file,
|
|
738
|
+
if getattr(self.current_pdf.anonymized_file, "name", None) != relative_name:
|
|
651
739
|
self.current_pdf.anonymized_file.name = relative_name
|
|
652
740
|
|
|
653
741
|
# Ensure model/state reflect anonymization even if text didn't differ
|
|
@@ -656,14 +744,14 @@ class PdfImportService:
|
|
|
656
744
|
|
|
657
745
|
# Persist cropped regions info somewhere useful (optional & non-breaking)
|
|
658
746
|
# If your model has a field for this, persist there; otherwise we just log.
|
|
659
|
-
cropped_regions = self.processing_context.get(
|
|
747
|
+
cropped_regions = self.processing_context.get("cropped_regions")
|
|
660
748
|
if cropped_regions:
|
|
661
749
|
logger.debug("Cropped regions recorded (%d regions).", len(cropped_regions))
|
|
662
750
|
|
|
663
751
|
# Save model changes
|
|
664
|
-
update_fields = [
|
|
665
|
-
if
|
|
666
|
-
update_fields.append(
|
|
752
|
+
update_fields = ["anonymized_file"]
|
|
753
|
+
if "anonymized" in self.current_pdf.__dict__:
|
|
754
|
+
update_fields.append("anonymized")
|
|
667
755
|
self.current_pdf.save(update_fields=update_fields)
|
|
668
756
|
|
|
669
757
|
# Mark state as anonymized immediately; this keeps downstream flows working
|
|
@@ -676,26 +764,25 @@ class PdfImportService:
|
|
|
676
764
|
except Exception as e:
|
|
677
765
|
logger.warning("Could not set anonymized file reference: %s", e)
|
|
678
766
|
|
|
679
|
-
|
|
680
767
|
def _finalize_processing(self):
|
|
681
768
|
"""Finalize processing and update state."""
|
|
682
769
|
if not self.current_pdf:
|
|
683
770
|
logger.warning("Cannot finalize processing - no PDF instance available")
|
|
684
771
|
return
|
|
685
|
-
|
|
772
|
+
|
|
686
773
|
try:
|
|
687
774
|
# Update state based on processing results
|
|
688
775
|
state = self._ensure_state(self.current_pdf)
|
|
689
|
-
|
|
690
|
-
if self.processing_context.get(
|
|
776
|
+
|
|
777
|
+
if self.processing_context.get("text_extracted") and state:
|
|
691
778
|
state.mark_anonymized()
|
|
692
|
-
|
|
779
|
+
|
|
693
780
|
# Save all changes
|
|
694
781
|
with transaction.atomic():
|
|
695
782
|
self.current_pdf.save()
|
|
696
783
|
if state:
|
|
697
784
|
state.save()
|
|
698
|
-
|
|
785
|
+
|
|
699
786
|
logger.info("PDF processing completed successfully")
|
|
700
787
|
except Exception as e:
|
|
701
788
|
logger.warning(f"Failed to finalize processing: {e}")
|
|
@@ -705,7 +792,7 @@ class PdfImportService:
|
|
|
705
792
|
if not self.current_pdf:
|
|
706
793
|
logger.warning(f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}")
|
|
707
794
|
return
|
|
708
|
-
|
|
795
|
+
|
|
709
796
|
try:
|
|
710
797
|
state = self._ensure_state(self.current_pdf)
|
|
711
798
|
if state:
|
|
@@ -714,7 +801,7 @@ class PdfImportService:
|
|
|
714
801
|
state.sensitive_meta_processed = False
|
|
715
802
|
state.save()
|
|
716
803
|
logger.info(f"Set PDF state: processed=False due to {reason}")
|
|
717
|
-
|
|
804
|
+
|
|
718
805
|
# Save changes
|
|
719
806
|
with transaction.atomic():
|
|
720
807
|
self.current_pdf.save()
|
|
@@ -729,12 +816,12 @@ class PdfImportService:
|
|
|
729
816
|
if file_path_str and file_path_str in self.processed_files:
|
|
730
817
|
self.processed_files.remove(file_path_str)
|
|
731
818
|
logger.debug(f"Removed {file_path_str} from processed files for retry")
|
|
732
|
-
|
|
819
|
+
|
|
733
820
|
return self.import_and_anonymize(
|
|
734
821
|
file_path=existing_pdf.file.path,
|
|
735
822
|
center_name=existing_pdf.center.name if existing_pdf.center else "unknown_center",
|
|
736
823
|
delete_source=False,
|
|
737
|
-
retry=True
|
|
824
|
+
retry=True,
|
|
738
825
|
)
|
|
739
826
|
except Exception as e:
|
|
740
827
|
logger.error(f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}")
|
|
@@ -744,9 +831,9 @@ class PdfImportService:
|
|
|
744
831
|
def _cleanup_on_error(self):
|
|
745
832
|
"""Cleanup processing context on error."""
|
|
746
833
|
try:
|
|
747
|
-
if self.current_pdf and hasattr(self.current_pdf,
|
|
834
|
+
if self.current_pdf and hasattr(self.current_pdf, "state"):
|
|
748
835
|
state = self._ensure_state(self.current_pdf)
|
|
749
|
-
if state and self.processing_context.get(
|
|
836
|
+
if state and self.processing_context.get("processing_started"):
|
|
750
837
|
state.text_meta_extracted = False
|
|
751
838
|
state.pdf_meta_extracted = False
|
|
752
839
|
state.sensitive_meta_processed = False
|
|
@@ -756,7 +843,7 @@ class PdfImportService:
|
|
|
756
843
|
logger.warning(f"Error during cleanup: {e}")
|
|
757
844
|
finally:
|
|
758
845
|
# Remove any sensitive copy created during this processing run
|
|
759
|
-
sensitive_created = self.processing_context.get(
|
|
846
|
+
sensitive_created = self.processing_context.get("sensitive_copy_created")
|
|
760
847
|
if sensitive_created:
|
|
761
848
|
pdf_obj = self.current_pdf
|
|
762
849
|
try:
|
|
@@ -770,20 +857,16 @@ class PdfImportService:
|
|
|
770
857
|
logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
|
|
771
858
|
|
|
772
859
|
# Always clean up processed files set to prevent blocks
|
|
773
|
-
file_path = self.processing_context.get(
|
|
860
|
+
file_path = self.processing_context.get("file_path")
|
|
774
861
|
if file_path and str(file_path) in self.processed_files:
|
|
775
862
|
self.processed_files.remove(str(file_path))
|
|
776
863
|
logger.debug(f"Removed {file_path} from processed files during error cleanup")
|
|
777
864
|
|
|
778
865
|
try:
|
|
779
|
-
original_path = self.processing_context.get(
|
|
866
|
+
original_path = self.processing_context.get("original_file_path")
|
|
780
867
|
logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
|
|
781
868
|
raw_dir = original_path.parent if isinstance(original_path, Path) else None
|
|
782
|
-
if (
|
|
783
|
-
isinstance(original_path, Path)
|
|
784
|
-
and original_path.exists()
|
|
785
|
-
and not self.processing_context.get('sensitive_copy_created')
|
|
786
|
-
):
|
|
869
|
+
if isinstance(original_path, Path) and original_path.exists() and not self.processing_context.get("sensitive_copy_created"):
|
|
787
870
|
try:
|
|
788
871
|
original_path.unlink()
|
|
789
872
|
logger.info("Removed original file %s during error cleanup", original_path)
|
|
@@ -822,7 +905,7 @@ class PdfImportService:
|
|
|
822
905
|
raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
|
|
823
906
|
pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
|
|
824
907
|
|
|
825
|
-
sensitive_path = self.processing_context.get(
|
|
908
|
+
sensitive_path = self.processing_context.get("sensitive_file_path")
|
|
826
909
|
if sensitive_path:
|
|
827
910
|
sensitive_parent = Path(sensitive_path).parent
|
|
828
911
|
sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
|
|
@@ -843,17 +926,17 @@ class PdfImportService:
|
|
|
843
926
|
"""Cleanup processing context."""
|
|
844
927
|
try:
|
|
845
928
|
# Clean up temporary directories
|
|
846
|
-
if self.processing_context.get(
|
|
847
|
-
crops_dir = path_utils.PDF_DIR /
|
|
929
|
+
if self.processing_context.get("text_extracted"):
|
|
930
|
+
crops_dir = path_utils.PDF_DIR / "cropped_regions"
|
|
848
931
|
if crops_dir.exists() and not any(crops_dir.iterdir()):
|
|
849
932
|
crops_dir.rmdir()
|
|
850
|
-
|
|
933
|
+
|
|
851
934
|
# Always remove from processed files set after processing attempt
|
|
852
|
-
file_path = self.processing_context.get(
|
|
935
|
+
file_path = self.processing_context.get("file_path")
|
|
853
936
|
if file_path and str(file_path) in self.processed_files:
|
|
854
937
|
self.processed_files.remove(str(file_path))
|
|
855
938
|
logger.debug(f"Removed {file_path} from processed files set")
|
|
856
|
-
|
|
939
|
+
|
|
857
940
|
except Exception as e:
|
|
858
941
|
logger.warning(f"Error during context cleanup: {e}")
|
|
859
942
|
finally:
|
|
@@ -861,45 +944,40 @@ class PdfImportService:
|
|
|
861
944
|
self.current_pdf = None
|
|
862
945
|
self.processing_context = {}
|
|
863
946
|
|
|
864
|
-
def import_simple(
|
|
865
|
-
self,
|
|
866
|
-
file_path: Union[Path, str],
|
|
867
|
-
center_name: str,
|
|
868
|
-
delete_source: bool = False
|
|
869
|
-
) -> "RawPdfFile":
|
|
947
|
+
def import_simple(self, file_path: Union[Path, str], center_name: str, delete_source: bool = False) -> "RawPdfFile":
|
|
870
948
|
"""
|
|
871
949
|
Simple PDF import without text processing or anonymization.
|
|
872
950
|
Uses centralized PDF instance management pattern.
|
|
873
|
-
|
|
951
|
+
|
|
874
952
|
Args:
|
|
875
953
|
file_path: Path to the PDF file to import
|
|
876
954
|
center_name: Name of the center to associate with PDF
|
|
877
955
|
delete_source: Whether to delete the source file after import
|
|
878
|
-
|
|
956
|
+
|
|
879
957
|
Returns:
|
|
880
958
|
RawPdfFile instance after basic import
|
|
881
959
|
"""
|
|
882
960
|
try:
|
|
883
961
|
# Initialize simple processing context
|
|
884
962
|
self._initialize_processing_context(file_path, center_name, delete_source, False)
|
|
885
|
-
|
|
963
|
+
|
|
886
964
|
# Validate file
|
|
887
965
|
self._validate_and_prepare_file()
|
|
888
|
-
|
|
966
|
+
|
|
889
967
|
# Create PDF instance
|
|
890
968
|
logger.info("Starting simple import - creating RawPdfFile instance...")
|
|
891
969
|
self.current_pdf = RawPdfFile.create_from_file_initialized(
|
|
892
|
-
file_path=self.processing_context[
|
|
970
|
+
file_path=self.processing_context["file_path"],
|
|
893
971
|
center_name=center_name,
|
|
894
972
|
delete_source=delete_source,
|
|
895
973
|
)
|
|
896
|
-
|
|
974
|
+
|
|
897
975
|
if not self.current_pdf:
|
|
898
976
|
raise RuntimeError("Failed to create RawPdfFile instance")
|
|
899
|
-
|
|
977
|
+
|
|
900
978
|
# Mark as processed
|
|
901
|
-
self.processed_files.add(str(self.processing_context[
|
|
902
|
-
|
|
979
|
+
self.processed_files.add(str(self.processing_context["file_path"]))
|
|
980
|
+
|
|
903
981
|
# Set basic state for simple import
|
|
904
982
|
state = self._ensure_state(self.current_pdf)
|
|
905
983
|
if state:
|
|
@@ -908,45 +986,46 @@ class PdfImportService:
|
|
|
908
986
|
state.sensitive_meta_processed = False
|
|
909
987
|
state.save()
|
|
910
988
|
logger.info("Set PDF state: processed=False for simple import")
|
|
911
|
-
|
|
989
|
+
|
|
912
990
|
# Save changes
|
|
913
991
|
with transaction.atomic():
|
|
914
992
|
self.current_pdf.save()
|
|
915
|
-
|
|
993
|
+
|
|
916
994
|
logger.info("Simple import completed for RawPdfFile hash: %s", self.current_pdf.pdf_hash)
|
|
917
995
|
return self.current_pdf
|
|
918
|
-
|
|
996
|
+
|
|
919
997
|
except Exception as e:
|
|
920
998
|
logger.error(f"Simple PDF import failed for {file_path}: {e}")
|
|
921
999
|
self._cleanup_on_error()
|
|
922
1000
|
raise
|
|
923
1001
|
finally:
|
|
924
1002
|
self._cleanup_processing_context()
|
|
925
|
-
|
|
1003
|
+
|
|
926
1004
|
def check_storage_capacity(self, file_path: Union[Path, str], storage_root, min_required_space) -> None:
|
|
927
1005
|
"""
|
|
928
1006
|
Check if there is sufficient storage capacity for the PDF file.
|
|
929
|
-
|
|
1007
|
+
|
|
930
1008
|
Args:
|
|
931
1009
|
file_path: Path to the PDF file to check
|
|
932
|
-
|
|
1010
|
+
|
|
933
1011
|
Raises:
|
|
934
1012
|
InsufficientStorageError: If there is not enough space
|
|
935
1013
|
"""
|
|
936
1014
|
import shutil
|
|
1015
|
+
|
|
937
1016
|
from endoreg_db.exceptions import InsufficientStorageError
|
|
938
|
-
|
|
1017
|
+
|
|
939
1018
|
file_path = Path(file_path)
|
|
940
1019
|
if not file_path.exists():
|
|
941
1020
|
raise FileNotFoundError(f"File not found for storage check: {file_path}")
|
|
942
|
-
|
|
1021
|
+
|
|
943
1022
|
# Get the size of the file
|
|
944
1023
|
file_size = file_path.stat().st_size
|
|
945
|
-
|
|
1024
|
+
|
|
946
1025
|
# Get available space in the storage directory
|
|
947
1026
|
|
|
948
1027
|
total, used, free = shutil.disk_usage(storage_root)
|
|
949
|
-
|
|
1028
|
+
|
|
950
1029
|
if file_size:
|
|
951
1030
|
min_required_space = file_size if isinstance(min_required_space, int) else 0
|
|
952
1031
|
|
|
@@ -954,9 +1033,9 @@ class PdfImportService:
|
|
|
954
1033
|
if file_size > free:
|
|
955
1034
|
raise InsufficientStorageError(f"Not enough space to store PDF file: {file_path}")
|
|
956
1035
|
logger.info(f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available")
|
|
957
|
-
|
|
1036
|
+
|
|
958
1037
|
return True
|
|
959
|
-
|
|
1038
|
+
|
|
960
1039
|
def create_sensitive_file(self, pdf_instance: "RawPdfFile" = None, file_path: Union[Path, str] = None) -> None:
|
|
961
1040
|
"""
|
|
962
1041
|
Create a copy of the PDF file in the sensitive directory and update the file reference.
|
|
@@ -966,7 +1045,7 @@ class PdfImportService:
|
|
|
966
1045
|
Ensures the FileField points to the file under STORAGE_DIR/pdfs/sensitive and never back to raw_pdfs.
|
|
967
1046
|
"""
|
|
968
1047
|
pdf_file = pdf_instance or self.current_pdf
|
|
969
|
-
source_path = Path(file_path) if file_path else self.processing_context.get(
|
|
1048
|
+
source_path = Path(file_path) if file_path else self.processing_context.get("file_path")
|
|
970
1049
|
|
|
971
1050
|
if not pdf_file:
|
|
972
1051
|
raise ValueError("No PDF instance available for creating sensitive file")
|
|
@@ -1002,9 +1081,9 @@ class PdfImportService:
|
|
|
1002
1081
|
relative_name = str(target)
|
|
1003
1082
|
|
|
1004
1083
|
# Only update when changed
|
|
1005
|
-
if getattr(pdf_file.file,
|
|
1084
|
+
if getattr(pdf_file.file, "name", None) != relative_name:
|
|
1006
1085
|
pdf_file.file.name = relative_name
|
|
1007
|
-
pdf_file.save(update_fields=[
|
|
1086
|
+
pdf_file.save(update_fields=["file"])
|
|
1008
1087
|
logger.info("Updated PDF FileField reference to sensitive path: %s", pdf_file.file.path)
|
|
1009
1088
|
else:
|
|
1010
1089
|
logger.debug("PDF FileField already points to sensitive path: %s", pdf_file.file.path)
|
|
@@ -1020,44 +1099,45 @@ class PdfImportService:
|
|
|
1020
1099
|
except Exception as e:
|
|
1021
1100
|
logger.warning(f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}", exc_info=True)
|
|
1022
1101
|
|
|
1023
|
-
def archive_or_quarantine_file(
|
|
1024
|
-
|
|
1102
|
+
def archive_or_quarantine_file(
|
|
1103
|
+
self, pdf_instance: "RawPdfFile" = None, source_file_path: Union[Path, str] = None, quarantine_reason: str = None, is_pdf_problematic: bool = None
|
|
1104
|
+
) -> bool:
|
|
1025
1105
|
"""
|
|
1026
1106
|
Archive or quarantine file based on the state of the PDF processing.
|
|
1027
1107
|
Uses the central PDF instance and processing context if parameters not provided.
|
|
1028
|
-
|
|
1108
|
+
|
|
1029
1109
|
Args:
|
|
1030
1110
|
pdf_instance: Optional PDF instance, defaults to self.current_pdf
|
|
1031
1111
|
source_file_path: Optional source file path, defaults to processing_context['file_path']
|
|
1032
1112
|
quarantine_reason: Optional quarantine reason, defaults to processing_context['error_reason']
|
|
1033
1113
|
is_pdf_problematic: Optional override for problematic state
|
|
1034
|
-
|
|
1114
|
+
|
|
1035
1115
|
Returns:
|
|
1036
1116
|
bool: True if file was quarantined, False if archived successfully
|
|
1037
1117
|
"""
|
|
1038
1118
|
pdf_file = pdf_instance or self.current_pdf
|
|
1039
|
-
file_path = Path(source_file_path) if source_file_path else self.processing_context.get(
|
|
1040
|
-
quarantine_reason = quarantine_reason or self.processing_context.get(
|
|
1041
|
-
|
|
1119
|
+
file_path = Path(source_file_path) if source_file_path else self.processing_context.get("file_path")
|
|
1120
|
+
quarantine_reason = quarantine_reason or self.processing_context.get("error_reason")
|
|
1121
|
+
|
|
1042
1122
|
if not pdf_file:
|
|
1043
1123
|
raise ValueError("No PDF instance available for archiving/quarantine")
|
|
1044
1124
|
if not file_path:
|
|
1045
1125
|
raise ValueError("No file path available for archiving/quarantine")
|
|
1046
|
-
|
|
1126
|
+
|
|
1047
1127
|
# Determine if the PDF is problematic
|
|
1048
1128
|
pdf_problematic = is_pdf_problematic if is_pdf_problematic is not None else pdf_file.is_problematic
|
|
1049
|
-
|
|
1129
|
+
|
|
1050
1130
|
if pdf_problematic:
|
|
1051
1131
|
# Quarantine the file
|
|
1052
1132
|
logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
|
|
1053
1133
|
quarantine_dir = path_utils.PDF_DIR / "quarantine"
|
|
1054
1134
|
os.makedirs(quarantine_dir, exist_ok=True)
|
|
1055
|
-
|
|
1135
|
+
|
|
1056
1136
|
quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
|
|
1057
1137
|
try:
|
|
1058
1138
|
shutil.move(file_path, quarantine_path)
|
|
1059
1139
|
pdf_file.quarantine_reason = quarantine_reason or "File processing failed"
|
|
1060
|
-
pdf_file.save(update_fields=[
|
|
1140
|
+
pdf_file.save(update_fields=["quarantine_reason"])
|
|
1061
1141
|
logger.info(f"Moved problematic PDF to quarantine: {quarantine_path}")
|
|
1062
1142
|
return True
|
|
1063
1143
|
except Exception as e:
|
|
@@ -1068,7 +1148,7 @@ class PdfImportService:
|
|
|
1068
1148
|
logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
|
|
1069
1149
|
archive_dir = path_utils.PDF_DIR / "processed"
|
|
1070
1150
|
os.makedirs(archive_dir, exist_ok=True)
|
|
1071
|
-
|
|
1151
|
+
|
|
1072
1152
|
archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
|
|
1073
1153
|
try:
|
|
1074
1154
|
shutil.move(file_path, archive_path)
|