endoreg-db 0.8.5.4__py3-none-any.whl → 0.8.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of endoreg-db might be problematic. Click here for more details.
- endoreg_db/models/media/pdf/raw_pdf.py +241 -97
- endoreg_db/models/media/video/video_file.py +23 -5
- endoreg_db/serializers/__init__.py +26 -55
- endoreg_db/serializers/video_examination.py +198 -0
- endoreg_db/urls/__init__.py +36 -23
- endoreg_db/views/pdf/reimport.py +110 -94
- endoreg_db/views/video/video_examination_viewset.py +202 -289
- {endoreg_db-0.8.5.4.dist-info → endoreg_db-0.8.5.5.dist-info}/METADATA +1 -1
- {endoreg_db-0.8.5.4.dist-info → endoreg_db-0.8.5.5.dist-info}/RECORD +11 -10
- {endoreg_db-0.8.5.4.dist-info → endoreg_db-0.8.5.5.dist-info}/WHEEL +0 -0
- {endoreg_db-0.8.5.4.dist-info → endoreg_db-0.8.5.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,36 +5,42 @@
|
|
|
5
5
|
# objects contain methods to extract text, extract metadata from text, and anonymize text from a PDF file using the agl_report_reader.ReportReader class
|
|
6
6
|
# ------------------------------------------------------------------------------
|
|
7
7
|
import os
|
|
8
|
-
from
|
|
8
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
9
|
+
|
|
9
10
|
from django.core.exceptions import ValidationError
|
|
10
|
-
from django.core.validators import FileExtensionValidator
|
|
11
11
|
from django.core.files import File
|
|
12
|
+
from django.core.validators import FileExtensionValidator
|
|
13
|
+
from django.db import models
|
|
12
14
|
from numpy import isin # Import Django File
|
|
15
|
+
|
|
13
16
|
from endoreg_db.utils.file_operations import get_uuid_filename
|
|
14
|
-
from
|
|
17
|
+
from endoreg_db.utils.hashs import get_pdf_hash
|
|
18
|
+
from endoreg_db.utils.paths import PDF_DIR, RAW_PDF_DIR
|
|
19
|
+
|
|
15
20
|
# Use the specific paths from the centralized paths module
|
|
16
21
|
from ...utils import PDF_DIR
|
|
17
|
-
from endoreg_db.utils.hashs import get_pdf_hash
|
|
18
22
|
|
|
19
23
|
if TYPE_CHECKING:
|
|
20
24
|
from endoreg_db.models.administration.person import (
|
|
21
|
-
Patient,
|
|
22
25
|
Examiner,
|
|
26
|
+
Patient,
|
|
23
27
|
)
|
|
24
|
-
|
|
25
|
-
from ...medical.patient import PatientExamination
|
|
28
|
+
|
|
26
29
|
from ...administration import Center
|
|
30
|
+
from ...medical.patient import PatientExamination
|
|
27
31
|
from ...metadata.pdf_meta import PdfType
|
|
28
32
|
from ...state import RawPdfState
|
|
29
|
-
from
|
|
33
|
+
from .report_file import AnonymExaminationReport
|
|
30
34
|
|
|
31
35
|
# setup logging to pdf_import.log
|
|
32
36
|
import logging
|
|
33
|
-
|
|
34
37
|
from pathlib import Path
|
|
35
38
|
|
|
39
|
+
from ...metadata import SensitiveMeta
|
|
40
|
+
|
|
36
41
|
logger = logging.getLogger("raw_pdf")
|
|
37
42
|
|
|
43
|
+
|
|
38
44
|
class RawPdfFile(models.Model):
|
|
39
45
|
# Fields from AbstractPdfFile
|
|
40
46
|
pdf_hash = models.CharField(max_length=255, unique=True)
|
|
@@ -49,38 +55,40 @@ class RawPdfFile(models.Model):
|
|
|
49
55
|
on_delete=models.SET_NULL,
|
|
50
56
|
blank=True,
|
|
51
57
|
null=True,
|
|
52
|
-
)
|
|
58
|
+
) # type: ignore
|
|
53
59
|
examination = models.ForeignKey(
|
|
54
60
|
"PatientExamination",
|
|
55
61
|
on_delete=models.SET_NULL,
|
|
56
62
|
blank=True,
|
|
57
63
|
null=True,
|
|
58
64
|
related_name="raw_pdf_files",
|
|
59
|
-
)
|
|
65
|
+
) # type: ignore
|
|
60
66
|
examiner = models.ForeignKey(
|
|
61
67
|
"Examiner",
|
|
62
68
|
on_delete=models.SET_NULL,
|
|
63
69
|
blank=True,
|
|
64
70
|
null=True,
|
|
65
|
-
)
|
|
71
|
+
) # type: ignore
|
|
66
72
|
text = models.TextField(blank=True, null=True)
|
|
67
73
|
date_created = models.DateTimeField(auto_now_add=True)
|
|
68
74
|
date_modified = models.DateTimeField(auto_now=True)
|
|
69
|
-
anonymized = models.BooleanField(
|
|
75
|
+
anonymized = models.BooleanField(
|
|
76
|
+
default=False, help_text="True if the PDF has been anonymized."
|
|
77
|
+
)
|
|
70
78
|
|
|
71
79
|
# Fields specific to RawPdfFile (keeping existing related_names)
|
|
72
80
|
file = models.FileField(
|
|
73
81
|
# Use the relative path from the specific PDF_DIR
|
|
74
82
|
upload_to=PDF_DIR.name,
|
|
75
83
|
validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
|
|
76
|
-
)
|
|
77
|
-
|
|
84
|
+
) # type: ignore
|
|
85
|
+
|
|
78
86
|
anonymized_file = models.FileField(
|
|
79
87
|
upload_to=PDF_DIR.name,
|
|
80
88
|
validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
|
|
81
89
|
null=True,
|
|
82
90
|
blank=True,
|
|
83
|
-
)
|
|
91
|
+
) # type: ignore
|
|
84
92
|
|
|
85
93
|
state = models.OneToOneField(
|
|
86
94
|
"RawPdfState",
|
|
@@ -88,33 +96,45 @@ class RawPdfFile(models.Model):
|
|
|
88
96
|
blank=True,
|
|
89
97
|
null=True,
|
|
90
98
|
related_name="raw_pdf_file",
|
|
91
|
-
)
|
|
92
|
-
|
|
99
|
+
) # type: ignore
|
|
100
|
+
|
|
93
101
|
objects = models.Manager()
|
|
94
|
-
|
|
102
|
+
|
|
103
|
+
@property
|
|
104
|
+
def uuid(self):
|
|
105
|
+
"""
|
|
106
|
+
Compatibility property - returns pdf_hash as UUID-like identifier.
|
|
107
|
+
|
|
108
|
+
Note: RawPdfFile uses pdf_hash instead of UUID for identification.
|
|
109
|
+
This property exists for API backward compatibility.
|
|
110
|
+
"""
|
|
111
|
+
return self.pdf_hash
|
|
112
|
+
|
|
95
113
|
@property
|
|
96
|
-
def file_path(self) -> Path|None:
|
|
114
|
+
def file_path(self) -> Path | None:
|
|
97
115
|
"""
|
|
98
116
|
Returns the file path of the stored PDF file if available; otherwise, returns None.
|
|
99
117
|
"""
|
|
118
|
+
from django.db.models.fields.files import FieldFile
|
|
119
|
+
|
|
100
120
|
# assert self.file has path attribute
|
|
101
|
-
assert isinstance(self.file,
|
|
121
|
+
assert isinstance(self.file, FieldFile)
|
|
102
122
|
if self.file and self.file.name:
|
|
103
123
|
try:
|
|
104
124
|
return Path(self.file.path)
|
|
105
125
|
except (ValueError, AttributeError, NotImplementedError):
|
|
106
126
|
return None
|
|
107
127
|
return None
|
|
108
|
-
|
|
128
|
+
|
|
109
129
|
def set_file_path(self, file_path: Path):
|
|
110
130
|
"""
|
|
111
131
|
Sets the file path of the stored PDF file.
|
|
112
132
|
"""
|
|
113
|
-
self.file = File(file_path)
|
|
114
|
-
self.save(update_fields=[
|
|
133
|
+
self.file = File(file_path) # type: ignore
|
|
134
|
+
self.save(update_fields=["file"])
|
|
115
135
|
|
|
116
136
|
@property
|
|
117
|
-
def anonymized_file_path(self) -> Path|None:
|
|
137
|
+
def anonymized_file_path(self) -> Path | None:
|
|
118
138
|
"""
|
|
119
139
|
Returns the file path of the anonymized PDF file if available; otherwise, returns None.
|
|
120
140
|
"""
|
|
@@ -124,13 +144,73 @@ class RawPdfFile(models.Model):
|
|
|
124
144
|
except (ValueError, AttributeError, NotImplementedError):
|
|
125
145
|
return None
|
|
126
146
|
return None
|
|
127
|
-
|
|
147
|
+
|
|
128
148
|
def set_anonymized_file_path(self, file_path: Path):
|
|
129
149
|
"""
|
|
130
150
|
Sets the file path of the anonymized PDF file.
|
|
131
151
|
"""
|
|
132
|
-
self.anonymized_file = File(file_path)
|
|
133
|
-
self.save(update_fields=[
|
|
152
|
+
self.anonymized_file = File(file_path) # type: ignore
|
|
153
|
+
self.save(update_fields=["anonymized_file"])
|
|
154
|
+
|
|
155
|
+
def get_raw_file_path(self) -> Optional[Path]:
|
|
156
|
+
"""
|
|
157
|
+
Get the path to the raw PDF file, searching common locations.
|
|
158
|
+
|
|
159
|
+
This method attempts to find the original raw PDF file by checking:
|
|
160
|
+
1. The file field, if it already points to an existing file
|
|
161
|
+
2. A direct hash-based name (<pdf_hash>.pdf) in each candidate directory
|
|
162
|
+
3. Scanning the candidate directories for a PDF whose hash matches
|
|
163
|
+
|
|
164
|
+
Returns:
|
|
165
|
+
Path to raw file if it exists, None otherwise
|
|
166
|
+
"""
|
|
167
|
+
from django.conf import settings
|
|
168
|
+
|
|
169
|
+
# Check if file field already points to a valid file
|
|
170
|
+
if self.file and self.file.name:
|
|
171
|
+
try:
|
|
172
|
+
file_path = Path(self.file.path)
|
|
173
|
+
if file_path.exists():
|
|
174
|
+
logger.debug(f"Found raw PDF via file field: {file_path}")
|
|
175
|
+
return file_path
|
|
176
|
+
except (ValueError, AttributeError, NotImplementedError):
|
|
177
|
+
pass
|
|
178
|
+
|
|
179
|
+
# Define potential raw directories
|
|
180
|
+
raw_dirs = [
|
|
181
|
+
PDF_DIR / "sensitive", # Files might be in sensitive dir
|
|
182
|
+
Path(settings.BASE_DIR) / "data" / "raw_pdfs",
|
|
183
|
+
Path(settings.BASE_DIR) / "data" / "pdfs" / "raw",
|
|
184
|
+
PDF_DIR, # General PDF directory
|
|
185
|
+
]
|
|
186
|
+
|
|
187
|
+
# Check direct hash-based name in each directory
|
|
188
|
+
for raw_dir in raw_dirs:
|
|
189
|
+
if not raw_dir.exists():
|
|
190
|
+
continue
|
|
191
|
+
|
|
192
|
+
hash_path = raw_dir / f"{self.pdf_hash}.pdf"
|
|
193
|
+
if hash_path.exists():
|
|
194
|
+
logger.debug(f"Found raw PDF at: {hash_path}")
|
|
195
|
+
return hash_path
|
|
196
|
+
|
|
197
|
+
# Scan directories for matching hash
|
|
198
|
+
for raw_dir in raw_dirs:
|
|
199
|
+
if not raw_dir.exists():
|
|
200
|
+
continue
|
|
201
|
+
|
|
202
|
+
for file_path in raw_dir.glob("*.pdf"):
|
|
203
|
+
try:
|
|
204
|
+
file_hash = get_pdf_hash(file_path)
|
|
205
|
+
if file_hash == self.pdf_hash:
|
|
206
|
+
logger.debug(f"Found matching PDF by hash: {file_path}")
|
|
207
|
+
return file_path
|
|
208
|
+
except Exception as e:
|
|
209
|
+
logger.debug(f"Error checking {file_path}: {e}")
|
|
210
|
+
continue
|
|
211
|
+
|
|
212
|
+
logger.warning(f"No raw file found for PDF hash: {self.pdf_hash}")
|
|
213
|
+
return None
|
|
134
214
|
|
|
135
215
|
@property
|
|
136
216
|
def file_url(self):
|
|
@@ -141,14 +221,18 @@ class RawPdfFile(models.Model):
|
|
|
141
221
|
return self.file.url if self.file and self.file.name else None
|
|
142
222
|
except (ValueError, AttributeError):
|
|
143
223
|
return None
|
|
144
|
-
|
|
224
|
+
|
|
145
225
|
@property
|
|
146
226
|
def anonymized_file_url(self):
|
|
147
227
|
"""
|
|
148
228
|
Returns the URL of the anonymized PDF file if available; otherwise, returns None.
|
|
149
229
|
"""
|
|
150
230
|
try:
|
|
151
|
-
return
|
|
231
|
+
return (
|
|
232
|
+
self.anonymized_file.url
|
|
233
|
+
if self.anonymized_file and self.anonymized_file.name
|
|
234
|
+
else None
|
|
235
|
+
)
|
|
152
236
|
except (ValueError, AttributeError):
|
|
153
237
|
return None
|
|
154
238
|
|
|
@@ -158,14 +242,14 @@ class RawPdfFile(models.Model):
|
|
|
158
242
|
blank=True,
|
|
159
243
|
null=True,
|
|
160
244
|
related_name="raw_pdf_files",
|
|
161
|
-
)
|
|
245
|
+
) # type: ignore
|
|
162
246
|
sensitive_meta = models.ForeignKey(
|
|
163
247
|
"SensitiveMeta",
|
|
164
248
|
on_delete=models.SET_NULL,
|
|
165
249
|
related_name="raw_pdf_files",
|
|
166
250
|
null=True,
|
|
167
251
|
blank=True,
|
|
168
|
-
)
|
|
252
|
+
) # type: ignore
|
|
169
253
|
state_report_processing_required = models.BooleanField(default=True)
|
|
170
254
|
state_report_processed = models.BooleanField(default=False)
|
|
171
255
|
raw_meta = models.JSONField(blank=True, null=True)
|
|
@@ -175,19 +259,21 @@ class RawPdfFile(models.Model):
|
|
|
175
259
|
blank=True,
|
|
176
260
|
null=True,
|
|
177
261
|
related_name="raw_pdf_file",
|
|
178
|
-
)
|
|
262
|
+
) # type: ignore
|
|
179
263
|
anonymized_text = models.TextField(blank=True, null=True)
|
|
180
264
|
|
|
181
265
|
# Type hinting is needed, improve and use correct django types
|
|
182
266
|
if TYPE_CHECKING:
|
|
183
|
-
file
|
|
184
|
-
anonymized_file
|
|
267
|
+
file: Optional[Union[models.FieldFile, models.FileField]]
|
|
268
|
+
anonymized_file: Optional[Union[models.FieldFile, models.FileField]]
|
|
185
269
|
pdf_type: Optional[models.ForeignKey]
|
|
186
270
|
examination: Optional[models.ForeignKey["PatientExamination"]]
|
|
187
271
|
examiner: Optional[models.ForeignKey["Examiner"]]
|
|
188
272
|
patient: Optional[models.ForeignKey["Patient"]]
|
|
189
273
|
center: Optional[models.ForeignKey["Center"]]
|
|
190
|
-
anonym_examination_report: Optional[
|
|
274
|
+
anonym_examination_report: Optional[
|
|
275
|
+
models.OneToOneField["AnonymExaminationReport"]
|
|
276
|
+
]
|
|
191
277
|
sensitive_meta: Optional[models.ForeignKey["SensitiveMeta"]]
|
|
192
278
|
state: Optional[models.ForeignKey["RawPdfState"]]
|
|
193
279
|
|
|
@@ -201,7 +287,7 @@ class RawPdfFile(models.Model):
|
|
|
201
287
|
def delete(self, *args, **kwargs):
|
|
202
288
|
"""
|
|
203
289
|
Deletes the RawPdfFile instance from the database and removes the associated file from storage if it exists.
|
|
204
|
-
|
|
290
|
+
|
|
205
291
|
This method ensures that the physical PDF file is deleted from the file system after the database record is removed. Logs warnings or errors if the file cannot be found or deleted.
|
|
206
292
|
"""
|
|
207
293
|
# Call the original delete method first to remove DB record
|
|
@@ -211,45 +297,54 @@ class RawPdfFile(models.Model):
|
|
|
211
297
|
os.remove(Path(self.file_path))
|
|
212
298
|
logger.info("Original file removed: %s", self.file)
|
|
213
299
|
except Exception as e:
|
|
214
|
-
logger.warning(
|
|
300
|
+
logger.warning(
|
|
301
|
+
f"Could not get file path for {self.file.name} before deletion: {e}"
|
|
302
|
+
)
|
|
215
303
|
if self.anonymized_file:
|
|
216
304
|
try:
|
|
217
305
|
if self.anonymized_file_path:
|
|
218
306
|
os.remove(Path(self.anonymized_file_path))
|
|
219
|
-
logger.info(
|
|
307
|
+
logger.info(
|
|
308
|
+
"Anonymized file removed: %s", self.anonymized_file.name
|
|
309
|
+
)
|
|
220
310
|
except OSError as e:
|
|
221
|
-
logger.error(
|
|
311
|
+
logger.error(
|
|
312
|
+
"Error removing anonymized file %s: %s",
|
|
313
|
+
self.anonymized_file.name,
|
|
314
|
+
e,
|
|
315
|
+
)
|
|
222
316
|
|
|
223
317
|
super().delete(*args, **kwargs)
|
|
224
318
|
|
|
225
|
-
|
|
226
|
-
|
|
319
|
+
def validate_metadata_annotation(
|
|
320
|
+
self, extracted_data_dict: Optional[dict] = None
|
|
321
|
+
) -> bool:
|
|
227
322
|
"""
|
|
228
323
|
Validate the metadata of the RawPdf instance.
|
|
229
|
-
|
|
324
|
+
|
|
230
325
|
Called after annotation in the frontend, this method deletes the associated active file, updates the sensitive meta data with the user annotated data.
|
|
231
326
|
It also ensures the RawPdfFile instance is saved after the metadata update.
|
|
232
327
|
"""
|
|
233
|
-
|
|
328
|
+
|
|
234
329
|
if not self.sensitive_meta:
|
|
235
330
|
logger.error("No sensitive meta data associated with this PDF file.")
|
|
236
331
|
return False
|
|
237
|
-
|
|
332
|
+
|
|
238
333
|
if not extracted_data_dict:
|
|
239
334
|
logger.error("No extracted data provided for validation.")
|
|
240
335
|
return False
|
|
241
|
-
|
|
336
|
+
|
|
242
337
|
# Update sensitive meta with the provided data
|
|
243
338
|
self.sensitive_meta.update_from_dict(extracted_data_dict)
|
|
244
|
-
|
|
339
|
+
|
|
245
340
|
# Save the sensitive meta to ensure changes are persisted
|
|
246
341
|
self.sensitive_meta.save()
|
|
247
|
-
|
|
342
|
+
|
|
248
343
|
# Save the RawPdfFile instance to ensure all changes are saved
|
|
249
344
|
self.save()
|
|
250
|
-
|
|
345
|
+
|
|
251
346
|
logger.info(f"Metadata for PDF {self.pk} validated and updated successfully.")
|
|
252
|
-
|
|
347
|
+
|
|
253
348
|
if self.file_path:
|
|
254
349
|
try:
|
|
255
350
|
os.unlink(self.file_path) # Delete the original file if it exists
|
|
@@ -260,13 +355,14 @@ class RawPdfFile(models.Model):
|
|
|
260
355
|
try:
|
|
261
356
|
os.unlink(self.anonymized_file_path)
|
|
262
357
|
except OSError as e:
|
|
263
|
-
logger.error(
|
|
358
|
+
logger.error(
|
|
359
|
+
f"Error removing anonymized file {self.anonymized_file_path}: {e}"
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
self.save() # Save the model to persist the cleared file fields
|
|
264
363
|
|
|
265
|
-
self.save() # Save the model to persist the cleared file fields
|
|
266
|
-
|
|
267
364
|
logger.info(f"Files for PDF {self.pk} deleted successfully.")
|
|
268
365
|
return True
|
|
269
|
-
|
|
270
366
|
|
|
271
367
|
@classmethod
|
|
272
368
|
def create_from_file_initialized(
|
|
@@ -277,18 +373,18 @@ class RawPdfFile(models.Model):
|
|
|
277
373
|
):
|
|
278
374
|
"""
|
|
279
375
|
Creates a RawPdfFile instance from a file and center name, ensuring an associated RawPdfState exists.
|
|
280
|
-
|
|
376
|
+
|
|
281
377
|
Parameters:
|
|
282
378
|
file_path (Path): Path to the source PDF file.
|
|
283
379
|
center_name (str): Name of the center to associate with the PDF.
|
|
284
380
|
delete_source (bool): Whether to delete the source file after processing. Defaults to True.
|
|
285
|
-
|
|
381
|
+
|
|
286
382
|
Returns:
|
|
287
383
|
RawPdfFile: The created or retrieved RawPdfFile instance with an associated RawPdfState.
|
|
288
384
|
"""
|
|
289
385
|
raw_pdf = cls.create_from_file(
|
|
290
386
|
file_path=file_path,
|
|
291
|
-
center_name=center_name,
|
|
387
|
+
center_name=center_name,
|
|
292
388
|
delete_source=delete_source,
|
|
293
389
|
)
|
|
294
390
|
_state = raw_pdf.get_or_create_state()
|
|
@@ -305,18 +401,18 @@ class RawPdfFile(models.Model):
|
|
|
305
401
|
):
|
|
306
402
|
"""
|
|
307
403
|
Creates or retrieves a RawPdfFile instance from a given PDF file path and center name.
|
|
308
|
-
|
|
404
|
+
|
|
309
405
|
If a RawPdfFile with the same PDF hash already exists, verifies the file exists in storage and restores it if missing. Otherwise, creates a new RawPdfFile, assigns the file, and saves it to storage. Optionally deletes the source file after processing.
|
|
310
|
-
|
|
406
|
+
|
|
311
407
|
Parameters:
|
|
312
408
|
file_path (Path): Path to the source PDF file.
|
|
313
409
|
center_name (str): Name of the center to associate with the file.
|
|
314
410
|
save (bool, optional): Deprecated; saving occurs internally.
|
|
315
411
|
delete_source (bool, optional): Whether to delete the source file after processing (default True).
|
|
316
|
-
|
|
412
|
+
|
|
317
413
|
Returns:
|
|
318
414
|
RawPdfFile: The created or retrieved RawPdfFile instance.
|
|
319
|
-
|
|
415
|
+
|
|
320
416
|
Raises:
|
|
321
417
|
FileNotFoundError: If the source file does not exist.
|
|
322
418
|
Center.DoesNotExist: If the specified center is not found.
|
|
@@ -338,28 +434,45 @@ class RawPdfFile(models.Model):
|
|
|
338
434
|
raise ValueError(f"Could not calculate hash for {file_path}") from e
|
|
339
435
|
|
|
340
436
|
# 2. Check if record with this hash already exists
|
|
341
|
-
existing_pdf_file = cls.objects.filter(pdf_hash=pdf_hash).first()
|
|
437
|
+
existing_pdf_file = cls.objects.filter(pdf_hash=pdf_hash).first()
|
|
342
438
|
if existing_pdf_file:
|
|
343
|
-
logger.warning(
|
|
439
|
+
logger.warning(
|
|
440
|
+
"RawPdfFile with hash %s already exists (ID: %s)",
|
|
441
|
+
pdf_hash,
|
|
442
|
+
existing_pdf_file.pk,
|
|
443
|
+
)
|
|
344
444
|
|
|
345
445
|
# Verify physical file exists for the existing record
|
|
346
446
|
try:
|
|
347
447
|
if existing_pdf_file is not None and isinstance(existing_pdf_file, cls):
|
|
348
|
-
|
|
448
|
+
# Use storage API to check existence
|
|
349
449
|
_file = existing_pdf_file.file
|
|
350
450
|
assert _file is not None
|
|
351
451
|
if not _file.storage.exists(_file.name):
|
|
352
|
-
logger.warning(
|
|
452
|
+
logger.warning(
|
|
453
|
+
"File for existing RawPdfFile %s not found in storage at %s. Attempting to restore from source %s",
|
|
454
|
+
pdf_hash,
|
|
455
|
+
_file.name,
|
|
456
|
+
file_path,
|
|
457
|
+
)
|
|
353
458
|
# Re-save the file from the source to potentially fix it
|
|
354
459
|
with file_path.open("rb") as f:
|
|
355
|
-
django_file = File(
|
|
356
|
-
|
|
357
|
-
|
|
460
|
+
django_file = File(
|
|
461
|
+
f, name=Path(_file.name).name
|
|
462
|
+
) # Use existing name if possible
|
|
463
|
+
existing_pdf_file.file = django_file # type: ignore
|
|
464
|
+
existing_pdf_file.save(
|
|
465
|
+
update_fields=["file"]
|
|
466
|
+
) # Only update file field
|
|
358
467
|
else:
|
|
359
468
|
pass
|
|
360
469
|
# logger.debug("File for existing RawPdfFile %s already exists in storage.", pdf_hash)
|
|
361
470
|
except Exception as e:
|
|
362
|
-
logger.error(
|
|
471
|
+
logger.error(
|
|
472
|
+
"Error verifying/restoring file for existing record %s: %s",
|
|
473
|
+
pdf_hash,
|
|
474
|
+
e,
|
|
475
|
+
)
|
|
363
476
|
|
|
364
477
|
# Delete the source temp file if requested
|
|
365
478
|
if delete_source:
|
|
@@ -396,7 +509,9 @@ class RawPdfFile(models.Model):
|
|
|
396
509
|
_file = raw_pdf.file
|
|
397
510
|
assert _file is not None
|
|
398
511
|
logger.info(
|
|
399
|
-
"Created and saved new RawPdfFile %s with file %s",
|
|
512
|
+
"Created and saved new RawPdfFile %s with file %s",
|
|
513
|
+
raw_pdf.pk,
|
|
514
|
+
_file.name,
|
|
400
515
|
)
|
|
401
516
|
|
|
402
517
|
if not _file.storage.exists(_file.name):
|
|
@@ -417,18 +532,22 @@ class RawPdfFile(models.Model):
|
|
|
417
532
|
)
|
|
418
533
|
|
|
419
534
|
except Exception as e:
|
|
420
|
-
logger.error(
|
|
535
|
+
logger.error(
|
|
536
|
+
"Error processing or saving file %s for new record: %s", file_path, e
|
|
537
|
+
)
|
|
421
538
|
raise
|
|
422
539
|
|
|
423
540
|
# Delete source file *after* successful save and verification
|
|
424
541
|
if delete_source:
|
|
425
542
|
try:
|
|
426
543
|
file_path.unlink()
|
|
427
|
-
logger.info(
|
|
544
|
+
logger.info(
|
|
545
|
+
"Deleted source file %s after creating new record.", file_path
|
|
546
|
+
)
|
|
428
547
|
except OSError as e:
|
|
429
548
|
logger.error("Error deleting source file %s: %s", file_path, e)
|
|
430
549
|
|
|
431
|
-
# raw_pdf.save() # unnecessary?
|
|
550
|
+
# raw_pdf.save() # unnecessary?
|
|
432
551
|
return raw_pdf
|
|
433
552
|
|
|
434
553
|
def save(self, *args, **kwargs):
|
|
@@ -436,7 +555,7 @@ class RawPdfFile(models.Model):
|
|
|
436
555
|
# This is primarily a fallback if instance created manually without using create_from_file
|
|
437
556
|
"""
|
|
438
557
|
Saves the RawPdfFile instance, ensuring the PDF hash is set and related fields are derived from metadata.
|
|
439
|
-
|
|
558
|
+
|
|
440
559
|
If the PDF hash is missing, attempts to calculate it from the file before saving. Validates that the file has a `.pdf` extension. If related fields such as patient, examination, center, or examiner are unset but available in the associated sensitive metadata, they are populated accordingly before saving.
|
|
441
560
|
"""
|
|
442
561
|
if not self.pk and not self.pdf_hash and self.file:
|
|
@@ -445,16 +564,22 @@ class RawPdfFile(models.Model):
|
|
|
445
564
|
if not file_path.exists():
|
|
446
565
|
raise FileNotFoundError(f"File path does not exist: {file_path}")
|
|
447
566
|
# Read from the file object before it's saved by storage
|
|
448
|
-
self.file.open(
|
|
567
|
+
self.file.open("rb") # Ensure file is open
|
|
449
568
|
self.file.seek(0) # Go to beginning
|
|
450
|
-
self.pdf_hash = get_pdf_hash(
|
|
569
|
+
self.pdf_hash = get_pdf_hash(
|
|
570
|
+
file_path
|
|
571
|
+
) # Assuming get_pdf_hash can handle file obj
|
|
451
572
|
self.file.seek(0) # Reset position
|
|
452
573
|
self.file.close() # Close after reading
|
|
453
574
|
logger.info(f"Calculated hash during pre-save for {self.file.name}")
|
|
454
575
|
except Exception as e:
|
|
455
|
-
logger.warning(
|
|
576
|
+
logger.warning(
|
|
577
|
+
"Could not calculate hash before initial save for %s: %s",
|
|
578
|
+
self.file.name,
|
|
579
|
+
e,
|
|
580
|
+
)
|
|
456
581
|
# Ensure file is closed if opened
|
|
457
|
-
if hasattr(self.file,
|
|
582
|
+
if hasattr(self.file, "closed") and not self.file.closed:
|
|
458
583
|
self.file.close()
|
|
459
584
|
|
|
460
585
|
if self.file and not self.file.name.endswith(".pdf"):
|
|
@@ -463,18 +588,31 @@ class RawPdfFile(models.Model):
|
|
|
463
588
|
# If hash is still missing after potential creation logic (e.g., direct instantiation)
|
|
464
589
|
# and the file exists in storage, try calculating it from storage path.
|
|
465
590
|
# This is less ideal as it requires the file to be saved first.
|
|
466
|
-
if
|
|
591
|
+
if (
|
|
592
|
+
not self.pdf_hash
|
|
593
|
+
and self.pk
|
|
594
|
+
and self.file
|
|
595
|
+
and self.file.storage.exists(self.file.name)
|
|
596
|
+
):
|
|
467
597
|
try:
|
|
468
598
|
file_path = Path(self.file.path).resolve()
|
|
469
599
|
if not file_path.exists():
|
|
470
600
|
raise FileNotFoundError(f"File path does not exist: {file_path}")
|
|
471
|
-
logger.warning(
|
|
472
|
-
|
|
473
|
-
|
|
601
|
+
logger.warning(
|
|
602
|
+
f"Hash missing for saved file {self.file.name}. Recalculating."
|
|
603
|
+
)
|
|
604
|
+
with self.file.storage.open(self.file.name, "rb") as f:
|
|
605
|
+
self.pdf_hash = get_pdf_hash(
|
|
606
|
+
file_path
|
|
607
|
+
) # Assuming get_pdf_hash handles file obj
|
|
474
608
|
# No need to save again just for hash unless update_fields is used carefully
|
|
475
609
|
# Let the main super().save() handle saving the hash if it changed
|
|
476
610
|
except Exception as e:
|
|
477
|
-
logger.error(
|
|
611
|
+
logger.error(
|
|
612
|
+
"Could not calculate hash during save for existing file %s: %s",
|
|
613
|
+
self.file.name,
|
|
614
|
+
e,
|
|
615
|
+
)
|
|
478
616
|
|
|
479
617
|
# Derive related fields from sensitive_meta if available
|
|
480
618
|
if not self.patient and self.sensitive_meta:
|
|
@@ -483,7 +621,7 @@ class RawPdfFile(models.Model):
|
|
|
483
621
|
self.examination = self.sensitive_meta.pseudo_examination
|
|
484
622
|
if not self.center and self.sensitive_meta:
|
|
485
623
|
self.center = self.sensitive_meta.center
|
|
486
|
-
#TODO Outdated?
|
|
624
|
+
# TODO Outdated?
|
|
487
625
|
# if not self.examiner and self.sensitive_meta and hasattr(self.sensitive_meta, 'pseudo_examiner'):
|
|
488
626
|
# self.examiner = self.sensitive_meta.pseudo_examiner
|
|
489
627
|
|
|
@@ -492,7 +630,7 @@ class RawPdfFile(models.Model):
|
|
|
492
630
|
def get_or_create_state(self) -> "RawPdfState":
|
|
493
631
|
"""
|
|
494
632
|
Retrieve the associated RawPdfState for this RawPdfFile, creating and linking a new one if none exists.
|
|
495
|
-
|
|
633
|
+
|
|
496
634
|
Returns:
|
|
497
635
|
RawPdfState: The existing or newly created RawPdfState instance linked to this RawPdfFile.
|
|
498
636
|
"""
|
|
@@ -514,7 +652,7 @@ class RawPdfFile(models.Model):
|
|
|
514
652
|
# Ensure fallback_file is a Path object.
|
|
515
653
|
"""
|
|
516
654
|
Checks if the stored PDF file exists in storage and attempts to restore it from a fallback file path if missing.
|
|
517
|
-
|
|
655
|
+
|
|
518
656
|
Parameters:
|
|
519
657
|
fallback_file: Path or string representing the fallback file location to restore from if the stored file is missing.
|
|
520
658
|
"""
|
|
@@ -525,16 +663,21 @@ class RawPdfFile(models.Model):
|
|
|
525
663
|
assert _file is not None
|
|
526
664
|
try:
|
|
527
665
|
if not _file.field.storage.exists(_file.name):
|
|
528
|
-
logger.warning(
|
|
666
|
+
logger.warning(
|
|
667
|
+
f"File missing at storage path {_file.name}. Attempting copy from fallback {fallback_file}"
|
|
668
|
+
)
|
|
529
669
|
if fallback_file.exists():
|
|
530
670
|
with fallback_file.open("rb") as f:
|
|
531
671
|
# Use save method which handles storage backend
|
|
532
|
-
_file.save(
|
|
533
|
-
|
|
672
|
+
_file.save(
|
|
673
|
+
Path(_file.name).name, File(f), save=True
|
|
674
|
+
) # Re-save the file content
|
|
675
|
+
logger.info(
|
|
676
|
+
f"Successfully restored file from fallback {fallback_file} to {_file.name}"
|
|
677
|
+
)
|
|
534
678
|
else:
|
|
535
679
|
logger.error(f"Fallback file {fallback_file} does not exist.")
|
|
536
680
|
except Exception as e:
|
|
537
|
-
|
|
538
681
|
logger.error(f"Error during verify_existing_file for {_file.name}: {e}")
|
|
539
682
|
|
|
540
683
|
def process_file(self, text, anonymized_text, report_meta, verbose):
|
|
@@ -562,18 +705,19 @@ class RawPdfFile(models.Model):
|
|
|
562
705
|
for key, value in serializable_report_meta.items():
|
|
563
706
|
if isinstance(value, (datetime, date)):
|
|
564
707
|
serializable_report_meta[key] = value.isoformat()
|
|
565
|
-
|
|
566
|
-
self.raw_meta = serializable_report_meta # Assign the version with string dates
|
|
567
708
|
|
|
568
|
-
|
|
569
|
-
|
|
709
|
+
self.raw_meta = serializable_report_meta # Assign the version with string dates
|
|
710
|
+
|
|
711
|
+
sensitive_meta.save() # Save SensitiveMeta first
|
|
712
|
+
self.save() # Then save RawPdfFile
|
|
570
713
|
|
|
571
714
|
return text, anonymized_text, report_meta
|
|
572
715
|
|
|
573
716
|
def get_report_reader_config(self):
|
|
717
|
+
from warnings import warn
|
|
718
|
+
|
|
574
719
|
from ...administration import Center
|
|
575
720
|
from ...metadata.pdf_meta import PdfType
|
|
576
|
-
from warnings import warn
|
|
577
721
|
|
|
578
722
|
_center = self.center
|
|
579
723
|
assert _center is not None, "Center must be set to get report reader config"
|
|
@@ -604,10 +748,10 @@ class RawPdfFile(models.Model):
|
|
|
604
748
|
}
|
|
605
749
|
|
|
606
750
|
return settings_dict
|
|
607
|
-
|
|
751
|
+
|
|
608
752
|
@staticmethod
|
|
609
753
|
def get_pdf_by_id(pdf_id: int) -> "RawPdfFile":
|
|
610
754
|
try:
|
|
611
755
|
return RawPdfFile.objects.get(pk=pdf_id)
|
|
612
756
|
except RawPdfFile.DoesNotExist:
|
|
613
|
-
raise ValueError(f"PDF with ID {pdf_id} does not exist.")
|
|
757
|
+
raise ValueError(f"PDF with ID {pdf_id} does not exist.")
|