endoreg-db 0.8.5.4__py3-none-any.whl → 0.8.5.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of endoreg-db might be problematic. Click here for more details.

@@ -5,36 +5,42 @@
5
5
  # objects contains methods to extract text, extract metadata from text and anonymize text from pdf file uzing agl_report_reader.ReportReader class
6
6
  # ------------------------------------------------------------------------------
7
7
  import os
8
- from django.db import models
8
+ from typing import TYPE_CHECKING, Optional, Union
9
+
9
10
  from django.core.exceptions import ValidationError
10
- from django.core.validators import FileExtensionValidator
11
11
  from django.core.files import File
12
+ from django.core.validators import FileExtensionValidator
13
+ from django.db import models
12
14
  from numpy import isin # Import Django File
15
+
13
16
  from endoreg_db.utils.file_operations import get_uuid_filename
14
- from typing import TYPE_CHECKING, Optional, Union
17
+ from endoreg_db.utils.hashs import get_pdf_hash
18
+ from endoreg_db.utils.paths import PDF_DIR, RAW_PDF_DIR
19
+
15
20
  # Use the specific paths from the centralized paths module
16
21
  from ...utils import PDF_DIR
17
- from endoreg_db.utils.hashs import get_pdf_hash
18
22
 
19
23
  if TYPE_CHECKING:
20
24
  from endoreg_db.models.administration.person import (
21
- Patient,
22
25
  Examiner,
26
+ Patient,
23
27
  )
24
- from .report_file import AnonymExaminationReport
25
- from ...medical.patient import PatientExamination
28
+
26
29
  from ...administration import Center
30
+ from ...medical.patient import PatientExamination
27
31
  from ...metadata.pdf_meta import PdfType
28
32
  from ...state import RawPdfState
29
- from ...metadata import SensitiveMeta
33
+ from .report_file import AnonymExaminationReport
30
34
 
31
35
  # setup logging to pdf_import.log
32
36
  import logging
33
-
34
37
  from pathlib import Path
35
38
 
39
+ from ...metadata import SensitiveMeta
40
+
36
41
  logger = logging.getLogger("raw_pdf")
37
42
 
43
+
38
44
  class RawPdfFile(models.Model):
39
45
  # Fields from AbstractPdfFile
40
46
  pdf_hash = models.CharField(max_length=255, unique=True)
@@ -49,38 +55,40 @@ class RawPdfFile(models.Model):
49
55
  on_delete=models.SET_NULL,
50
56
  blank=True,
51
57
  null=True,
52
- ) # type: ignore
58
+ ) # type: ignore
53
59
  examination = models.ForeignKey(
54
60
  "PatientExamination",
55
61
  on_delete=models.SET_NULL,
56
62
  blank=True,
57
63
  null=True,
58
64
  related_name="raw_pdf_files",
59
- ) # type: ignore
65
+ ) # type: ignore
60
66
  examiner = models.ForeignKey(
61
67
  "Examiner",
62
68
  on_delete=models.SET_NULL,
63
69
  blank=True,
64
70
  null=True,
65
- ) # type: ignore
71
+ ) # type: ignore
66
72
  text = models.TextField(blank=True, null=True)
67
73
  date_created = models.DateTimeField(auto_now_add=True)
68
74
  date_modified = models.DateTimeField(auto_now=True)
69
- anonymized = models.BooleanField(default=False, help_text="True if the PDF has been anonymized.")
75
+ anonymized = models.BooleanField(
76
+ default=False, help_text="True if the PDF has been anonymized."
77
+ )
70
78
 
71
79
  # Fields specific to RawPdfFile (keeping existing related_names)
72
80
  file = models.FileField(
73
81
  # Use the relative path from the specific PDF_DIR
74
82
  upload_to=PDF_DIR.name,
75
83
  validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
76
- ) # type: ignore
77
-
84
+ ) # type: ignore
85
+
78
86
  anonymized_file = models.FileField(
79
87
  upload_to=PDF_DIR.name,
80
88
  validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
81
89
  null=True,
82
90
  blank=True,
83
- ) # type: ignore
91
+ ) # type: ignore
84
92
 
85
93
  state = models.OneToOneField(
86
94
  "RawPdfState",
@@ -88,33 +96,45 @@ class RawPdfFile(models.Model):
88
96
  blank=True,
89
97
  null=True,
90
98
  related_name="raw_pdf_file",
91
- ) # type: ignore
92
-
99
+ ) # type: ignore
100
+
93
101
  objects = models.Manager()
94
-
102
+
103
+ @property
104
+ def uuid(self):
105
+ """
106
+ Compatibility property - returns pdf_hash as UUID-like identifier.
107
+
108
+ Note: RawPdfFile uses pdf_hash instead of UUID for identification.
109
+ This property exists for API backward compatibility.
110
+ """
111
+ return self.pdf_hash
112
+
95
113
  @property
96
- def file_path(self) -> Path|None:
114
+ def file_path(self) -> Path | None:
97
115
  """
98
116
  Returns the file path of the stored PDF file if available; otherwise, returns None.
99
117
  """
118
+ from django.db.models.fields.files import FieldFile
119
+
100
120
  # assert self.file has path attribute
101
- assert isinstance(self.file, models.FieldFile)
121
+ assert isinstance(self.file, FieldFile)
102
122
  if self.file and self.file.name:
103
123
  try:
104
124
  return Path(self.file.path)
105
125
  except (ValueError, AttributeError, NotImplementedError):
106
126
  return None
107
127
  return None
108
-
128
+
109
129
  def set_file_path(self, file_path: Path):
110
130
  """
111
131
  Sets the file path of the stored PDF file.
112
132
  """
113
- self.file = File(file_path) # type: ignore
114
- self.save(update_fields=['file'])
133
+ self.file = File(file_path) # type: ignore
134
+ self.save(update_fields=["file"])
115
135
 
116
136
  @property
117
- def anonymized_file_path(self) -> Path|None:
137
+ def anonymized_file_path(self) -> Path | None:
118
138
  """
119
139
  Returns the file path of the anonymized PDF file if available; otherwise, returns None.
120
140
  """
@@ -124,13 +144,73 @@ class RawPdfFile(models.Model):
124
144
  except (ValueError, AttributeError, NotImplementedError):
125
145
  return None
126
146
  return None
127
-
147
+
128
148
  def set_anonymized_file_path(self, file_path: Path):
129
149
  """
130
150
  Sets the file path of the anonymized PDF file.
131
151
  """
132
- self.anonymized_file = File(file_path) # type: ignore
133
- self.save(update_fields=['anonymized_file'])
152
+ self.anonymized_file = File(file_path) # type: ignore
153
+ self.save(update_fields=["anonymized_file"])
154
+
155
+ def get_raw_file_path(self) -> Optional[Path]:
156
+ """
157
+ Get the path to the raw PDF file, searching common locations.
158
+
159
+ This method attempts to find the original raw PDF file by checking:
160
+ 1. Direct hash-based path in raw_pdfs/
161
+ 2. Scanning raw_pdfs/ directory for files matching the hash
162
+ 3. Checking the file field if it exists
163
+
164
+ Returns:
165
+ Path to raw file if it exists, None otherwise
166
+ """
167
+ from django.conf import settings
168
+
169
+ # Check if file field already points to a valid file
170
+ if self.file and self.file.name:
171
+ try:
172
+ file_path = Path(self.file.path)
173
+ if file_path.exists():
174
+ logger.debug(f"Found raw PDF via file field: {file_path}")
175
+ return file_path
176
+ except (ValueError, AttributeError, NotImplementedError):
177
+ pass
178
+
179
+ # Define potential raw directories
180
+ raw_dirs = [
181
+ PDF_DIR / "sensitive", # Files might be in sensitive dir
182
+ Path(settings.BASE_DIR) / "data" / "raw_pdfs",
183
+ Path(settings.BASE_DIR) / "data" / "pdfs" / "raw",
184
+ PDF_DIR, # General PDF directory
185
+ ]
186
+
187
+ # Check direct hash-based name in each directory
188
+ for raw_dir in raw_dirs:
189
+ if not raw_dir.exists():
190
+ continue
191
+
192
+ hash_path = raw_dir / f"{self.pdf_hash}.pdf"
193
+ if hash_path.exists():
194
+ logger.debug(f"Found raw PDF at: {hash_path}")
195
+ return hash_path
196
+
197
+ # Scan directories for matching hash
198
+ for raw_dir in raw_dirs:
199
+ if not raw_dir.exists():
200
+ continue
201
+
202
+ for file_path in raw_dir.glob("*.pdf"):
203
+ try:
204
+ file_hash = get_pdf_hash(file_path)
205
+ if file_hash == self.pdf_hash:
206
+ logger.debug(f"Found matching PDF by hash: {file_path}")
207
+ return file_path
208
+ except Exception as e:
209
+ logger.debug(f"Error checking {file_path}: {e}")
210
+ continue
211
+
212
+ logger.warning(f"No raw file found for PDF hash: {self.pdf_hash}")
213
+ return None
134
214
 
135
215
  @property
136
216
  def file_url(self):
@@ -141,14 +221,18 @@ class RawPdfFile(models.Model):
141
221
  return self.file.url if self.file and self.file.name else None
142
222
  except (ValueError, AttributeError):
143
223
  return None
144
-
224
+
145
225
  @property
146
226
  def anonymized_file_url(self):
147
227
  """
148
228
  Returns the URL of the stored PDF file if available; otherwise, returns None.
149
229
  """
150
230
  try:
151
- return self.anonymized_file.url if self.anonymized_file and self.anonymized_file.name else None
231
+ return (
232
+ self.anonymized_file.url
233
+ if self.anonymized_file and self.anonymized_file.name
234
+ else None
235
+ )
152
236
  except (ValueError, AttributeError):
153
237
  return None
154
238
 
@@ -158,14 +242,14 @@ class RawPdfFile(models.Model):
158
242
  blank=True,
159
243
  null=True,
160
244
  related_name="raw_pdf_files",
161
- ) # type: ignore
245
+ ) # type: ignore
162
246
  sensitive_meta = models.ForeignKey(
163
247
  "SensitiveMeta",
164
248
  on_delete=models.SET_NULL,
165
249
  related_name="raw_pdf_files",
166
250
  null=True,
167
251
  blank=True,
168
- ) # type: ignore
252
+ ) # type: ignore
169
253
  state_report_processing_required = models.BooleanField(default=True)
170
254
  state_report_processed = models.BooleanField(default=False)
171
255
  raw_meta = models.JSONField(blank=True, null=True)
@@ -175,19 +259,21 @@ class RawPdfFile(models.Model):
175
259
  blank=True,
176
260
  null=True,
177
261
  related_name="raw_pdf_file",
178
- ) # type: ignore
262
+ ) # type: ignore
179
263
  anonymized_text = models.TextField(blank=True, null=True)
180
264
 
181
265
  # Type hinting is needed, improve and use correct django types
182
266
  if TYPE_CHECKING:
183
- file : Optional[Union[models.FieldFile,models.FileField]]
184
- anonymized_file : Optional[Union[models.FieldFile,models.FileField]]
267
+ file: Optional[Union[models.FieldFile, models.FileField]]
268
+ anonymized_file: Optional[Union[models.FieldFile, models.FileField]]
185
269
  pdf_type: Optional[models.ForeignKey]
186
270
  examination: Optional[models.ForeignKey["PatientExamination"]]
187
271
  examiner: Optional[models.ForeignKey["Examiner"]]
188
272
  patient: Optional[models.ForeignKey["Patient"]]
189
273
  center: Optional[models.ForeignKey["Center"]]
190
- anonym_examination_report: Optional[models.OneToOneField["AnonymExaminationReport"]]
274
+ anonym_examination_report: Optional[
275
+ models.OneToOneField["AnonymExaminationReport"]
276
+ ]
191
277
  sensitive_meta: Optional[models.ForeignKey["SensitiveMeta"]]
192
278
  state: Optional[models.ForeignKey["RawPdfState"]]
193
279
 
@@ -201,7 +287,7 @@ class RawPdfFile(models.Model):
201
287
  def delete(self, *args, **kwargs):
202
288
  """
203
289
  Deletes the RawPdfFile instance from the database and removes the associated file from storage if it exists.
204
-
290
+
205
291
  This method ensures that the physical PDF file is deleted from the file system after the database record is removed. Logs warnings or errors if the file cannot be found or deleted.
206
292
  """
207
293
  # Call the original delete method first to remove DB record
@@ -211,45 +297,54 @@ class RawPdfFile(models.Model):
211
297
  os.remove(Path(self.file_path))
212
298
  logger.info("Original file removed: %s", self.file)
213
299
  except Exception as e:
214
- logger.warning(f"Could not get file path for {self.file.name} before deletion: {e}")
300
+ logger.warning(
301
+ f"Could not get file path for {self.file.name} before deletion: {e}"
302
+ )
215
303
  if self.anonymized_file:
216
304
  try:
217
305
  if self.anonymized_file_path:
218
306
  os.remove(Path(self.anonymized_file_path))
219
- logger.info("Anonymized file removed: %s", self.anonymized_file.name)
307
+ logger.info(
308
+ "Anonymized file removed: %s", self.anonymized_file.name
309
+ )
220
310
  except OSError as e:
221
- logger.error("Error removing anonymized file %s: %s", self.anonymized_file.name, e)
311
+ logger.error(
312
+ "Error removing anonymized file %s: %s",
313
+ self.anonymized_file.name,
314
+ e,
315
+ )
222
316
 
223
317
  super().delete(*args, **kwargs)
224
318
 
225
-
226
- def validate_metadata_annotation(self, extracted_data_dict: Optional[dict] = None) -> bool:
319
+ def validate_metadata_annotation(
320
+ self, extracted_data_dict: Optional[dict] = None
321
+ ) -> bool:
227
322
  """
228
323
  Validate the metadata of the RawPdf instance.
229
-
324
+
230
325
  Called after annotation in the frontend, this method deletes the associated active file, updates the sensitive meta data with the user annotated data.
231
326
  It also ensures the video file is properly saved after the metadata update.
232
327
  """
233
-
328
+
234
329
  if not self.sensitive_meta:
235
330
  logger.error("No sensitive meta data associated with this PDF file.")
236
331
  return False
237
-
332
+
238
333
  if not extracted_data_dict:
239
334
  logger.error("No extracted data provided for validation.")
240
335
  return False
241
-
336
+
242
337
  # Update sensitive meta with the provided data
243
338
  self.sensitive_meta.update_from_dict(extracted_data_dict)
244
-
339
+
245
340
  # Save the sensitive meta to ensure changes are persisted
246
341
  self.sensitive_meta.save()
247
-
342
+
248
343
  # Save the RawPdfFile instance to ensure all changes are saved
249
344
  self.save()
250
-
345
+
251
346
  logger.info(f"Metadata for PDF {self.pk} validated and updated successfully.")
252
-
347
+
253
348
  if self.file_path:
254
349
  try:
255
350
  os.unlink(self.file_path) # Delete the original file if it exists
@@ -260,13 +355,14 @@ class RawPdfFile(models.Model):
260
355
  try:
261
356
  os.unlink(self.anonymized_file_path)
262
357
  except OSError as e:
263
- logger.error(f"Error removing anonymized file {self.anonymized_file_path}: {e}")
358
+ logger.error(
359
+ f"Error removing anonymized file {self.anonymized_file_path}: {e}"
360
+ )
361
+
362
+ self.save() # Save the model to persist the cleared file fields
264
363
 
265
- self.save() # Save the model to persist the cleared file fields
266
-
267
364
  logger.info(f"Files for PDF {self.pk} deleted successfully.")
268
365
  return True
269
-
270
366
 
271
367
  @classmethod
272
368
  def create_from_file_initialized(
@@ -277,18 +373,18 @@ class RawPdfFile(models.Model):
277
373
  ):
278
374
  """
279
375
  Creates a RawPdfFile instance from a file and center name, ensuring an associated RawPdfState exists.
280
-
376
+
281
377
  Parameters:
282
378
  file_path (Path): Path to the source PDF file.
283
379
  center_name (str): Name of the center to associate with the PDF.
284
380
  delete_source (bool): Whether to delete the source file after processing. Defaults to True.
285
-
381
+
286
382
  Returns:
287
383
  RawPdfFile: The created or retrieved RawPdfFile instance with an associated RawPdfState.
288
384
  """
289
385
  raw_pdf = cls.create_from_file(
290
386
  file_path=file_path,
291
- center_name=center_name,
387
+ center_name=center_name,
292
388
  delete_source=delete_source,
293
389
  )
294
390
  _state = raw_pdf.get_or_create_state()
@@ -305,18 +401,18 @@ class RawPdfFile(models.Model):
305
401
  ):
306
402
  """
307
403
  Creates or retrieves a RawPdfFile instance from a given PDF file path and center name.
308
-
404
+
309
405
  If a RawPdfFile with the same PDF hash already exists, verifies the file exists in storage and restores it if missing. Otherwise, creates a new RawPdfFile, assigns the file, and saves it to storage. Optionally deletes the source file after processing.
310
-
406
+
311
407
  Parameters:
312
408
  file_path (Path): Path to the source PDF file.
313
409
  center_name (str): Name of the center to associate with the file.
314
410
  save (bool, optional): Deprecated; saving occurs internally.
315
411
  delete_source (bool, optional): Whether to delete the source file after processing (default True).
316
-
412
+
317
413
  Returns:
318
414
  RawPdfFile: The created or retrieved RawPdfFile instance.
319
-
415
+
320
416
  Raises:
321
417
  FileNotFoundError: If the source file does not exist.
322
418
  Center.DoesNotExist: If the specified center is not found.
@@ -338,28 +434,45 @@ class RawPdfFile(models.Model):
338
434
  raise ValueError(f"Could not calculate hash for {file_path}") from e
339
435
 
340
436
  # 2. Check if record with this hash already exists
341
- existing_pdf_file = cls.objects.filter(pdf_hash=pdf_hash).first()
437
+ existing_pdf_file = cls.objects.filter(pdf_hash=pdf_hash).first()
342
438
  if existing_pdf_file:
343
- logger.warning("RawPdfFile with hash %s already exists (ID: %s)", pdf_hash, existing_pdf_file.pk)
439
+ logger.warning(
440
+ "RawPdfFile with hash %s already exists (ID: %s)",
441
+ pdf_hash,
442
+ existing_pdf_file.pk,
443
+ )
344
444
 
345
445
  # Verify physical file exists for the existing record
346
446
  try:
347
447
  if existing_pdf_file is not None and isinstance(existing_pdf_file, cls):
348
- # Use storage API to check existence
448
+ # Use storage API to check existence
349
449
  _file = existing_pdf_file.file
350
450
  assert _file is not None
351
451
  if not _file.storage.exists(_file.name):
352
- logger.warning("File for existing RawPdfFile %s not found in storage at %s. Attempting to restore from source %s", pdf_hash, _file.name, file_path)
452
+ logger.warning(
453
+ "File for existing RawPdfFile %s not found in storage at %s. Attempting to restore from source %s",
454
+ pdf_hash,
455
+ _file.name,
456
+ file_path,
457
+ )
353
458
  # Re-save the file from the source to potentially fix it
354
459
  with file_path.open("rb") as f:
355
- django_file = File(f, name=Path(_file.name).name) # Use existing name if possible
356
- existing_pdf_file.file = django_file # type: ignore
357
- existing_pdf_file.save(update_fields=['file']) # Only update file field
460
+ django_file = File(
461
+ f, name=Path(_file.name).name
462
+ ) # Use existing name if possible
463
+ existing_pdf_file.file = django_file # type: ignore
464
+ existing_pdf_file.save(
465
+ update_fields=["file"]
466
+ ) # Only update file field
358
467
  else:
359
468
  pass
360
469
  # logger.debug("File for existing RawPdfFile %s already exists in storage.", pdf_hash)
361
470
  except Exception as e:
362
- logger.error("Error verifying/restoring file for existing record %s: %s", pdf_hash, e)
471
+ logger.error(
472
+ "Error verifying/restoring file for existing record %s: %s",
473
+ pdf_hash,
474
+ e,
475
+ )
363
476
 
364
477
  # Delete the source temp file if requested
365
478
  if delete_source:
@@ -396,7 +509,9 @@ class RawPdfFile(models.Model):
396
509
  _file = raw_pdf.file
397
510
  assert _file is not None
398
511
  logger.info(
399
- "Created and saved new RawPdfFile %s with file %s", raw_pdf.pk, _file.name
512
+ "Created and saved new RawPdfFile %s with file %s",
513
+ raw_pdf.pk,
514
+ _file.name,
400
515
  )
401
516
 
402
517
  if not _file.storage.exists(_file.name):
@@ -417,18 +532,22 @@ class RawPdfFile(models.Model):
417
532
  )
418
533
 
419
534
  except Exception as e:
420
- logger.error("Error processing or saving file %s for new record: %s", file_path, e)
535
+ logger.error(
536
+ "Error processing or saving file %s for new record: %s", file_path, e
537
+ )
421
538
  raise
422
539
 
423
540
  # Delete source file *after* successful save and verification
424
541
  if delete_source:
425
542
  try:
426
543
  file_path.unlink()
427
- logger.info("Deleted source file %s after creating new record.", file_path)
544
+ logger.info(
545
+ "Deleted source file %s after creating new record.", file_path
546
+ )
428
547
  except OSError as e:
429
548
  logger.error("Error deleting source file %s: %s", file_path, e)
430
549
 
431
- # raw_pdf.save() # unnecessary?
550
+ # raw_pdf.save() # unnecessary?
432
551
  return raw_pdf
433
552
 
434
553
  def save(self, *args, **kwargs):
@@ -436,7 +555,7 @@ class RawPdfFile(models.Model):
436
555
  # This is primarily a fallback if instance created manually without using create_from_file
437
556
  """
438
557
  Saves the RawPdfFile instance, ensuring the PDF hash is set and related fields are derived from metadata.
439
-
558
+
440
559
  If the PDF hash is missing, attempts to calculate it from the file before saving. Validates that the file has a `.pdf` extension. If related fields such as patient, examination, center, or examiner are unset but available in the associated sensitive metadata, they are populated accordingly before saving.
441
560
  """
442
561
  if not self.pk and not self.pdf_hash and self.file:
@@ -445,16 +564,22 @@ class RawPdfFile(models.Model):
445
564
  if not file_path.exists():
446
565
  raise FileNotFoundError(f"File path does not exist: {file_path}")
447
566
  # Read from the file object before it's saved by storage
448
- self.file.open('rb') # Ensure file is open
567
+ self.file.open("rb") # Ensure file is open
449
568
  self.file.seek(0) # Go to beginning
450
- self.pdf_hash = get_pdf_hash(file_path) # Assuming get_pdf_hash can handle file obj
569
+ self.pdf_hash = get_pdf_hash(
570
+ file_path
571
+ ) # Assuming get_pdf_hash can handle file obj
451
572
  self.file.seek(0) # Reset position
452
573
  self.file.close() # Close after reading
453
574
  logger.info(f"Calculated hash during pre-save for {self.file.name}")
454
575
  except Exception as e:
455
- logger.warning("Could not calculate hash before initial save for %s: %s", self.file.name, e)
576
+ logger.warning(
577
+ "Could not calculate hash before initial save for %s: %s",
578
+ self.file.name,
579
+ e,
580
+ )
456
581
  # Ensure file is closed if opened
457
- if hasattr(self.file, 'closed') and not self.file.closed:
582
+ if hasattr(self.file, "closed") and not self.file.closed:
458
583
  self.file.close()
459
584
 
460
585
  if self.file and not self.file.name.endswith(".pdf"):
@@ -463,18 +588,31 @@ class RawPdfFile(models.Model):
463
588
  # If hash is still missing after potential creation logic (e.g., direct instantiation)
464
589
  # and the file exists in storage, try calculating it from storage path.
465
590
  # This is less ideal as it requires the file to be saved first.
466
- if not self.pdf_hash and self.pk and self.file and self.file.storage.exists(self.file.name):
591
+ if (
592
+ not self.pdf_hash
593
+ and self.pk
594
+ and self.file
595
+ and self.file.storage.exists(self.file.name)
596
+ ):
467
597
  try:
468
598
  file_path = Path(self.file.path).resolve()
469
599
  if not file_path.exists():
470
600
  raise FileNotFoundError(f"File path does not exist: {file_path}")
471
- logger.warning(f"Hash missing for saved file {self.file.name}. Recalculating.")
472
- with self.file.storage.open(self.file.name, 'rb') as f:
473
- self.pdf_hash = get_pdf_hash(file_path) # Assuming get_pdf_hash handles file obj
601
+ logger.warning(
602
+ f"Hash missing for saved file {self.file.name}. Recalculating."
603
+ )
604
+ with self.file.storage.open(self.file.name, "rb") as f:
605
+ self.pdf_hash = get_pdf_hash(
606
+ file_path
607
+ ) # Assuming get_pdf_hash handles file obj
474
608
  # No need to save again just for hash unless update_fields is used carefully
475
609
  # Let the main super().save() handle saving the hash if it changed
476
610
  except Exception as e:
477
- logger.error("Could not calculate hash during save for existing file %s: %s", self.file.name, e)
611
+ logger.error(
612
+ "Could not calculate hash during save for existing file %s: %s",
613
+ self.file.name,
614
+ e,
615
+ )
478
616
 
479
617
  # Derive related fields from sensitive_meta if available
480
618
  if not self.patient and self.sensitive_meta:
@@ -483,7 +621,7 @@ class RawPdfFile(models.Model):
483
621
  self.examination = self.sensitive_meta.pseudo_examination
484
622
  if not self.center and self.sensitive_meta:
485
623
  self.center = self.sensitive_meta.center
486
- #TODO Outdated?
624
+ # TODO Outdated?
487
625
  # if not self.examiner and self.sensitive_meta and hasattr(self.sensitive_meta, 'pseudo_examiner'):
488
626
  # self.examiner = self.sensitive_meta.pseudo_examiner
489
627
 
@@ -492,7 +630,7 @@ class RawPdfFile(models.Model):
492
630
  def get_or_create_state(self) -> "RawPdfState":
493
631
  """
494
632
  Retrieve the associated RawPdfState for this RawPdfFile, creating and linking a new one if none exists.
495
-
633
+
496
634
  Returns:
497
635
  RawPdfState: The existing or newly created RawPdfState instance linked to this RawPdfFile.
498
636
  """
@@ -514,7 +652,7 @@ class RawPdfFile(models.Model):
514
652
  # Ensure fallback_file is a Path object.
515
653
  """
516
654
  Checks if the stored PDF file exists in storage and attempts to restore it from a fallback file path if missing.
517
-
655
+
518
656
  Parameters:
519
657
  fallback_file: Path or string representing the fallback file location to restore from if the stored file is missing.
520
658
  """
@@ -525,16 +663,21 @@ class RawPdfFile(models.Model):
525
663
  assert _file is not None
526
664
  try:
527
665
  if not _file.field.storage.exists(_file.name):
528
- logger.warning(f"File missing at storage path {_file.name}. Attempting copy from fallback {fallback_file}")
666
+ logger.warning(
667
+ f"File missing at storage path {_file.name}. Attempting copy from fallback {fallback_file}"
668
+ )
529
669
  if fallback_file.exists():
530
670
  with fallback_file.open("rb") as f:
531
671
  # Use save method which handles storage backend
532
- _file.save(Path(_file.name).name, File(f), save=True) # Re-save the file content
533
- logger.info(f"Successfully restored file from fallback {fallback_file} to {_file.name}")
672
+ _file.save(
673
+ Path(_file.name).name, File(f), save=True
674
+ ) # Re-save the file content
675
+ logger.info(
676
+ f"Successfully restored file from fallback {fallback_file} to {_file.name}"
677
+ )
534
678
  else:
535
679
  logger.error(f"Fallback file {fallback_file} does not exist.")
536
680
  except Exception as e:
537
-
538
681
  logger.error(f"Error during verify_existing_file for {_file.name}: {e}")
539
682
 
540
683
  def process_file(self, text, anonymized_text, report_meta, verbose):
@@ -562,18 +705,19 @@ class RawPdfFile(models.Model):
562
705
  for key, value in serializable_report_meta.items():
563
706
  if isinstance(value, (datetime, date)):
564
707
  serializable_report_meta[key] = value.isoformat()
565
-
566
- self.raw_meta = serializable_report_meta # Assign the version with string dates
567
708
 
568
- sensitive_meta.save() # Save SensitiveMeta first
569
- self.save() # Then save RawPdfFile
709
+ self.raw_meta = serializable_report_meta # Assign the version with string dates
710
+
711
+ sensitive_meta.save() # Save SensitiveMeta first
712
+ self.save() # Then save RawPdfFile
570
713
 
571
714
  return text, anonymized_text, report_meta
572
715
 
573
716
  def get_report_reader_config(self):
717
+ from warnings import warn
718
+
574
719
  from ...administration import Center
575
720
  from ...metadata.pdf_meta import PdfType
576
- from warnings import warn
577
721
 
578
722
  _center = self.center
579
723
  assert _center is not None, "Center must be set to get report reader config"
@@ -604,10 +748,10 @@ class RawPdfFile(models.Model):
604
748
  }
605
749
 
606
750
  return settings_dict
607
-
751
+
608
752
  @staticmethod
609
753
  def get_pdf_by_id(pdf_id: int) -> "RawPdfFile":
610
754
  try:
611
755
  return RawPdfFile.objects.get(pk=pdf_id)
612
756
  except RawPdfFile.DoesNotExist:
613
- raise ValueError(f"PDF with ID {pdf_id} does not exist.")
757
+ raise ValueError(f"PDF with ID {pdf_id} does not exist.")