endoreg-db 0.8.4.4__py3-none-any.whl → 0.8.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of endoreg-db might be problematic. Click here for more details.

Files changed (36)
  1. endoreg_db/management/commands/load_ai_model_data.py +2 -1
  2. endoreg_db/management/commands/setup_endoreg_db.py +11 -7
  3. endoreg_db/models/media/pdf/raw_pdf.py +241 -97
  4. endoreg_db/models/media/video/pipe_1.py +30 -33
  5. endoreg_db/models/media/video/video_file.py +300 -187
  6. endoreg_db/models/metadata/model_meta_logic.py +15 -1
  7. endoreg_db/models/metadata/sensitive_meta_logic.py +391 -70
  8. endoreg_db/serializers/__init__.py +26 -55
  9. endoreg_db/serializers/misc/__init__.py +1 -1
  10. endoreg_db/serializers/misc/file_overview.py +65 -35
  11. endoreg_db/serializers/misc/{vop_patient_data.py → sensitive_patient_data.py} +1 -1
  12. endoreg_db/serializers/video_examination.py +198 -0
  13. endoreg_db/services/lookup_service.py +228 -58
  14. endoreg_db/services/lookup_store.py +174 -30
  15. endoreg_db/services/pdf_import.py +585 -282
  16. endoreg_db/services/video_import.py +340 -101
  17. endoreg_db/urls/__init__.py +36 -23
  18. endoreg_db/urls/label_video_segments.py +2 -0
  19. endoreg_db/urls/media.py +3 -2
  20. endoreg_db/views/__init__.py +6 -3
  21. endoreg_db/views/media/pdf_media.py +3 -1
  22. endoreg_db/views/media/video_media.py +1 -1
  23. endoreg_db/views/media/video_segments.py +187 -259
  24. endoreg_db/views/pdf/__init__.py +5 -8
  25. endoreg_db/views/pdf/pdf_stream.py +187 -0
  26. endoreg_db/views/pdf/reimport.py +110 -94
  27. endoreg_db/views/requirement/lookup.py +171 -287
  28. endoreg_db/views/video/__init__.py +0 -2
  29. endoreg_db/views/video/video_examination_viewset.py +202 -289
  30. {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.6.1.dist-info}/METADATA +1 -1
  31. {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.6.1.dist-info}/RECORD +33 -34
  32. endoreg_db/views/pdf/pdf_media.py +0 -239
  33. endoreg_db/views/pdf/pdf_stream_views.py +0 -127
  34. endoreg_db/views/video/video_media.py +0 -158
  35. {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.6.1.dist-info}/WHEEL +0 -0
  36. {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.6.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,8 @@
1
- from .pdf_media import (
2
- PDFMediaView,
3
- UpdateSensitiveMetaView,
4
- ClosingFileWrapper
5
- )
1
+ from .reimport import PdfReimportView
2
+ from .pdf_stream import PdfStreamView
3
+
6
4
 
7
5
  __all__ = [
8
- "PDFMediaView",
9
- "UpdateSensitiveMetaView",
10
- "ClosingFileWrapper",
6
+ "PdfReimportView",
7
+ "PdfStreamView",
11
8
  ]
@@ -0,0 +1,187 @@
1
+ import logging
2
+ import os
3
+ import re
4
+
5
+ from django.http import FileResponse, Http404, StreamingHttpResponse
6
+ from django.views.decorators.clickjacking import xframe_options_sameorigin
7
+ from rest_framework.views import APIView
8
+
9
+ from endoreg_db.models import RawPdfFile
10
+
11
+ from ...utils.permissions import EnvironmentAwarePermission
12
+
13
+ logger = logging.getLogger(__name__)
14
+ _RANGE_RE = re.compile(r"bytes=(\d+)-(\d*)")
15
+
16
+
17
+ class ClosingFileWrapper:
18
+ """Custom file wrapper that ensures file is closed after streaming"""
19
+
20
+ def __init__(self, file_handle, blksize=8192):
21
+ self.file_handle = file_handle
22
+ self.blksize = blksize
23
+
24
+ def __iter__(self):
25
+ return self
26
+
27
+ def __next__(self):
28
+ data = self.file_handle.read(self.blksize)
29
+ if not data:
30
+ self.file_handle.close()
31
+ raise StopIteration
32
+ return data
33
+
34
+ def close(self):
35
+ if hasattr(self.file_handle, "close"):
36
+ self.file_handle.close()
37
+
38
+
39
+ class PdfStreamView(APIView):
40
+ """
41
+ Streams a PDF file with correct HTTP range support and proper file handle management.
42
+
43
+ Supports streaming both raw (original) and anonymized PDF files.
44
+
45
+ Query Parameters:
46
+ type: 'raw' (default) or 'anonymized' - Selects which PDF file to stream
47
+
48
+ Examples:
49
+ GET /api/media/pdf/1/?type=raw - Stream original raw PDF
50
+ GET /api/media/pdf/1/?type=anonymized - Stream anonymized PDF
51
+ """
52
+
53
+ permission_classes = [EnvironmentAwarePermission]
54
+
55
+ @xframe_options_sameorigin
56
+ def get(self, request, pdf_id: int, *args, **kwargs):
57
+ file_type = "raw" # Initialize for error logging
58
+ try:
59
+ pdf_obj = RawPdfFile.objects.filter(pk=pdf_id).first()
60
+ if not pdf_obj:
61
+ logger.warning(f"PDF not found: ID {pdf_id}")
62
+ raise Http404("PDF not found")
63
+
64
+ # Parse query parameters to determine which file to stream
65
+ file_type = request.query_params.get("type", "raw").lower()
66
+ if file_type not in ["raw", "anonymized"]:
67
+ logger.warning(f"Invalid file_type '{file_type}', defaulting to 'raw'")
68
+ file_type = "raw"
69
+
70
+ # Determine which file field to use
71
+ if file_type == "raw":
72
+ file_field = pdf_obj.file
73
+ if not file_field:
74
+ logger.warning(f"No raw PDF file available for PDF ID {pdf_id}")
75
+ raise Http404("Raw PDF file not available")
76
+ else: # anonymized
77
+ file_field = pdf_obj.anonymized_file
78
+ if not file_field:
79
+ logger.warning(
80
+ f"No anonymized PDF file available for PDF ID {pdf_id}"
81
+ )
82
+ raise Http404("Anonymized PDF file not available")
83
+
84
+ # Check if file exists on filesystem
85
+ try:
86
+ file_path = file_field.path
87
+ if not os.path.exists(file_path):
88
+ logger.error(f"PDF file does not exist on filesystem: {file_path}")
89
+ raise Http404(
90
+ f"{file_type.capitalize()} PDF file not found on filesystem"
91
+ )
92
+
93
+ file_size = os.path.getsize(file_path)
94
+ except (OSError, IOError, AttributeError) as e:
95
+ logger.error(f"Error accessing {file_type} PDF file {pdf_id}: {e}")
96
+ raise Http404(f"{file_type.capitalize()} PDF file not accessible")
97
+
98
+ # Generate safe filename
99
+ base_filename = (
100
+ os.path.basename(file_field.name)
101
+ if file_field.name
102
+ else f"document_{pdf_id}.pdf"
103
+ )
104
+ if not base_filename.endswith(".pdf"):
105
+ base_filename += ".pdf"
106
+
107
+ # Add type indicator to filename for clarity
108
+ if file_type == "anonymized":
109
+ name_parts = base_filename.rsplit(".", 1)
110
+ safe_filename = f"{name_parts[0]}_anonymized.{name_parts[1]}"
111
+ else:
112
+ safe_filename = base_filename
113
+
114
+ # Handle Range requests
115
+ range_header = request.headers.get("Range")
116
+ if range_header:
117
+ logger.debug(
118
+ f"Range request for {file_type} PDF {pdf_id}: {range_header}"
119
+ )
120
+ match = _RANGE_RE.match(range_header)
121
+ if match:
122
+ start = int(match.group(1))
123
+ end = int(match.group(2) or file_size - 1)
124
+
125
+ # Validate range
126
+ if start >= file_size or start < 0:
127
+ logger.warning(
128
+ f"Invalid range start {start} for file size {file_size}"
129
+ )
130
+ raise Http404("Invalid range")
131
+
132
+ if end >= file_size:
133
+ end = file_size - 1
134
+
135
+ chunk_size = end - start + 1
136
+
137
+ try:
138
+ file_handle = open(file_path, "rb")
139
+ file_handle.seek(start)
140
+
141
+ logger.debug(
142
+ f"Serving {file_type} PDF {pdf_id} range {start}-{end}/{file_size}"
143
+ )
144
+
145
+ response = StreamingHttpResponse(
146
+ ClosingFileWrapper(file_handle, blksize=8192),
147
+ status=206,
148
+ content_type="application/pdf",
149
+ )
150
+ response["Content-Length"] = str(chunk_size)
151
+ response["Content-Range"] = f"bytes {start}-{end}/{file_size}"
152
+ response["Accept-Ranges"] = "bytes"
153
+ response["Content-Disposition"] = (
154
+ f'inline; filename="{safe_filename}"'
155
+ )
156
+
157
+ return response
158
+ except (OSError, IOError) as e:
159
+ logger.error(
160
+ f"Error opening {file_type} PDF file for range request: {e}"
161
+ )
162
+ raise Http404(f"Error accessing {file_type} PDF file")
163
+ else:
164
+ logger.warning(f"Invalid Range header format: {range_header}")
165
+
166
+ # Serve entire file using FileResponse (automatically handles file closing)
167
+ logger.debug(f"Serving full {file_type} PDF {pdf_id} ({file_size} bytes)")
168
+
169
+ try:
170
+ file_handle = open(file_path, "rb")
171
+ response = FileResponse(file_handle, content_type="application/pdf")
172
+ response["Content-Length"] = str(file_size)
173
+ response["Accept-Ranges"] = "bytes"
174
+ response["Content-Disposition"] = f'inline; filename="{safe_filename}"'
175
+
176
+ # FileResponse will take ownership of file_handle and close it after response
177
+ return response
178
+ except (OSError, IOError) as e:
179
+ logger.error(f"Error opening {file_type} PDF file: {e}")
180
+ raise Http404(f"Error accessing {file_type} PDF file")
181
+
182
+ except Exception as e:
183
+ logger.error(
184
+ f"Unexpected error streaming {file_type if 'file_type' in locals() else 'PDF'} {pdf_id}: {e}",
185
+ exc_info=True,
186
+ )
187
+ raise Http404("Error streaming PDF")
@@ -1,19 +1,22 @@
1
- from rest_framework.views import APIView
2
- from rest_framework.response import Response
3
- from rest_framework import status
4
1
  import logging
5
- from pathlib import Path
2
+
6
3
  from django.db import transaction
4
+ from rest_framework import status
5
+ from rest_framework.response import Response
6
+ from rest_framework.views import APIView
7
+
7
8
  from ...models import RawPdfFile, SensitiveMeta
8
9
  from ...services.pdf_import import PdfImportService
10
+
9
11
  logger = logging.getLogger(__name__)
10
12
 
13
+
11
14
  class PdfReimportView(APIView):
12
15
  """
13
16
  API endpoint to re-import a pdf file and regenerate metadata.
14
17
  This is useful when OCR failed or metadata is incomplete.
15
18
  """
16
-
19
+
17
20
  def __init__(self, **kwargs):
18
21
  super().__init__(**kwargs)
19
22
  self.pdf_service = PdfImportService()
@@ -22,140 +25,153 @@ class PdfReimportView(APIView):
22
25
  """
23
26
  Re-import a pdf file to regenerate SensitiveMeta and other metadata.
24
27
  Instead of creating a new pdf, this updates the existing one.
25
-
28
+
26
29
  Args:
27
30
  request: HTTP request object
28
31
  pk: PDF primary key (ID)
29
32
  """
30
33
  pdf_id = pk # Align with media framework naming convention
31
-
34
+
32
35
  # Validate pdf_id parameter
33
36
  if not pdf_id or not isinstance(pdf_id, int):
34
37
  return Response(
35
- {"error": "Invalid PDF ID provided."},
36
- status=status.HTTP_400_BAD_REQUEST
38
+ {"error": "Invalid PDF ID provided."},
39
+ status=status.HTTP_400_BAD_REQUEST,
37
40
  )
38
41
 
39
42
  try:
40
43
  pdf = RawPdfFile.objects.get(id=pdf_id)
41
- logger.info(f"Found PDF {pdf.uuid} (ID: {pdf_id}) for re-import")
44
+ logger.info(f"Found PDF {pdf.pdf_hash} (ID: {pdf_id}) for re-import")
42
45
  except RawPdfFile.DoesNotExist:
43
46
  logger.warning(f"PDF with ID {pdf_id} not found")
44
47
  return Response(
45
- {"error": f"PDF with ID {pdf_id} not found."},
46
- status=status.HTTP_404_NOT_FOUND
48
+ {"error": f"PDF with ID {pdf_id} not found."},
49
+ status=status.HTTP_404_NOT_FOUND,
47
50
  )
48
51
 
52
+ # Get raw file path using the model method
53
+ raw_file_path = pdf.get_raw_file_path()
49
54
 
50
-
51
- # Check if the raw file actually exists on disk
52
- raw_file_path = Path(pdf.file.path)
53
- if not raw_file_path.exists():
54
- logger.error(f"Raw file not found on disk: {raw_file_path}")
55
+ if not raw_file_path or not raw_file_path.exists():
56
+ logger.error(
57
+ f"Raw PDF file not found for hash {pdf.pdf_hash}: {raw_file_path}"
58
+ )
55
59
  return Response(
56
- {"error": f"PDF file not found on server: {raw_file_path.name}"},
57
- status=status.HTTP_400_BAD_REQUEST
60
+ {
61
+ "error": f"Raw PDF file not found for PDF {pdf.pdf_hash}. Please upload the original file again."
62
+ },
63
+ status=status.HTTP_404_NOT_FOUND,
58
64
  )
59
65
 
60
66
  # Check if PDF has required relationships
61
67
  if not pdf.center:
62
- logger.warning(f"PDF {pdf.uuid} has no associated center")
68
+ logger.warning(f"PDF {pdf.pdf_hash} has no associated center")
63
69
  return Response(
64
- {"error": "Video has no associated center."},
65
- status=status.HTTP_400_BAD_REQUEST
70
+ {"error": "PDF has no associated center."},
71
+ status=status.HTTP_400_BAD_REQUEST,
66
72
  )
67
73
 
68
74
  try:
69
- logger.info(f"Starting in-place re-import for pdf {pdf.uuid} (ID: {pdf_id})")
70
-
75
+ logger.info(f"Starting re-import for PDF {pdf.pdf_hash} (ID: {pdf_id})")
76
+
71
77
  with transaction.atomic():
72
78
  # Clear existing metadata to force regeneration
73
79
  old_meta_id = None
74
80
  if pdf.sensitive_meta:
75
- old_meta_id = pdf.sensitive_meta.id
76
- logger.info(f"Clearing existing SensitiveMeta {old_meta_id} for pdf {pdf.uuid}")
77
- pdf.sensitive_meta = None
78
- pdf.save(update_fields=['sensitive_meta'])
79
-
81
+ old_meta_id = pdf.sensitive_meta.pk
82
+ logger.info(
83
+ f"Clearing existing SensitiveMeta {old_meta_id} for PDF {pdf.pdf_hash}"
84
+ )
85
+ pdf.sensitive_meta = None # type: ignore
86
+ pdf.save(update_fields=["sensitive_meta"])
87
+
80
88
  # Delete the old SensitiveMeta record
81
89
  try:
82
- SensitiveMeta.objects.filter(id=old_meta_id).delete()
90
+ SensitiveMeta.objects.filter(pk=old_meta_id).delete()
83
91
  logger.info(f"Deleted old SensitiveMeta {old_meta_id}")
84
92
  except Exception as e:
85
- logger.warning(f"Could not delete old SensitiveMeta {old_meta_id}: {e}")
86
-
87
-
88
-
89
-
90
-
91
- # Ensure minimum patient data is available
92
- logger.info(f"Ensuring minimum patient data for {pdf.uuid}")
93
- self.pdf_service._ensure_default_patient_data(pdf)
94
-
95
- # Refresh from database to get updated data
96
- pdf.refresh_from_db()
97
-
98
- # Use VideoImportService for anonymization
93
+ logger.warning(
94
+ f"Could not delete old SensitiveMeta {old_meta_id}: {e}"
95
+ )
96
+
97
+ # Use PdfImportService for reprocessing
99
98
  try:
100
-
101
- logger.info(f"Starting anonymization using VideoImportService for {pdf.uuid}")
99
+ logger.info(
100
+ f"Starting reprocessing using PdfImportService for {pdf.pdf_hash}"
101
+ )
102
102
  self.pdf_service.import_and_anonymize(
103
103
  file_path=raw_file_path,
104
104
  center_name=pdf.center.name,
105
- processor_name=pdf.processor.name if pdf.processor else "Unknown",
106
- save_video=True,
107
- delete_source=False
105
+ delete_source=False, # Don't delete during reimport
106
+ retry=True, # Mark as retry attempt
108
107
  )
109
-
110
- logger.info(f"VideoImportService anonymization completed for {pdf.uuid}")
111
-
112
-
113
- return Response({
114
- "message": "Video re-import with VideoImportService completed successfully.",
115
- "pdf_id": pdf_id,
116
- "uuid": str(pdf.uuid),
117
- "frame_cleaning_applied": True,
118
- "sensitive_meta_created": pdf.sensitive_meta is not None,
119
- "sensitive_meta_id": pdf.sensitive_meta.id if pdf.sensitive_meta else None,
120
- "updated_in_place": True,
121
- "status": "done"
122
- }, status=status.HTTP_200_OK)
123
-
108
+
109
+ logger.info(
110
+ f"PdfImportService reprocessing completed for {pdf.pdf_hash}"
111
+ )
112
+
113
+ # Refresh to get updated state
114
+ pdf.refresh_from_db()
115
+
116
+ return Response(
117
+ {
118
+ "message": "PDF re-import completed successfully.",
119
+ "pdf_id": pdf_id,
120
+ "pdf_hash": str(pdf.pdf_hash),
121
+ "sensitive_meta_created": pdf.sensitive_meta is not None,
122
+ "sensitive_meta_id": pdf.sensitive_meta.pk
123
+ if pdf.sensitive_meta
124
+ else None,
125
+ "text_extracted": bool(pdf.text),
126
+ "anonymized": pdf.anonymized,
127
+ "status": "done",
128
+ },
129
+ status=status.HTTP_200_OK,
130
+ )
131
+
124
132
  except Exception as e:
125
- logger.exception(f"VideoImportService anonymization failed for pdf {pdf.uuid}: {e}")
126
- logger.warning("Continuing without anonymization due to error")
127
-
128
- # Refresh from database to get final state
129
- pdf.refresh_from_db()
130
-
131
- return Response({
132
- "message": "PDF re-import completed successfully.",
133
- "pdf_id": pdf_id,
134
- "uuid": str(pdf.uuid),
135
- "sensitive_meta_created": pdf.sensitive_meta is not None,
136
- "sensitive_meta_id": pdf.sensitive_meta.id if pdf.sensitive_meta else None,
137
- "updated_in_place": True,
138
- "status": "done"
139
- }, status=status.HTTP_200_OK)
133
+ logger.exception(
134
+ f"PdfImportService reprocessing failed for PDF {pdf.pdf_hash}: {e}"
135
+ )
136
+ return Response(
137
+ {
138
+ "error": f"Reprocessing failed: {str(e)}",
139
+ "error_type": "processing_error",
140
+ "pdf_id": pdf_id,
141
+ "pdf_hash": str(pdf.pdf_hash),
142
+ },
143
+ status=status.HTTP_500_INTERNAL_SERVER_ERROR,
144
+ )
140
145
 
141
146
  except Exception as e:
142
- logger.error(f"Failed to re-import pdf {pdf.uuid}: {str(e)}", exc_info=True)
143
-
147
+ logger.error(
148
+ f"Failed to re-import PDF {pdf.pdf_hash}: {str(e)}", exc_info=True
149
+ )
150
+
144
151
  # Handle specific error types
145
152
  error_msg = str(e)
146
- if any(phrase in error_msg.lower() for phrase in ["insufficient storage", "no space left", "disk full"]):
153
+ if any(
154
+ phrase in error_msg.lower()
155
+ for phrase in ["insufficient storage", "no space left", "disk full"]
156
+ ):
147
157
  # Storage error - return specific error message
148
- return Response({
149
- "error": f"Storage error during re-import: {error_msg}",
150
- "error_type": "storage_error",
151
- "pdf_id": pdf_id,
152
- "uuid": str(pdf.uuid)
153
- }, status=status.HTTP_507_INSUFFICIENT_STORAGE)
158
+ return Response(
159
+ {
160
+ "error": f"Storage error during re-import: {error_msg}",
161
+ "error_type": "storage_error",
162
+ "pdf_id": pdf_id,
163
+ "pdf_hash": str(pdf.pdf_hash),
164
+ },
165
+ status=status.HTTP_507_INSUFFICIENT_STORAGE,
166
+ )
154
167
  else:
155
168
  # Other errors
156
- return Response({
157
- "error": f"Re-import failed: {error_msg}",
158
- "error_type": "processing_error",
159
- "pdf_id": pdf_id,
160
- "uuid": str(pdf.uuid)
161
- }, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
169
+ return Response(
170
+ {
171
+ "error": f"Re-import failed: {error_msg}",
172
+ "error_type": "processing_error",
173
+ "pdf_id": pdf_id,
174
+ "pdf_hash": str(pdf.pdf_hash),
175
+ },
176
+ status=status.HTTP_500_INTERNAL_SERVER_ERROR,
177
+ )