endoreg-db 0.8.8.0__py3-none-any.whl → 0.8.8.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of endoreg-db might be problematic. See the package registry's advisory page for more details.

Files changed (402)
  1. endoreg_db/data/__init__.py +22 -8
  2. endoreg_db/data/ai_model_meta/default_multilabel_classification.yaml +0 -1
  3. endoreg_db/data/examination/examinations/data.yaml +114 -14
  4. endoreg_db/data/examination/time-type/data.yaml +0 -3
  5. endoreg_db/data/examination_indication/endoscopy.yaml +108 -173
  6. endoreg_db/data/examination_indication_classification/endoscopy.yaml +0 -70
  7. endoreg_db/data/examination_indication_classification_choice/endoscopy.yaml +33 -37
  8. endoreg_db/data/finding/00_generic.yaml +35 -0
  9. endoreg_db/data/finding/00_generic_complication.yaml +9 -0
  10. endoreg_db/data/finding/01_gastroscopy_baseline.yaml +88 -0
  11. endoreg_db/data/finding/01_gastroscopy_observation.yaml +113 -0
  12. endoreg_db/data/finding/02_colonoscopy_baseline.yaml +53 -0
  13. endoreg_db/data/finding/02_colonoscopy_hidden.yaml +119 -0
  14. endoreg_db/data/finding/02_colonoscopy_observation.yaml +152 -0
  15. endoreg_db/data/finding_classification/00_generic.yaml +44 -0
  16. endoreg_db/data/finding_classification/00_generic_histology.yaml +28 -0
  17. endoreg_db/data/finding_classification/00_generic_lesion.yaml +52 -0
  18. endoreg_db/data/finding_classification/{colonoscopy_bowel_preparation.yaml → 02_colonoscopy_baseline.yaml} +35 -20
  19. endoreg_db/data/finding_classification/02_colonoscopy_histology.yaml +13 -0
  20. endoreg_db/data/finding_classification/02_colonoscopy_other.yaml +12 -0
  21. endoreg_db/data/finding_classification/02_colonoscopy_polyp.yaml +101 -0
  22. endoreg_db/data/finding_classification_choice/{yes_no_na.yaml → 00_generic.yaml} +5 -1
  23. endoreg_db/data/finding_classification_choice/{examination_setting_generic_types.yaml → 00_generic_baseline.yaml} +10 -2
  24. endoreg_db/data/finding_classification_choice/{complication_generic_types.yaml → 00_generic_complication.yaml} +1 -1
  25. endoreg_db/data/finding_classification_choice/{histology.yaml → 00_generic_histology.yaml} +1 -4
  26. endoreg_db/data/finding_classification_choice/00_generic_lesion.yaml +158 -0
  27. endoreg_db/data/finding_classification_choice/{bowel_preparation.yaml → 02_colonoscopy_bowel_preparation.yaml} +1 -30
  28. endoreg_db/data/{_examples/finding_classification_choice/colonoscopy_not_complete_reason.yaml → finding_classification_choice/02_colonoscopy_generic.yaml} +1 -1
  29. endoreg_db/data/finding_classification_choice/{histology_polyp.yaml → 02_colonoscopy_histology.yaml} +1 -1
  30. endoreg_db/data/{_examples/finding_classification_choice/colonoscopy_location.yaml → finding_classification_choice/02_colonoscopy_location.yaml} +23 -4
  31. endoreg_db/data/finding_classification_choice/02_colonoscopy_other.yaml +34 -0
  32. endoreg_db/data/finding_classification_choice/02_colonoscopy_polyp_advanced_imaging.yaml +76 -0
  33. endoreg_db/data/{_examples/finding_classification_choice/colon_lesion_paris.yaml → finding_classification_choice/02_colonoscopy_polyp_morphology.yaml} +26 -8
  34. endoreg_db/data/finding_classification_choice/02_colonoscopy_size.yaml +27 -0
  35. endoreg_db/data/finding_classification_type/{colonoscopy_basic.yaml → 00_generic.yaml} +18 -13
  36. endoreg_db/data/finding_classification_type/02_colonoscopy.yaml +9 -0
  37. endoreg_db/data/finding_intervention/00_generic_endoscopy.yaml +59 -0
  38. endoreg_db/data/finding_intervention/00_generic_endoscopy_ablation.yaml +44 -0
  39. endoreg_db/data/finding_intervention/00_generic_endoscopy_bleeding.yaml +55 -0
  40. endoreg_db/data/finding_intervention/00_generic_endoscopy_resection.yaml +85 -0
  41. endoreg_db/data/finding_intervention/00_generic_endoscopy_stenosis.yaml +17 -0
  42. endoreg_db/data/finding_intervention/00_generic_endoscopy_stent.yaml +9 -0
  43. endoreg_db/data/finding_intervention/01_gastroscopy.yaml +19 -0
  44. endoreg_db/data/finding_intervention/04_eus.yaml +39 -0
  45. endoreg_db/data/finding_intervention/05_ercp.yaml +3 -0
  46. endoreg_db/data/finding_type/data.yaml +8 -12
  47. endoreg_db/data/requirement/01_patient_data.yaml +93 -0
  48. endoreg_db/data/requirement_operator/new_operators.yaml +36 -0
  49. endoreg_db/data/requirement_set/01_endoscopy_generic.yaml +0 -2
  50. endoreg_db/data/requirement_set/90_coloreg.yaml +20 -8
  51. endoreg_db/exceptions.py +0 -1
  52. endoreg_db/forms/examination_form.py +1 -1
  53. endoreg_db/helpers/data_loader.py +124 -52
  54. endoreg_db/helpers/default_objects.py +116 -81
  55. endoreg_db/import_files/__init__.py +27 -0
  56. endoreg_db/import_files/context/__init__.py +7 -0
  57. endoreg_db/import_files/context/default_sensitive_meta.py +81 -0
  58. endoreg_db/import_files/context/ensure_center.py +17 -0
  59. endoreg_db/import_files/context/file_lock.py +66 -0
  60. endoreg_db/import_files/context/import_context.py +43 -0
  61. endoreg_db/import_files/context/validate_directories.py +56 -0
  62. endoreg_db/import_files/file_storage/__init__.py +15 -0
  63. endoreg_db/import_files/file_storage/create_report_file.py +76 -0
  64. endoreg_db/import_files/file_storage/create_video_file.py +75 -0
  65. endoreg_db/import_files/file_storage/sensitive_meta_storage.py +39 -0
  66. endoreg_db/import_files/file_storage/state_management.py +400 -0
  67. endoreg_db/import_files/file_storage/storage.py +36 -0
  68. endoreg_db/import_files/import_service.md +26 -0
  69. endoreg_db/import_files/processing/__init__.py +11 -0
  70. endoreg_db/import_files/processing/report_processing/report_anonymization.py +94 -0
  71. endoreg_db/import_files/processing/sensitive_meta_adapter.py +51 -0
  72. endoreg_db/import_files/processing/video_processing/video_anonymization.py +107 -0
  73. endoreg_db/import_files/processing/video_processing/video_cleanup_on_error.py +119 -0
  74. endoreg_db/import_files/pseudonymization/fake.py +52 -0
  75. endoreg_db/import_files/pseudonymization/k_anonymity.py +182 -0
  76. endoreg_db/import_files/pseudonymization/k_pseudonymity.py +128 -0
  77. endoreg_db/import_files/report_import_service.py +141 -0
  78. endoreg_db/import_files/video_import_service.py +150 -0
  79. endoreg_db/management/commands/import_report.py +130 -65
  80. endoreg_db/management/commands/import_video_with_classification.py +1 -1
  81. endoreg_db/management/commands/load_ai_model_data.py +5 -5
  82. endoreg_db/management/commands/load_ai_model_label_data.py +9 -7
  83. endoreg_db/management/commands/load_base_db_data.py +5 -134
  84. endoreg_db/management/commands/load_contraindication_data.py +14 -16
  85. endoreg_db/management/commands/load_disease_classification_choices_data.py +15 -18
  86. endoreg_db/management/commands/load_disease_classification_data.py +15 -18
  87. endoreg_db/management/commands/load_disease_data.py +25 -28
  88. endoreg_db/management/commands/load_endoscope_data.py +20 -27
  89. endoreg_db/management/commands/load_event_data.py +14 -16
  90. endoreg_db/management/commands/load_examination_data.py +31 -44
  91. endoreg_db/management/commands/load_examination_indication_data.py +20 -21
  92. endoreg_db/management/commands/load_finding_data.py +52 -80
  93. endoreg_db/management/commands/load_information_source.py +21 -23
  94. endoreg_db/management/commands/load_lab_value_data.py +17 -26
  95. endoreg_db/management/commands/load_medication_data.py +13 -12
  96. endoreg_db/management/commands/load_organ_data.py +15 -19
  97. endoreg_db/management/commands/load_pdf_type_data.py +19 -18
  98. endoreg_db/management/commands/load_profession_data.py +14 -17
  99. endoreg_db/management/commands/load_qualification_data.py +20 -23
  100. endoreg_db/management/commands/load_report_reader_flag_data.py +17 -19
  101. endoreg_db/management/commands/load_requirement_data.py +14 -20
  102. endoreg_db/management/commands/load_risk_data.py +7 -6
  103. endoreg_db/management/commands/load_shift_data.py +20 -23
  104. endoreg_db/management/commands/load_tag_data.py +8 -11
  105. endoreg_db/management/commands/load_unit_data.py +17 -19
  106. endoreg_db/management/commands/start_filewatcher.py +46 -37
  107. endoreg_db/management/commands/validate_video_files.py +1 -5
  108. endoreg_db/migrations/0001_initial.py +1360 -1812
  109. endoreg_db/models/administration/person/patient/patient.py +72 -46
  110. endoreg_db/models/label/__init__.py +2 -2
  111. endoreg_db/models/label/annotation/video_segmentation_annotation.py +18 -26
  112. endoreg_db/models/label/label_video_segment/label_video_segment.py +23 -1
  113. endoreg_db/models/media/pdf/raw_pdf.py +136 -64
  114. endoreg_db/models/media/pdf/report_reader/report_reader_config.py +34 -10
  115. endoreg_db/models/media/processing_history/__init__.py +5 -0
  116. endoreg_db/models/media/processing_history/processing_history.py +96 -0
  117. endoreg_db/models/media/video/create_from_file.py +101 -31
  118. endoreg_db/models/media/video/video_file.py +125 -105
  119. endoreg_db/models/media/video/video_file_io.py +31 -26
  120. endoreg_db/models/medical/contraindication/README.md +1 -0
  121. endoreg_db/models/medical/examination/examination.py +28 -8
  122. endoreg_db/models/medical/examination/examination_indication.py +13 -79
  123. endoreg_db/models/medical/examination/examination_time.py +8 -3
  124. endoreg_db/models/medical/finding/finding.py +5 -12
  125. endoreg_db/models/medical/finding/finding_classification.py +18 -37
  126. endoreg_db/models/medical/finding/finding_intervention.py +7 -9
  127. endoreg_db/models/medical/hardware/endoscope.py +6 -0
  128. endoreg_db/models/medical/patient/medication_examples.py +5 -1
  129. endoreg_db/models/medical/patient/patient_finding.py +1 -1
  130. endoreg_db/models/metadata/pdf_meta.py +22 -10
  131. endoreg_db/models/metadata/sensitive_meta.py +3 -0
  132. endoreg_db/models/metadata/sensitive_meta_logic.py +200 -124
  133. endoreg_db/models/other/information_source.py +27 -6
  134. endoreg_db/models/report/__init__.py +0 -0
  135. endoreg_db/models/report/images.py +0 -0
  136. endoreg_db/models/report/report.py +6 -0
  137. endoreg_db/models/requirement/requirement.py +59 -399
  138. endoreg_db/models/requirement/requirement_operator.py +86 -98
  139. endoreg_db/models/state/audit_ledger.py +4 -5
  140. endoreg_db/models/state/raw_pdf.py +69 -30
  141. endoreg_db/models/state/video.py +64 -49
  142. endoreg_db/models/upload_job.py +33 -9
  143. endoreg_db/models/utils.py +27 -23
  144. endoreg_db/queries/__init__.py +3 -1
  145. endoreg_db/schemas/examination_evaluation.py +1 -1
  146. endoreg_db/serializers/__init__.py +2 -8
  147. endoreg_db/serializers/label_video_segment/label_video_segment.py +2 -29
  148. endoreg_db/serializers/meta/__init__.py +1 -6
  149. endoreg_db/serializers/misc/sensitive_patient_data.py +50 -26
  150. endoreg_db/serializers/patient_examination/patient_examination.py +3 -3
  151. endoreg_db/serializers/pdf/anony_text_validation.py +39 -23
  152. endoreg_db/serializers/video/video_file_list.py +65 -34
  153. endoreg_db/services/__old/pdf_import.py +1487 -0
  154. endoreg_db/services/__old/video_import.py +1306 -0
  155. endoreg_db/services/anonymization.py +63 -26
  156. endoreg_db/services/lookup_service.py +28 -28
  157. endoreg_db/services/lookup_store.py +2 -2
  158. endoreg_db/services/pdf_import.py +0 -1480
  159. endoreg_db/services/report_import.py +10 -0
  160. endoreg_db/services/video_import.py +6 -1165
  161. endoreg_db/tasks/upload_tasks.py +79 -70
  162. endoreg_db/tasks/video_ingest.py +8 -4
  163. endoreg_db/urls/__init__.py +0 -14
  164. endoreg_db/urls/ai.py +32 -0
  165. endoreg_db/urls/media.py +21 -24
  166. endoreg_db/utils/dataloader.py +87 -57
  167. endoreg_db/utils/paths.py +110 -46
  168. endoreg_db/utils/pipelines/Readme.md +1 -1
  169. endoreg_db/utils/requirement_operator_logic/new_operator_logic.py +97 -0
  170. endoreg_db/views/__init__.py +85 -173
  171. endoreg_db/views/ai/__init__.py +8 -0
  172. endoreg_db/views/ai/label.py +155 -0
  173. endoreg_db/views/anonymization/media_management.py +8 -7
  174. endoreg_db/views/anonymization/overview.py +97 -68
  175. endoreg_db/views/anonymization/validate.py +25 -21
  176. endoreg_db/views/media/__init__.py +5 -20
  177. endoreg_db/views/media/pdf_media.py +109 -65
  178. endoreg_db/views/media/sensitive_metadata.py +163 -148
  179. endoreg_db/views/meta/__init__.py +0 -8
  180. endoreg_db/views/misc/__init__.py +1 -7
  181. endoreg_db/views/misc/upload_views.py +94 -93
  182. endoreg_db/views/report/__init__.py +7 -0
  183. endoreg_db/views/{pdf → report}/reimport.py +45 -24
  184. endoreg_db/views/{pdf/pdf_stream.py → report/report_stream.py} +40 -32
  185. endoreg_db/views/requirement/lookup_store.py +22 -90
  186. endoreg_db/views/video/__init__.py +23 -22
  187. endoreg_db/views/video/correction.py +201 -172
  188. endoreg_db/views/video/reimport.py +1 -1
  189. endoreg_db/views/{media/video_segments.py → video/segments_crud.py} +75 -37
  190. endoreg_db/views/video/{video_meta.py → video_meta_stats.py} +2 -2
  191. endoreg_db/views/video/video_stream.py +7 -8
  192. {endoreg_db-0.8.8.0.dist-info → endoreg_db-0.8.8.9.dist-info}/METADATA +2 -2
  193. {endoreg_db-0.8.8.0.dist-info → endoreg_db-0.8.8.9.dist-info}/RECORD +217 -335
  194. {endoreg_db-0.8.8.0.dist-info → endoreg_db-0.8.8.9.dist-info}/WHEEL +1 -1
  195. endoreg_db/data/_examples/disease.yaml +0 -55
  196. endoreg_db/data/_examples/disease_classification.yaml +0 -13
  197. endoreg_db/data/_examples/disease_classification_choice.yaml +0 -62
  198. endoreg_db/data/_examples/event.yaml +0 -64
  199. endoreg_db/data/_examples/examination.yaml +0 -72
  200. endoreg_db/data/_examples/finding/anatomy_colon.yaml +0 -128
  201. endoreg_db/data/_examples/finding/colonoscopy.yaml +0 -40
  202. endoreg_db/data/_examples/finding/colonoscopy_bowel_prep.yaml +0 -56
  203. endoreg_db/data/_examples/finding/complication.yaml +0 -16
  204. endoreg_db/data/_examples/finding/data.yaml +0 -105
  205. endoreg_db/data/_examples/finding/examination_setting.yaml +0 -16
  206. endoreg_db/data/_examples/finding/medication_related.yaml +0 -18
  207. endoreg_db/data/_examples/finding/outcome.yaml +0 -12
  208. endoreg_db/data/_examples/finding_classification/colonoscopy_bowel_preparation.yaml +0 -68
  209. endoreg_db/data/_examples/finding_classification/colonoscopy_jnet.yaml +0 -22
  210. endoreg_db/data/_examples/finding_classification/colonoscopy_kudo.yaml +0 -25
  211. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_circularity.yaml +0 -20
  212. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_planarity.yaml +0 -24
  213. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_size.yaml +0 -68
  214. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_surface.yaml +0 -20
  215. endoreg_db/data/_examples/finding_classification/colonoscopy_location.yaml +0 -80
  216. endoreg_db/data/_examples/finding_classification/colonoscopy_lst.yaml +0 -21
  217. endoreg_db/data/_examples/finding_classification/colonoscopy_nice.yaml +0 -20
  218. endoreg_db/data/_examples/finding_classification/colonoscopy_paris.yaml +0 -26
  219. endoreg_db/data/_examples/finding_classification/colonoscopy_sano.yaml +0 -22
  220. endoreg_db/data/_examples/finding_classification/colonoscopy_summary.yaml +0 -53
  221. endoreg_db/data/_examples/finding_classification/complication_generic.yaml +0 -25
  222. endoreg_db/data/_examples/finding_classification/examination_setting_generic.yaml +0 -40
  223. endoreg_db/data/_examples/finding_classification/histology_colo.yaml +0 -51
  224. endoreg_db/data/_examples/finding_classification/intervention_required.yaml +0 -26
  225. endoreg_db/data/_examples/finding_classification/medication_related.yaml +0 -23
  226. endoreg_db/data/_examples/finding_classification/visualized.yaml +0 -33
  227. endoreg_db/data/_examples/finding_classification_choice/bowel_preparation.yaml +0 -78
  228. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_circularity_default.yaml +0 -32
  229. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_jnet.yaml +0 -15
  230. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_kudo.yaml +0 -23
  231. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_lst.yaml +0 -15
  232. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_nice.yaml +0 -17
  233. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_planarity_default.yaml +0 -49
  234. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_sano.yaml +0 -14
  235. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_surface_intact_default.yaml +0 -36
  236. endoreg_db/data/_examples/finding_classification_choice/colonoscopy_size.yaml +0 -82
  237. endoreg_db/data/_examples/finding_classification_choice/colonoscopy_summary_worst_finding.yaml +0 -15
  238. endoreg_db/data/_examples/finding_classification_choice/complication_generic_types.yaml +0 -15
  239. endoreg_db/data/_examples/finding_classification_choice/examination_setting_generic_types.yaml +0 -15
  240. endoreg_db/data/_examples/finding_classification_choice/histology.yaml +0 -24
  241. endoreg_db/data/_examples/finding_classification_choice/histology_polyp.yaml +0 -20
  242. endoreg_db/data/_examples/finding_classification_choice/outcome.yaml +0 -19
  243. endoreg_db/data/_examples/finding_classification_choice/yes_no_na.yaml +0 -11
  244. endoreg_db/data/_examples/finding_classification_type/colonoscopy_basic.yaml +0 -48
  245. endoreg_db/data/_examples/finding_intervention/endoscopy.yaml +0 -43
  246. endoreg_db/data/_examples/finding_intervention/endoscopy_colonoscopy.yaml +0 -168
  247. endoreg_db/data/_examples/finding_intervention/endoscopy_egd.yaml +0 -128
  248. endoreg_db/data/_examples/finding_intervention/endoscopy_ercp.yaml +0 -32
  249. endoreg_db/data/_examples/finding_intervention/endoscopy_eus_lower.yaml +0 -9
  250. endoreg_db/data/_examples/finding_intervention/endoscopy_eus_upper.yaml +0 -36
  251. endoreg_db/data/_examples/finding_intervention_type/endoscopy.yaml +0 -15
  252. endoreg_db/data/_examples/finding_type/data.yaml +0 -43
  253. endoreg_db/data/_examples/requirement/age.yaml +0 -26
  254. endoreg_db/data/_examples/requirement/gender.yaml +0 -25
  255. endoreg_db/data/_examples/requirement_set/01_endoscopy_generic.yaml +0 -48
  256. endoreg_db/data/_examples/requirement_set/colonoscopy_austria_screening.yaml +0 -57
  257. endoreg_db/data/_examples/requirement_set/endoscopy_bleeding_risk.yaml +0 -52
  258. endoreg_db/data/_examples/yaml_examples.xlsx +0 -0
  259. endoreg_db/data/finding/anatomy_colon.yaml +0 -128
  260. endoreg_db/data/finding/colonoscopy.yaml +0 -40
  261. endoreg_db/data/finding/colonoscopy_bowel_prep.yaml +0 -56
  262. endoreg_db/data/finding/complication.yaml +0 -16
  263. endoreg_db/data/finding/data.yaml +0 -105
  264. endoreg_db/data/finding/examination_setting.yaml +0 -16
  265. endoreg_db/data/finding/medication_related.yaml +0 -18
  266. endoreg_db/data/finding/outcome.yaml +0 -12
  267. endoreg_db/data/finding_classification/colonoscopy_jnet.yaml +0 -22
  268. endoreg_db/data/finding_classification/colonoscopy_kudo.yaml +0 -25
  269. endoreg_db/data/finding_classification/colonoscopy_lesion_circularity.yaml +0 -20
  270. endoreg_db/data/finding_classification/colonoscopy_lesion_planarity.yaml +0 -24
  271. endoreg_db/data/finding_classification/colonoscopy_lesion_size.yaml +0 -38
  272. endoreg_db/data/finding_classification/colonoscopy_lesion_surface.yaml +0 -20
  273. endoreg_db/data/finding_classification/colonoscopy_location.yaml +0 -49
  274. endoreg_db/data/finding_classification/colonoscopy_lst.yaml +0 -21
  275. endoreg_db/data/finding_classification/colonoscopy_nice.yaml +0 -20
  276. endoreg_db/data/finding_classification/colonoscopy_paris.yaml +0 -26
  277. endoreg_db/data/finding_classification/colonoscopy_sano.yaml +0 -22
  278. endoreg_db/data/finding_classification/colonoscopy_summary.yaml +0 -53
  279. endoreg_db/data/finding_classification/complication_generic.yaml +0 -25
  280. endoreg_db/data/finding_classification/examination_setting_generic.yaml +0 -40
  281. endoreg_db/data/finding_classification/histology_colo.yaml +0 -43
  282. endoreg_db/data/finding_classification/intervention_required.yaml +0 -26
  283. endoreg_db/data/finding_classification/medication_related.yaml +0 -23
  284. endoreg_db/data/finding_classification/visualized.yaml +0 -33
  285. endoreg_db/data/finding_classification_choice/colon_lesion_circularity_default.yaml +0 -32
  286. endoreg_db/data/finding_classification_choice/colon_lesion_jnet.yaml +0 -15
  287. endoreg_db/data/finding_classification_choice/colon_lesion_kudo.yaml +0 -23
  288. endoreg_db/data/finding_classification_choice/colon_lesion_lst.yaml +0 -15
  289. endoreg_db/data/finding_classification_choice/colon_lesion_nice.yaml +0 -17
  290. endoreg_db/data/finding_classification_choice/colon_lesion_paris.yaml +0 -57
  291. endoreg_db/data/finding_classification_choice/colon_lesion_planarity_default.yaml +0 -49
  292. endoreg_db/data/finding_classification_choice/colon_lesion_sano.yaml +0 -14
  293. endoreg_db/data/finding_classification_choice/colon_lesion_surface_intact_default.yaml +0 -36
  294. endoreg_db/data/finding_classification_choice/colonoscopy_location.yaml +0 -229
  295. endoreg_db/data/finding_classification_choice/colonoscopy_not_complete_reason.yaml +0 -19
  296. endoreg_db/data/finding_classification_choice/colonoscopy_size.yaml +0 -82
  297. endoreg_db/data/finding_classification_choice/colonoscopy_summary_worst_finding.yaml +0 -15
  298. endoreg_db/data/finding_classification_choice/outcome.yaml +0 -19
  299. endoreg_db/data/finding_intervention/endoscopy.yaml +0 -43
  300. endoreg_db/data/finding_intervention/endoscopy_colonoscopy.yaml +0 -168
  301. endoreg_db/data/finding_intervention/endoscopy_egd.yaml +0 -128
  302. endoreg_db/data/finding_intervention/endoscopy_ercp.yaml +0 -32
  303. endoreg_db/data/finding_intervention/endoscopy_eus_lower.yaml +0 -9
  304. endoreg_db/data/finding_intervention/endoscopy_eus_upper.yaml +0 -36
  305. endoreg_db/data/finding_morphology_classification_type/colonoscopy.yaml +0 -79
  306. endoreg_db/data/requirement/age.yaml +0 -26
  307. endoreg_db/data/requirement/colonoscopy_baseline_austria.yaml +0 -45
  308. endoreg_db/data/requirement/disease_cardiovascular.yaml +0 -79
  309. endoreg_db/data/requirement/disease_classification_choice_cardiovascular.yaml +0 -41
  310. endoreg_db/data/requirement/disease_hepatology.yaml +0 -12
  311. endoreg_db/data/requirement/disease_misc.yaml +0 -12
  312. endoreg_db/data/requirement/disease_renal.yaml +0 -96
  313. endoreg_db/data/requirement/endoscopy_bleeding_risk.yaml +0 -59
  314. endoreg_db/data/requirement/event_cardiology.yaml +0 -251
  315. endoreg_db/data/requirement/event_requirements.yaml +0 -145
  316. endoreg_db/data/requirement/finding_colon_polyp.yaml +0 -50
  317. endoreg_db/data/requirement/gender.yaml +0 -25
  318. endoreg_db/data/requirement/lab_value.yaml +0 -441
  319. endoreg_db/data/requirement/medication.yaml +0 -93
  320. endoreg_db/data/requirement_operator/age.yaml +0 -13
  321. endoreg_db/data/requirement_operator/lab_operators.yaml +0 -129
  322. endoreg_db/data/requirement_operator/model_operators.yaml +0 -96
  323. endoreg_db/management/commands/init_default_ai_model.py +0 -112
  324. endoreg_db/management/commands/reset_celery_schedule.py +0 -9
  325. endoreg_db/management/commands/validate_video.py +0 -204
  326. endoreg_db/migrations/0002_requirementset_depends_on.py +0 -18
  327. endoreg_db/migrations/_old/0001_initial.py +0 -1857
  328. endoreg_db/migrations/_old/0002_add_video_correction_models.py +0 -52
  329. endoreg_db/migrations/_old/0003_add_center_display_name.py +0 -30
  330. endoreg_db/migrations/_old/0004_employee_city_employee_post_code_employee_street_and_more.py +0 -68
  331. endoreg_db/migrations/_old/0004_remove_casetemplate_rules_and_more.py +0 -77
  332. endoreg_db/migrations/_old/0005_merge_20251111_1003.py +0 -14
  333. endoreg_db/migrations/_old/0006_sensitivemeta_anonymized_text_and_more.py +0 -68
  334. endoreg_db/migrations/_old/0007_remove_rule_attribute_dtype_remove_rule_rule_type_and_more.py +0 -89
  335. endoreg_db/migrations/_old/0008_remove_event_event_classification_and_more.py +0 -27
  336. endoreg_db/migrations/_old/0009_alter_modelmeta_options_and_more.py +0 -21
  337. endoreg_db/renames.yml +0 -8
  338. endoreg_db/serializers/_old/raw_pdf_meta_validation.py +0 -223
  339. endoreg_db/serializers/_old/raw_video_meta_validation.py +0 -179
  340. endoreg_db/serializers/_old/video.py +0 -71
  341. endoreg_db/serializers/meta/pdf_file_meta_extraction.py +0 -115
  342. endoreg_db/serializers/meta/report_meta.py +0 -53
  343. endoreg_db/serializers/report/__init__.py +0 -9
  344. endoreg_db/serializers/report/mixins.py +0 -45
  345. endoreg_db/serializers/report/report.py +0 -105
  346. endoreg_db/serializers/report/report_list.py +0 -22
  347. endoreg_db/serializers/report/secure_file_url.py +0 -26
  348. endoreg_db/services/requirements_object.py +0 -147
  349. endoreg_db/services/storage_aware_video_processor.py +0 -370
  350. endoreg_db/urls/files.py +0 -6
  351. endoreg_db/urls/label_video_segment_validate.py +0 -33
  352. endoreg_db/urls/label_video_segments.py +0 -46
  353. endoreg_db/views/label/__init__.py +0 -5
  354. endoreg_db/views/label/label.py +0 -15
  355. endoreg_db/views/label_video_segment/__init__.py +0 -16
  356. endoreg_db/views/label_video_segment/create_lvs_from_annotation.py +0 -44
  357. endoreg_db/views/label_video_segment/get_lvs_by_name_and_video.py +0 -50
  358. endoreg_db/views/label_video_segment/label_video_segment.py +0 -77
  359. endoreg_db/views/label_video_segment/label_video_segment_by_label.py +0 -174
  360. endoreg_db/views/label_video_segment/label_video_segment_detail.py +0 -73
  361. endoreg_db/views/label_video_segment/update_lvs_from_annotation.py +0 -46
  362. endoreg_db/views/label_video_segment/validate.py +0 -226
  363. endoreg_db/views/media/segments.py +0 -71
  364. endoreg_db/views/meta/available_files_list.py +0 -146
  365. endoreg_db/views/meta/report_meta.py +0 -53
  366. endoreg_db/views/meta/sensitive_meta_detail.py +0 -85
  367. endoreg_db/views/misc/secure_file_serving_view.py +0 -80
  368. endoreg_db/views/misc/secure_file_url_view.py +0 -84
  369. endoreg_db/views/misc/secure_url_validate.py +0 -79
  370. endoreg_db/views/patient_examination/DEPRECATED_video_backup.py +0 -164
  371. endoreg_db/views/patient_finding_location/__init__.py +0 -5
  372. endoreg_db/views/patient_finding_location/pfl_create.py +0 -70
  373. endoreg_db/views/patient_finding_morphology/__init__.py +0 -5
  374. endoreg_db/views/patient_finding_morphology/pfm_create.py +0 -70
  375. endoreg_db/views/pdf/__init__.py +0 -8
  376. endoreg_db/views/video/segmentation.py +0 -274
  377. endoreg_db/views/video/task_status.py +0 -49
  378. endoreg_db/views/video/timeline.py +0 -46
  379. endoreg_db/views/video/video_analyze.py +0 -52
  380. /endoreg_db/data/requirement/{colon_polyp_intervention.yaml → old/colon_polyp_intervention.yaml} +0 -0
  381. /endoreg_db/data/{_examples/requirement → requirement/old}/colonoscopy_baseline_austria.yaml +0 -0
  382. /endoreg_db/data/requirement/{coloreg_colon_polyp.yaml → old/coloreg_colon_polyp.yaml} +0 -0
  383. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_cardiovascular.yaml +0 -0
  384. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_classification_choice_cardiovascular.yaml +0 -0
  385. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_hepatology.yaml +0 -0
  386. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_misc.yaml +0 -0
  387. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_renal.yaml +0 -0
  388. /endoreg_db/data/{_examples/requirement → requirement/old}/endoscopy_bleeding_risk.yaml +0 -0
  389. /endoreg_db/data/{_examples/requirement → requirement/old}/event_cardiology.yaml +0 -0
  390. /endoreg_db/data/{_examples/requirement → requirement/old}/event_requirements.yaml +0 -0
  391. /endoreg_db/data/{_examples/requirement → requirement/old}/finding_colon_polyp.yaml +0 -0
  392. /endoreg_db/{migrations/__init__.py → data/requirement/old/gender.yaml} +0 -0
  393. /endoreg_db/data/{_examples/requirement → requirement/old}/lab_value.yaml +0 -0
  394. /endoreg_db/data/{_examples/requirement → requirement/old}/medication.yaml +0 -0
  395. /endoreg_db/data/{_examples/requirement_operator → requirement_operator/_old}/age.yaml +0 -0
  396. /endoreg_db/data/{_examples/requirement_operator → requirement_operator/_old}/lab_operators.yaml +0 -0
  397. /endoreg_db/data/{_examples/requirement_operator → requirement_operator/_old}/model_operators.yaml +0 -0
  398. /endoreg_db/{urls/sensitive_meta.py → import_files/pseudonymization/__init__.py} +0 -0
  399. /endoreg_db/{views/pdf/pdf_stream_views.py → import_files/pseudonymization/pseudonymize.py} +0 -0
  400. /endoreg_db/utils/requirement_operator_logic/{lab_value_operators.py → _old/lab_value_operators.py} +0 -0
  401. /endoreg_db/utils/requirement_operator_logic/{model_evaluators.py → _old/model_evaluators.py} +0 -0
  402. {endoreg_db-0.8.8.0.dist-info → endoreg_db-0.8.8.9.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1487 @@
1
+ """
2
+ report import service module.
3
+
4
+ Provides high-level functions for importing and anonymizing report files,
5
+ combining RawPdfFile creation with text extraction and anonymization using lx anonymizer.
6
+
7
+ All Fields should be overwritten from anonymizer defaults except for the center which is given.
8
+ """
9
+
10
+ import errno
11
+ import hashlib
12
+ import logging
13
+ import os
14
+ import shutil
15
+ import subprocess
16
+ import sys
17
+ import time
18
+ from contextlib import contextmanager
19
+ from datetime import date, datetime
20
+ from pathlib import Path
21
+ from typing import TYPE_CHECKING, Union
22
+
23
+ import lx_anonymizer
24
+ from django.core.exceptions import ObjectDoesNotExist
25
+ from django.db import transaction
26
+
27
+ from endoreg_db.models import SensitiveMeta
28
+ from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
29
+ from endoreg_db.models.state.raw_pdf import RawPdfState
30
+ from endoreg_db.utils import paths as path_utils
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Treat lock files older than this as stale and reclaim them (in seconds)
35
+ STALE_LOCK_SECONDS = 600
36
+
37
+ if TYPE_CHECKING:
38
+ pass # RawPdfFile already imported above
39
+
40
+
41
+ class PdfImportService:
42
+ """
43
+ Service class for importing and processing report files with text extraction and anonymization.
44
+ Uses a central report instance pattern for cleaner state management.
45
+
46
+ Supports two processing modes:
47
+ - 'blackening': Simple report masking with black rectangles over sensitive areas
48
+ - 'cropping': Advanced mode that crops sensitive regions to separate images
49
+ """
50
+
51
+ def __init__(
52
+ self, allow_meta_overwrite: bool = True, processing_mode: str = "blackening"
53
+ ):
54
+ """
55
+ Initialize the report import service.
56
+
57
+ Args:
58
+ allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
59
+ processing_mode: Processing mode - 'blackening' for simple masking, 'cropping' for advanced cropping
60
+ """
61
+ self.processed_files = set()
62
+ self._report_reader_available = None
63
+ self._report_reader_class = None
64
+ self.allow_meta_overwrite = allow_meta_overwrite
65
+
66
+ # Validate and set processing mode
67
+ valid_modes = ["blackening", "cropping"]
68
+ if processing_mode not in valid_modes:
69
+ raise ValueError(
70
+ f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}"
71
+ )
72
+ self.processing_mode = processing_mode
73
+
74
+ # Central report instance management
75
+ self.current_pdf = None
76
+ self.current_pdf_state = None
77
+ self.processing_context = {}
78
+ self.original_path = None
79
+
80
+ self.DEFAULT_PATIENT_FIRST_NAME = "Patient"
81
+ self.DEFAULT_PATIENT_LAST_NAME = "Unknown"
82
+ self.DEFAULT_PATIENT_DOB = date(1990, 1, 1)
83
+ self.DEFAULT_CENTER_NAME = "university_hospital_wuerzburg"
84
+
85
    def import_and_anonymize(
        self,
        file_path: Union[Path, str],
        center_name: str,
        delete_source: bool = False,
        retry: bool = False,
    ) -> "RawPdfFile | None":
        """
        Import a report file and anonymize it using ReportReader.
        Uses centralized report instance management pattern.

        The processing mode is determined by the service initialization:
        - 'blackening': Creates an anonymized report with black rectangles over sensitive regions
        - 'cropping': Advanced mode that crops sensitive regions to separate images

        Args:
            file_path: Path to the report file to import
            center_name: Name of the center to associate with report
            delete_source: Whether to delete the source file after import
            retry: Whether this is a retry attempt

        Returns:
            RawPdfFile instance after import and processing, or None when the
            file is already being processed in the current session.

        Raises:
            Exception: On any failure during import or processing
        """
        try:
            # Initialize processing context (raises ValueError when this exact
            # path is already in flight within the current session).
            self._initialize_processing_context(
                file_path, center_name, delete_source, retry
            )

            # Step 1: Validate and prepare file
            self._validate_and_prepare_file()

            # Step 2: Create or retrieve report instance
            self._create_or_retrieve_pdf_instance()

            # Early return check - if no report instance was created, return None
            if not self.current_pdf:
                logger.warning(
                    f"No report instance created for {file_path}, returning None"
                )
                # NOTE(review): despite the "return None" comment above, this
                # raises ObjectDoesNotExist; callers see an exception instead.
                raise ObjectDoesNotExist
            # Step 3: Setup processing environment
            self._setup_processing_environment()

            # Step 4: Process text and metadata
            self._process_text_and_metadata()

            # Step 5: Finalize processing
            self._finalize_processing()

            return self.current_pdf

        except ValueError as e:
            # Handle "File already being processed" case specifically
            if "already being processed" in str(e):
                logger.info(f"Skipping file {file_path}: {e}")
                return
            else:
                logger.error(f"report import failed for {file_path}: {e}")
                self._cleanup_on_error()
                raise
        except Exception as e:
            logger.error(f"report import failed for {file_path}: {e}")
            # Cleanup on error
            self._cleanup_on_error()
            raise
        finally:
            # Always cleanup context
            self._cleanup_processing_context()
158
+
159
+ def _initialize_processing_context(
160
+ self,
161
+ file_path: Union[Path, str],
162
+ center_name: str,
163
+ delete_source: bool,
164
+ retry: bool,
165
+ ):
166
+ """Initialize the processing context for the current report."""
167
+ self.processing_context = {
168
+ "file_path": Path(file_path),
169
+ "original_file_path": Path(file_path),
170
+ "center_name": center_name,
171
+ "delete_source": delete_source,
172
+ "retry": retry,
173
+ "file_hash": None,
174
+ "processing_started": False,
175
+ "text_extracted": False,
176
+ "metadata_processed": False,
177
+ "anonymization_completed": False,
178
+ }
179
+ self.original_path = Path(file_path)
180
+
181
+ # Check if already processed (only during current session to prevent race conditions)
182
+ if str(file_path) in self.processed_files:
183
+ logger.info(
184
+ f"File {file_path} already being processed in current session, skipping"
185
+ )
186
+ raise ValueError("File already being processed")
187
+
188
+ logger.info(f"Starting import and processing for: {file_path}")
189
+
190
+ @classmethod
191
+ def with_blackening(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
192
+ """
193
+ Create a PdfImportService configured for simple report blackening mode.
194
+
195
+ Args:
196
+ allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
197
+
198
+ Returns:
199
+ PdfImportService instance configured for blackening mode
200
+ """
201
+ return cls(
202
+ allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening"
203
+ )
204
+
205
+ @classmethod
206
+ def with_cropping(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
207
+ """
208
+ Create a PdfImportService configured for advanced cropping mode.
209
+
210
+ Args:
211
+ allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
212
+
213
+ Returns:
214
+ PdfImportService instance configured for cropping mode
215
+ """
216
+ return cls(
217
+ allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping"
218
+ )
219
+
220
    @contextmanager
    def _file_lock(self, path: Path):
        """Create a file lock to prevent duplicate processing.

        A sibling ``<path>.lock`` file is created atomically
        (O_CREAT | O_EXCL); if it already exists another worker owns the file.
        Lock files older than STALE_LOCK_SECONDS are treated as stale,
        removed, and re-acquired. The lock file is removed on exit.

        Raises:
            ValueError: If the file is locked by another (live) worker.
        """
        lock_path = Path(str(path) + ".lock")
        fd = None
        try:
            try:
                # atomic create; fail if exists
                fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
            except FileExistsError:
                # Check for stale lock
                age = None
                try:
                    st = os.stat(lock_path)
                    age = time.time() - st.st_mtime
                except FileNotFoundError:
                    # race: lock removed between exists and stat; just retry acquiring below
                    # NOTE(review): when age stays None, control actually falls
                    # into the else branch below and raises instead of retrying.
                    pass

                if age is not None and age > STALE_LOCK_SECONDS:
                    try:
                        logger.warning(
                            "Stale lock detected for %s (age %.0fs). Reclaiming lock...",
                            path,
                            age,
                        )
                        lock_path.unlink()
                    except Exception as e:
                        logger.warning(
                            "Failed to remove stale lock %s: %s", lock_path, e
                        )
                    # retry acquire; may still raise FileExistsError if another
                    # worker reclaimed first, which propagates to the caller.
                    fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
                else:
                    # Another worker is processing this file

                    raise ValueError(f"File already being processed: {path}")

            os.write(fd, b"lock")
            os.close(fd)
            fd = None
            yield
        finally:
            # Best-effort cleanup: close a leaked descriptor and drop the lock
            # file; OSError is deliberately swallowed here.
            try:
                if fd is not None:
                    os.close(fd)
                if lock_path.exists():
                    lock_path.unlink()
            except OSError:
                pass
272
+
273
+ def _get_pdf_dir(self) -> Path | None:
274
+ """Resolve the configured report directory to a concrete Path."""
275
+ candidate = getattr(path_utils, "REPORT_DIR", None)
276
+ if isinstance(candidate, Path):
277
+ return candidate
278
+ if candidate is None:
279
+ return None
280
+ try:
281
+ derived = candidate / "."
282
+ except Exception:
283
+ derived = None
284
+
285
+ if derived is not None:
286
+ try:
287
+ return Path(derived)
288
+ except Exception:
289
+ return None
290
+
291
+ try:
292
+ return Path(str(candidate))
293
+ except Exception:
294
+ return None
295
+
296
+ def _quarantine(self, source: Path) -> Path:
297
+ """Move file to quarantine directory to prevent re-processing."""
298
+ qdir = path_utils.REPORT_DIR / "_processing"
299
+ qdir.mkdir(parents=True, exist_ok=True)
300
+ target = qdir / source.name
301
+ try:
302
+ # Try atomic rename first (fastest when on same filesystem)
303
+ source.rename(target)
304
+ except OSError as exc:
305
+ if exc.errno == errno.EXDEV:
306
+ # Cross-device move, fall back to shutil.move which copies+removes
307
+ shutil.move(str(source), str(target))
308
+ else:
309
+ raise
310
+ lock_path = Path(str(source) + ".lock")
311
+ if lock_path.exists():
312
+ lock_path.unlink()
313
+
314
+ return target
315
+
316
+ def _ensure_state(self, pdf_file: "RawPdfFile"):
317
+ """Ensure report file has a state object."""
318
+ if getattr(pdf_file, "state", None):
319
+ return pdf_file.state
320
+ if hasattr(pdf_file, "get_or_create_state"):
321
+ state = pdf_file.get_or_create_state()
322
+ pdf_file.state = state
323
+ self.current_pdf_state = state
324
+ assert isinstance(self.current_pdf_state, RawPdfState)
325
+ return state
326
+
327
    def _ensure_report_reading_available(self):
        """
        Ensure report reading modules are available by adding lx-anonymizer to path.

        Resolution order: direct ``import lx_anonymizer``; on ImportError, if
        the LX_ANONYMIZER_PATH environment variable points to an existing
        directory, that path is prepended to ``sys.path`` and the import is
        retried. The outcome is cached on the instance so resolution runs once.

        Returns:
            Tuple of (availability_flag, ReportReader_class); the class is
            None when unavailable.
        """
        if self._report_reader_available is not None:
            return self._report_reader_available, self._report_reader_class

        try:
            # Try direct import first
            from lx_anonymizer import ReportReader

            logger.info("Successfully imported lx_anonymizer ReportReader module")
            self._report_reader_available = True
            self._report_reader_class = ReportReader
            return True, ReportReader

        except ImportError:
            # Optional: honor LX_ANONYMIZER_PATH=/abs/path/to/src
            import importlib

            extra = os.getenv("LX_ANONYMIZER_PATH")
            if extra and extra not in sys.path and Path(extra).exists():
                sys.path.insert(0, extra)
                try:
                    mod = importlib.import_module("lx_anonymizer")
                    ReportReader = getattr(mod, "ReportReader")
                    logger.info(
                        "Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH"
                    )
                    self._report_reader_available = True
                    self._report_reader_class = ReportReader
                    return True, ReportReader
                except Exception as e:
                    logger.warning(
                        "Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e
                    )
                finally:
                    # Keep path for future imports if it worked; otherwise remove.
                    # NOTE(review): probing locals() for "ReportReader" is a
                    # fragile success test — it relies on the name only being
                    # bound when the import succeeded.
                    if "ReportReader" not in locals() and extra in sys.path:
                        sys.path.remove(extra)

            self._report_reader_available = False
            self._report_reader_class = None
            return False, None
374
+
375
+ def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile") -> None:
376
+ """
377
+ Ensure report has minimum required patient data in SensitiveMeta.
378
+ Creates default values if data is missing after text processing.
379
+ Uses the central report instance if no specific instance provided.
380
+
381
+ Args:
382
+ pdf_instance: Optional specific report instance, defaults to self.current_pdf
383
+ """
384
+ pdf_file = pdf_instance or self.current_pdf
385
+ if not pdf_file:
386
+ logger.warning(
387
+ "No report instance available for ensuring default patient data"
388
+ )
389
+ return
390
+
391
+ if not pdf_file.sensitive_meta:
392
+ logger.info(
393
+ f"No SensitiveMeta found for report {pdf_file.pdf_hash}, creating default"
394
+ )
395
+
396
+ # Create default SensitiveMeta with placeholder data
397
+ default_data = {
398
+ "patient_first_name": self.DEFAULT_PATIENT_FIRST_NAME,
399
+ "patient_last_name": self.DEFAULT_PATIENT_LAST_NAME,
400
+ "patient_dob": self.DEFAULT_PATIENT_DOB,
401
+ "examination_date": date.today(), # today is intentionally *not* a constant
402
+ "center_name": (
403
+ pdf_file.center.name
404
+ if pdf_file.center
405
+ else self.DEFAULT_CENTER_NAME
406
+ ),
407
+ }
408
+
409
+ try:
410
+ sensitive_meta = SensitiveMeta.create_from_dict(default_data)
411
+ pdf_file.sensitive_meta = sensitive_meta
412
+ pdf_file.save(update_fields=["sensitive_meta"])
413
+ logger.info(
414
+ f"Created default SensitiveMeta for report {pdf_file.pdf_hash}"
415
+ )
416
+ except Exception as e:
417
+ logger.error(
418
+ f"Failed to create default SensitiveMeta for report {pdf_file.pdf_hash}: {e}"
419
+ )
420
+
421
+ def _validate_and_prepare_file(self):
422
+ """Validate file existence and calculate hash."""
423
+ file_path = self.processing_context["file_path"]
424
+
425
+ if not file_path.exists():
426
+ raise FileNotFoundError(f"report file not found: {file_path}")
427
+
428
    def _create_or_retrieve_pdf_instance(self):
        """Create new or retrieve existing report instance.

        Sets ``self.current_pdf``. Deduplication depends on
        ``processing_context["file_hash"]``; when that is None the hash lookup
        is skipped and a new instance is always created.
        """
        file_path = self.processing_context["file_path"]
        center_name = self.processing_context["center_name"]
        delete_source = self.processing_context["delete_source"]
        retry = self.processing_context["retry"]
        file_hash = self.processing_context["file_hash"]

        if not retry:
            # Check for existing report and handle duplicates
            with self._file_lock(file_path):
                existing = None
                # NOTE(review): exists() followed by get() issues two queries;
                # filter(...).first() would achieve the same in one.
                if file_hash and RawPdfFile.objects.filter(pdf_hash=file_hash).exists():
                    existing = RawPdfFile.objects.get(pdf_hash=file_hash)

                if existing:
                    logger.info(f"Found existing RawPdfFile {existing.pdf_hash}")
                    if existing.text:
                        logger.info(
                            f"Existing report {existing.pdf_hash} already processed - returning"
                        )
                        self.current_pdf = existing
                        return
                    else:
                        # Retry processing
                        logger.info(f"Reprocessing existing report {existing.pdf_hash}")
                        return self._retry_existing_pdf(existing)

        # Create new report instance
        logger.info("Creating new RawPdfFile instance...")
        from django.db import IntegrityError

        try:
            if not retry:
                self.current_pdf = RawPdfFile.create_from_file_initialized(
                    file_path=file_path,
                    center_name=center_name,
                    delete_source=delete_source,
                )
            else:
                # Retrieve existing for retry
                self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
                logger.info(
                    f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}"
                )

                # Check if retry is actually needed
                if self.current_pdf.text:
                    logger.info(
                        f"Existing report {self.current_pdf.pdf_hash} already processed during retry - returning"
                    )
                    return

            if not self.current_pdf:
                raise RuntimeError("Failed to create RawPdfFile instance")

            logger.info(f"report instance ready: {self.current_pdf.pdf_hash}")

        except IntegrityError:
            # Race condition - another worker created it
            if file_hash:
                self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
                logger.info("Race condition detected, using existing RawPdfFile")
            else:
                raise
493
+
494
    def _setup_processing_environment(self):
        """Setup processing environment and state.

        Re-resolves the report by hash if the context was lost, creates the
        sensitive file copy, points the context's file_path at that copy,
        marks processing as started, and seeds default patient data.
        """
        original_path = self.processing_context.get("file_path")
        if not original_path or not self.current_pdf:
            # Fallback: recover the instance via the recorded content hash.
            try:
                self.current_pdf = RawPdfFile.objects.get(
                    pdf_hash=self.processing_context["file_hash"]
                )
                self.original_path = Path(str(self.current_pdf.file.path))

            except RawPdfFile.DoesNotExist:
                raise RuntimeError("Processing environment setup failed")
        # Create sensitive file copy
        if original_path is None or not isinstance(original_path, (str, Path)):
            logger.error(f"No original path: {original_path!r}")
            return
        # NOTE(review): create_sensitive_file is defined outside this excerpt.
        self.create_sensitive_file(self.current_pdf, original_path)

        # Update file path to point to sensitive copy
        self.processing_context["file_path"] = self.current_pdf.file.path
        self.processing_context["sensitive_copy_created"] = True
        try:
            self.processing_context["sensitive_file_path"] = Path(
                self.current_pdf.file.path
            )
        except Exception:
            self.processing_context["sensitive_file_path"] = None

        # Ensure state exists
        state = self.current_pdf.get_or_create_state()
        state.mark_processing_started()
        self.processing_context["processing_started"] = True

        # Mark as processed to prevent duplicates
        self.processed_files.add(str(self.processing_context["file_path"]))

        # Ensure default patient data
        logger.info("Ensuring default patient data...")
        self._ensure_default_patient_data(self.current_pdf)
533
+
534
+ def _process_text_and_metadata(self):
535
+ """Process text extraction and metadata using ReportReader."""
536
+ report_reading_available, ReportReaderCls = (
537
+ self._ensure_report_reading_available()
538
+ )
539
+ try:
540
+ assert ReportReaderCls is not None and report_reading_available
541
+ assert self.current_pdf is not None
542
+ except AssertionError as e:
543
+ logger.error(
544
+ f"report Import failed on Error:{e} Ensure the pdf was passed correctly and report reading is available in function _process_text_and_metadata() "
545
+ )
546
+ if not report_reading_available:
547
+ logger.warning("Report reading not available (lx_anonymizer not found)")
548
+ self._mark_processing_incomplete("no_report_reader")
549
+ return
550
+ assert self.current_pdf is not None
551
+ if not self.current_pdf.file:
552
+ logger.warning("No file available for text processing")
553
+ self._mark_processing_incomplete("no_file")
554
+ return
555
+
556
+ try:
557
+ logger.info(
558
+ f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})..."
559
+ )
560
+ ReportReaderCls = lx_anonymizer.ReportReader
561
+
562
+ # Initialize ReportReader
563
+ report_reader = ReportReaderCls(
564
+ report_root_path=str(path_utils.STORAGE_DIR),
565
+ locale="de_DE",
566
+ text_date_format="%d.%m.%Y",
567
+ )
568
+
569
+ if self.processing_mode == "cropping":
570
+ # Use advanced cropping method (existing implementation)
571
+ self._process_with_cropping(report_reader)
572
+ else: # blackening mode
573
+ # Use enhanced process_report with report masking
574
+ self._process_with_blackening(report_reader)
575
+
576
+ except Exception as e:
577
+ logger.warning(f"Text processing failed: {e}")
578
+ self._mark_processing_incomplete("text_processing_failed")
579
+
580
    def _process_with_blackening(self, report_reader):
        """Process report using simple blackening/masking mode.

        Runs ReportReader.process_report, stores its 4-tuple result in the
        processing context, then applies text, metadata, and anonymized-PDF
        results to the current report instance.

        Args:
            report_reader: An initialized lx_anonymizer ReportReader.
        """
        logger.info("Using simple report blackening mode...")

        # Setup anonymized directory
        anonymized_dir = path_utils.REPORT_DIR / "anonymized"
        anonymized_dir.mkdir(parents=True, exist_ok=True)
        assert self.current_pdf is not None
        # Generate output path for anonymized report
        pdf_hash = self.current_pdf.pdf_hash
        anonymized_output_path = anonymized_dir / f"{pdf_hash}_anonymized.pdf"

        # Process with enhanced process_report method (returns 4-tuple now)
        original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = (
            report_reader.process_report(
                pdf_path=self.processing_context["file_path"],
                create_anonymized_pdf=True,
                anonymized_pdf_output_path=str(anonymized_output_path),
            )
        )

        # Store results in context
        self.processing_context.update(
            {
                "original_text": original_text,
                "anonymized_text": anonymized_text,
                "extracted_metadata": extracted_metadata,
                "cropped_regions": None,  # Not available in blackening mode
                "anonymized_pdf_path": anonymized_pdf_path,
            }
        )

        # Apply results
        if original_text:
            self._apply_text_results()
            self.processing_context["text_extracted"] = True

        if extracted_metadata:
            self._apply_metadata_results()
            self.processing_context["metadata_processed"] = True

        if anonymized_pdf_path:
            self._apply_anonymized_pdf()
            self.processing_context["anonymization_completed"] = True

        logger.info("report blackening processing completed")
626
+
627
    def _process_with_cropping(self, report_reader):
        """Process report using advanced cropping mode (existing implementation).

        Runs ReportReader.process_report_with_cropping, stores its 5-tuple
        result in the processing context, then applies text, metadata, and
        anonymized-PDF results to the current report instance.

        Args:
            report_reader: An initialized lx_anonymizer ReportReader.
        """
        logger.info("Using advanced cropping mode...")

        # Setup output directories
        crops_dir = path_utils.REPORT_DIR / "cropped_regions"
        anonymized_dir = path_utils.REPORT_DIR / "anonymized"
        crops_dir.mkdir(parents=True, exist_ok=True)
        anonymized_dir.mkdir(parents=True, exist_ok=True)

        # Process with cropping (returns 5-tuple)
        (
            original_text,
            anonymized_text,
            extracted_metadata,
            cropped_regions,
            anonymized_pdf_path,
        ) = report_reader.process_report_with_cropping(
            pdf_path=self.processing_context["file_path"],
            crop_sensitive_regions=True,
            crop_output_dir=str(crops_dir),
            anonymization_output_dir=str(anonymized_dir),
        )

        # Store results in context
        self.processing_context.update(
            {
                "original_text": original_text,
                "anonymized_text": anonymized_text,
                "extracted_metadata": extracted_metadata,
                "cropped_regions": cropped_regions,
                "anonymized_pdf_path": anonymized_pdf_path,
            }
        )

        # Apply results
        if original_text:
            self._apply_text_results()
            self.processing_context["text_extracted"] = True

        if extracted_metadata:
            self._apply_metadata_results()
            self.processing_context["metadata_processed"] = True

        if anonymized_pdf_path:
            self._apply_anonymized_pdf()
            self.processing_context["anonymization_completed"] = True

        logger.info("report cropping processing completed")
676
+
677
+ def _apply_text_results(self):
678
+ """Apply text extraction results to the report instance."""
679
+ if not self.current_pdf:
680
+ logger.warning("Cannot apply text results - no report instance available")
681
+ return
682
+
683
+ original_text = self.processing_context.get("original_text")
684
+ anonymized_text = self.processing_context.get("anonymized_text")
685
+
686
+ if not original_text:
687
+ logger.warning("No original text available to apply")
688
+ return
689
+
690
+ # Store extracted text
691
+ self.current_pdf.text = original_text
692
+ logger.info(f"Extracted {len(original_text)} characters of text from report")
693
+
694
+ # Handle anonymized text
695
+ if anonymized_text and anonymized_text != original_text:
696
+ self.current_pdf.state.anonymization_status.mark_anonymized()
697
+ logger.info("report text anonymization completed")
698
+
699
    def _apply_metadata_results(self):
        """Apply metadata extraction results to SensitiveMeta.

        Maps ReportReader metadata keys onto SensitiveMeta fields, parsing
        date fields and skipping values that merely echo the field name back.
        Existing values are only overwritten when allow_meta_overwrite is set
        or the current value is a placeholder.
        """
        if not self.current_pdf:
            logger.warning(
                "Cannot apply metadata results - no report instance available"
            )
            return

        extracted_metadata = self.processing_context.get("extracted_metadata")

        if not self.current_pdf.sensitive_meta or not extracted_metadata:
            logger.debug("No sensitive meta or extracted metadata available")
            return

        sm = self.current_pdf.sensitive_meta

        # Map ReportReader metadata to SensitiveMeta fields
        metadata_mapping = {
            "patient_first_name": "patient_first_name",
            "patient_last_name": "patient_last_name",
            "patient_dob": "patient_dob",
            "examination_date": "examination_date",
            "examiner_first_name": "examiner_first_name",
            "examiner_last_name": "examiner_last_name",
            "endoscope_type": "endoscope_type",
            "casenumber": "casenumber",
            "center_name": "center_name",
        }

        # Update fields with extracted information
        updated_fields = []
        for meta_key, sm_field in metadata_mapping.items():
            if extracted_metadata.get(meta_key) and hasattr(sm, sm_field):
                old_value = getattr(sm, sm_field)
                raw_value = extracted_metadata[meta_key]

                # Skip if we just got the field name as a string (indicates no actual data)
                if isinstance(raw_value, str) and raw_value == meta_key:
                    continue

                # Handle date fields specially
                if sm_field in ["patient_dob", "examination_date"]:
                    new_value = self._parse_date_field(raw_value, meta_key, sm_field)
                    if new_value is None:
                        continue
                else:
                    new_value = raw_value

                # Configurable overwrite policy
                # NOTE(review): _is_placeholder_value is defined outside this excerpt.
                should_overwrite = (
                    self.allow_meta_overwrite
                    or self._is_placeholder_value(sm_field, old_value)
                )

                if new_value and should_overwrite:
                    setattr(sm, sm_field, new_value)
                    updated_fields.append(sm_field)

        if updated_fields:
            sm.save()
            logger.info(f"Updated SensitiveMeta fields: {updated_fields}")
760
+
761
+ def _parse_date_field(self, raw_value, meta_key, sm_field):
762
+ """Parse date field with error handling."""
763
+ try:
764
+ if isinstance(raw_value, str):
765
+ # Skip if the value is just the field name itself
766
+ if raw_value == meta_key:
767
+ logger.warning(
768
+ "Skipping date field %s - got field name '%s' instead of actual date",
769
+ sm_field,
770
+ raw_value,
771
+ )
772
+ return None
773
+
774
+ # Try common date formats
775
+ date_formats = ["%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y"]
776
+ for fmt in date_formats:
777
+ try:
778
+ return datetime.strptime(raw_value, fmt).date()
779
+ except ValueError:
780
+ continue
781
+
782
+ logger.warning(
783
+ "Could not parse date '%s' for field %s", raw_value, sm_field
784
+ )
785
+ return None
786
+
787
+ elif hasattr(raw_value, "date"):
788
+ return raw_value.date()
789
+ else:
790
+ return raw_value
791
+
792
+ except (ValueError, AttributeError) as e:
793
+ logger.warning("Date parsing failed for %s: %s", sm_field, e)
794
+ return None
795
+
796
+ # from gc-08
797
    def _apply_anonymized_pdf(self):
        """
        Attach the already-generated anonymized report without copying bytes.

        We do NOT re-upload or re-save file bytes via Django storage (which would
        place a new file under upload_to='raw_pdfs' and retrigger the watcher).
        Instead, we point the FileField to the path that the anonymizer already
        wrote (ideally relative to STORAGE_DIR). Additionally, we make sure the
        model/state reflect that anonymization is done even if text didn't change.
        """
        if not self.current_pdf:
            logger.warning(
                "Cannot apply anonymized report - no report instance available"
            )
            return

        anonymized_pdf_path = self.processing_context.get("anonymized_pdf_path")
        if not anonymized_pdf_path:
            logger.debug("No anonymized_pdf_path present in processing context")
            return

        anonymized_path = Path(anonymized_pdf_path)
        if not anonymized_path.exists():
            logger.warning(
                "Anonymized report path returned but file does not exist: %s",
                anonymized_path,
            )
            return

        logger.info("Anonymized report created by ReportReader at: %s", anonymized_path)

        try:
            # Prefer storing a path relative to STORAGE_DIR so Django serves it correctly
            try:
                relative_name = str(anonymized_path.relative_to(path_utils.STORAGE_DIR))
            except ValueError:
                # Fallback to absolute path if the file lives outside STORAGE_DIR
                relative_name = str(anonymized_path)

            # Only update if something actually changed
            if getattr(self.current_pdf.processed_file, "name", None) != relative_name:
                self.current_pdf.processed_file.name = relative_name

            # Ensure model/state reflect anonymization even if text didn't differ
            # NOTE(review): direct .state access assumes a state object exists
            # here; any AttributeError is absorbed by the outer except below.
            if not getattr(self.current_pdf, "anonymized", False):
                self.current_pdf.state.anonymization_status.mark_anonymized()

            # Persist cropped regions info somewhere useful (optional & non-breaking)
            # If your model has a field for this, persist there; otherwise we just log.
            cropped_regions = self.processing_context.get("cropped_regions")
            if cropped_regions:
                logger.debug(
                    "Cropped regions recorded (%d regions).", len(cropped_regions)
                )

            # Save model changes
            update_fields = ["processed_file"]
            if "anonymized" in self.current_pdf.__dict__:
                update_fields.append("anonymized")
            self.current_pdf.save(update_fields=update_fields)

            # Mark state as anonymized immediately; this keeps downstream flows working
            state = self._ensure_state(self.current_pdf)

            if state and not state.processing_started:
                state.mark_processing_started()

            logger.info(
                "Updated processed_file reference to: %s",
                self.current_pdf.processed_file.name,
            )

        except Exception as e:
            # Best-effort: failure to attach the reference is logged, not raised.
            logger.warning("Could not set anonymized file reference: %s", e)
871
+
872
    def _finalize_processing(self):
        """Finalize processing and update state.

        Translates the progress flags accumulated in the processing context
        into state milestones, then persists the model and state in one
        transaction. Failures are logged, not raised.
        """
        if not self.current_pdf:
            logger.warning("Cannot finalize processing - no report instance available")
            return

        try:
            # Update state based on processing results
            state = self._ensure_state(self.current_pdf)

            if self.processing_context.get("text_extracted") and state:
                state.mark_anonymized()

            # Mark as ready for validation after successful anonymization
            if self.processing_context.get("anonymization_completed") and state:
                state.mark_sensitive_meta_processed()
                logger.info(
                    f"report {self.current_pdf.pdf_hash} processing completed - "
                    f"ready for validation (status: {state.anonymization_status})"
                )

            # Save all changes
            with transaction.atomic():
                self.current_pdf.save()
                if state:
                    state.save()

            logger.info("report processing completed successfully")
        except Exception as e:
            logger.warning(f"Failed to finalize processing: {e}")
902
+
903
    def _mark_processing_incomplete(self, reason: str):
        """Mark processing as incomplete with reason.

        Clears the extraction/processing flags on the state object and saves
        the report. Errors are logged rather than raised so the surrounding
        import flow keeps its best-effort behavior.

        Args:
            reason: Short tag describing why processing stopped (log-only).
        """
        if not self.current_pdf:
            logger.warning(
                f"Cannot mark processing incomplete - no report instance available. Reason: {reason}"
            )
            return

        try:
            state = self._ensure_state(self.current_pdf)
            if state:
                state.text_meta_extracted = False
                state.pdf_meta_extracted = False
                state.sensitive_meta_processed = False
                state.save()
                logger.info(f"Set report state: processed=False due to {reason}")

            # Save changes
            with transaction.atomic():
                self.current_pdf.save()
        except Exception as e:
            logger.warning(f"Failed to mark processing incomplete: {e}")
925
+
926
+ def _retry_existing_pdf(self, existing_pdf):
927
+ """
928
+ Retry processing for existing report.
929
+
930
+ Uses get_raw_file_path() to find the original raw file instead of
931
+ relying on the file field which may point to a deleted sensitive file.
932
+ """
933
+ try:
934
+ # ✅ FIX: Use get_raw_file_path() to find original file
935
+ raw_file_path = existing_pdf.get_raw_file_path()
936
+
937
+ if not raw_file_path or not raw_file_path.exists():
938
+ logger.error(
939
+ f"Cannot retry report {existing_pdf.pdf_hash}: Raw file not found. "
940
+ f"Please re-upload the original report file."
941
+ )
942
+ self.current_pdf = existing_pdf
943
+ return existing_pdf
944
+
945
+ logger.info(f"Found raw file for retry at: {raw_file_path}")
946
+
947
+ # Remove from processed files to allow retry
948
+ file_path_str = str(raw_file_path)
949
+ if file_path_str in self.processed_files:
950
+ self.processed_files.remove(file_path_str)
951
+ logger.debug(f"Removed {file_path_str} from processed files for retry")
952
+
953
+ return self.import_and_anonymize(
954
+ file_path=raw_file_path, # ✅ Use raw file path, not sensitive path
955
+ center_name=existing_pdf.center.name
956
+ if existing_pdf.center
957
+ else "unknown_center",
958
+ delete_source=False, # Never delete during retry
959
+ retry=True,
960
+ )
961
+ except Exception as e:
962
+ logger.error(
963
+ f"Failed to re-import existing report {existing_pdf.pdf_hash}: {e}"
964
+ )
965
+ self.current_pdf = existing_pdf
966
+ return existing_pdf
967
+
968
    def _cleanup_on_error(self):
        """Best-effort cleanup after a failed import.

        Restores the original ingress file from the raw copy when possible,
        resets processing flags, removes stray report copies, lock files, the
        sensitive copy created this run, and empty working subdirectories.
        Every step is wrapped so cleanup itself can never raise.
        """
        original_path = self.original_path
        try:
            if self.current_pdf and hasattr(self.current_pdf, "state"):
                state = self._ensure_state(self.current_pdf)
                raw_file_path = self.current_pdf.get_raw_file_path()
                if raw_file_path is not None and original_path is not None:
                    # Ensure reprocessing for next attempt by restoring original file
                    shutil.copy2(str(raw_file_path), str(original_path))

                # Ensure no two files can remain
                # NOTE(review): this removes the raw file only when the two paths
                # are EQUAL — after the copy above that pair would have been a
                # same-file copy. Verify whether `!=` was intended here.
                if (
                    raw_file_path == original_path
                    and raw_file_path is not None
                    and original_path is not None
                ):
                    os.remove(str(raw_file_path))

                # Remove Lock file also
                lock_path = Path(str(path_utils.REPORT_DIR) + ".lock")
                try:
                    if lock_path.exists():
                        lock_path.unlink()
                        logger.info(
                            "Removed lock file during quarantine: %s", lock_path
                        )
                except Exception as e:
                    logger.warning(
                        "Could not remove lock file during quarantine: %s", e
                    )

                # Only reset flags if this run had actually started processing.
                if state and self.processing_context.get("processing_started"):
                    state.text_meta_extracted = False
                    state.pdf_meta_extracted = False
                    state.sensitive_meta_processed = False
                    state.anonymized = False
                    state.save()
                    logger.debug("Updated report state to indicate processing failure")
            else:
                # 🔧 Early failure: no current_pdf (or no state).
                # In this case we want to make sure we don't leave stray files
                # under REPORT_DIR or REPORT_DIR/sensitive.

                pdf_dir = self._get_pdf_dir()
                if pdf_dir and pdf_dir.exists():
                    for candidate_dir in (pdf_dir, pdf_dir / "sensitive"):
                        if candidate_dir.exists():
                            for candidate in candidate_dir.glob("*.pdf"):
                                # Don't delete the original ingress file
                                if (
                                    original_path is not None
                                    and candidate.resolve()
                                    == Path(original_path).resolve()
                                ):
                                    continue
                                try:
                                    candidate.unlink()
                                    logger.debug(
                                        "Removed stray report during early error cleanup: %s",
                                        candidate,
                                    )
                                except Exception as e:
                                    logger.warning(
                                        "Failed to remove stray report %s: %s",
                                        candidate,
                                        e,
                                    )

        except Exception as e:
            logger.warning(f"Error during cleanup: {e}")
        finally:
            # Remove any sensitive copy created during this processing run
            sensitive_created = self.processing_context.get("sensitive_copy_created")
            if sensitive_created:
                pdf_obj = self.current_pdf
                try:
                    if pdf_obj:
                        file_field = getattr(pdf_obj, "file", None)
                        if file_field and getattr(file_field, "name", None):
                            storage_name = file_field.name
                            # save=False: delete the stored file without a model save.
                            file_field.delete(save=False)
                            logger.debug(
                                "Deleted sensitive copy %s during error cleanup",
                                storage_name,
                            )
                except Exception as cleanup_exc:
                    logger.warning(
                        "Failed to remove sensitive copy during error cleanup: %s",
                        cleanup_exc,
                    )
            pdf_dir = self._get_pdf_dir()
            if original_path and pdf_dir:
                # Try to remove any extra file that was created during import
                # Simplest heuristic: same basename as original, but in pdf dir or pdf/sensitive dir
                for candidate_dir in (pdf_dir, pdf_dir / "sensitive"):
                    candidate = candidate_dir / original_path.name
                    if candidate.exists() and candidate != original_path:
                        try:
                            candidate.unlink()
                            logger.debug(
                                "Removed stray report copy during early error cleanup: %s",
                                candidate,
                            )
                        except Exception as e:
                            logger.warning(
                                "Failed to remove stray report copy %s: %s",
                                candidate,
                                e,
                            )

            # Always clean up processed files set to prevent blocks
            file_path = self.processing_context.get("file_path")
            if file_path and str(file_path) in self.processed_files:
                self.processed_files.remove(str(file_path))
                logger.debug(
                    f"Removed {file_path} from processed files during error cleanup"
                )

            # Diagnostics-only section: count leftover files and prune empty
            # working subdirectories. Any failure here is silently ignored.
            try:
                raw_dir = (
                    original_path.parent if isinstance(original_path, Path) else None
                )

                pdf_dir = self._get_pdf_dir()
                if not pdf_dir and raw_dir:
                    # Fallback: derive the report dir from the raw dir's parent.
                    base_dir = raw_dir.parent
                    dir_name = getattr(path_utils, "REPORT_DIR_NAME", "pdfs")
                    fallback_pdf_dir = base_dir / dir_name
                    logger.debug(
                        "report cleanup fallback resolution - base: %s, dir_name: %s, exists: %s",
                        base_dir,
                        dir_name,
                        fallback_pdf_dir.exists(),
                    )
                    if fallback_pdf_dir.exists():
                        pdf_dir = fallback_pdf_dir

                # Remove empty report subdirectories that might have been created during setup
                if pdf_dir and pdf_dir.exists():
                    for subdir_name in (
                        "sensitive",
                        "cropped_regions",
                        "anonymized",
                        "_processing",
                    ):
                        subdir_path = pdf_dir / subdir_name
                        if subdir_path.exists() and subdir_path.is_dir():
                            try:
                                # StopIteration from next() means the dir is empty.
                                next(subdir_path.iterdir())
                            except StopIteration:
                                try:
                                    subdir_path.rmdir()
                                    logger.debug(
                                        "Removed empty directory %s during error cleanup",
                                        subdir_path,
                                    )
                                except OSError as rm_err:
                                    logger.debug(
                                        "Could not remove directory %s: %s",
                                        subdir_path,
                                        rm_err,
                                    )
                            except Exception as iter_err:
                                logger.debug(
                                    "Could not inspect directory %s: %s",
                                    subdir_path,
                                    iter_err,
                                )

                raw_count = (
                    len(list(raw_dir.glob("*")))
                    if raw_dir and raw_dir.exists()
                    else None
                )
                pdf_count = (
                    len(list(pdf_dir.glob("*")))
                    if pdf_dir and pdf_dir.exists()
                    else None
                )

                sensitive_path = self.processing_context.get("sensitive_file_path")
                if sensitive_path:
                    sensitive_parent = Path(sensitive_path).parent
                    sensitive_count = (
                        len(list(sensitive_parent.glob("*")))
                        if sensitive_parent.exists()
                        else None
                    )
                else:
                    sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
                    sensitive_count = (
                        len(list(sensitive_dir.glob("*")))
                        if sensitive_dir and sensitive_dir.exists()
                        else None
                    )

                logger.info(
                    "report import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
                    raw_count,
                    pdf_count,
                    sensitive_count,
                )
            except Exception:
                pass
1173
+
1174
+ def _cleanup_processing_context(self):
1175
+ """Cleanup processing context."""
1176
+ try:
1177
+ # Clean up temporary directories
1178
+ if self.processing_context.get("text_extracted"):
1179
+ crops_dir = path_utils.REPORT_DIR / "cropped_regions"
1180
+ if crops_dir.exists() and not any(crops_dir.iterdir()):
1181
+ crops_dir.rmdir()
1182
+
1183
+ # Always remove from processed files set after processing attempt
1184
+ file_path = self.processing_context.get("file_path")
1185
+ if file_path and str(file_path) in self.processed_files:
1186
+ self.processed_files.remove(str(file_path))
1187
+ logger.debug(f"Removed {file_path} from processed files set")
1188
+
1189
+ except Exception as e:
1190
+ logger.warning(f"Error during context cleanup: {e}")
1191
+ finally:
1192
+ # Reset context
1193
+ self.current_pdf = None
1194
+ self.processing_context = {}
1195
+
1196
+ def import_simple(
1197
+ self, file_path: Union[Path, str], center_name: str, delete_source: bool = False
1198
+ ) -> "RawPdfFile":
1199
+ """
1200
+ Simple report import without text processing or anonymization.
1201
+ Uses centralized report instance management pattern.
1202
+
1203
+ Args:
1204
+ file_path: Path to the report file to import
1205
+ center_name: Name of the center to associate with report
1206
+ delete_source: Whether to delete the source file after import
1207
+
1208
+ Returns:
1209
+ RawPdfFile instance after basic import
1210
+ """
1211
+ try:
1212
+ # Initialize simple processing context
1213
+ self._initialize_processing_context(
1214
+ file_path, center_name, delete_source, False
1215
+ )
1216
+
1217
+ # Validate file
1218
+ self._validate_and_prepare_file()
1219
+
1220
+ # Create report instance
1221
+ logger.info("Starting simple import - creating RawPdfFile instance...")
1222
+ self.current_pdf = RawPdfFile.create_from_file_initialized(
1223
+ file_path=self.processing_context["file_path"],
1224
+ center_name=center_name,
1225
+ delete_source=delete_source,
1226
+ )
1227
+
1228
+ if not self.current_pdf:
1229
+ raise RuntimeError("Failed to create RawPdfFile instance")
1230
+
1231
+ # Mark as processed
1232
+ self.processed_files.add(str(self.processing_context["file_path"]))
1233
+
1234
+ # Set basic state for simple import
1235
+ state = self._ensure_state(self.current_pdf)
1236
+ if state:
1237
+ state.text_meta_extracted = False
1238
+ state.pdf_meta_extracted = False
1239
+ state.sensitive_meta_processed = False
1240
+ state.save()
1241
+ logger.info("Set report state: processed=False for simple import")
1242
+
1243
+ # Save changes
1244
+ with transaction.atomic():
1245
+ self.current_pdf.save()
1246
+
1247
+ logger.info(
1248
+ "Simple import completed for RawPdfFile hash: %s",
1249
+ self.current_pdf.pdf_hash,
1250
+ )
1251
+ return self.current_pdf
1252
+
1253
+ except Exception as e:
1254
+ logger.error(f"Simple report import failed for {file_path}: {e}")
1255
+ self._cleanup_on_error()
1256
+ raise
1257
+ finally:
1258
+ self._cleanup_processing_context()
1259
+
1260
+ def check_storage_capacity(
1261
+ self, file_path: Union[Path, str], storage_root, min_required_space
1262
+ ) -> bool:
1263
+ """
1264
+ Check if there is sufficient storage capacity for the report file.
1265
+
1266
+ Args:
1267
+ file_path: Path to the report file to check
1268
+
1269
+ Raises:
1270
+ InsufficientStorageError: If there is not enough space
1271
+ """
1272
+ import shutil
1273
+
1274
+ from endoreg_db.exceptions import InsufficientStorageError
1275
+
1276
+ file_path = Path(file_path)
1277
+ if not file_path.exists():
1278
+ raise FileNotFoundError(f"File not found for storage check: {file_path}")
1279
+
1280
+ # Get the size of the file
1281
+ file_size = file_path.stat().st_size
1282
+
1283
+ # Get available space in the storage directory
1284
+
1285
+ total, used, free = shutil.disk_usage(storage_root)
1286
+
1287
+ if file_size:
1288
+ min_required_space = file_size if isinstance(min_required_space, int) else 0
1289
+
1290
+ # Check if there is enough space
1291
+ if file_size > free:
1292
+ raise InsufficientStorageError(
1293
+ f"Not enough space to store report file: {file_path}"
1294
+ )
1295
+ logger.info(
1296
+ f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available"
1297
+ )
1298
+
1299
+ return True
1300
+
1301
    def create_sensitive_file(
        self, pdf_instance: "RawPdfFile", file_path: Union[Path, str]
    ) -> None:
        """
        Create a copy of the report file in the sensitive directory and update the file reference.
        Delete the source path to avoid duplicates.
        Uses the central report instance and processing context if parameters not provided.

        Ensures the FileField points to the file under STORAGE_DIR/pdfs/sensitive and never back to raw_pdfs.
        """
        # Fall back to the instance/context when explicit arguments are falsy.
        pdf_file = pdf_instance or self.current_pdf
        source_path = (
            Path(file_path) if file_path else self.processing_context.get("file_path")
        )

        if not pdf_file:
            raise ValueError("No report instance available for creating sensitive file")
        if not source_path:
            raise ValueError("No file path available for creating sensitive file")

        # Target is keyed by content hash so re-imports of the same report collide.
        SENSITIVE_DIR = path_utils.REPORT_DIR / "sensitive"
        target = SENSITIVE_DIR / f"{pdf_file.pdf_hash}.pdf"

        try:
            os.makedirs(SENSITIVE_DIR, exist_ok=True)

            # If source already is the target, just ensure FileField points correctly
            if source_path.resolve() == target.resolve():
                pass
            else:
                # Move the file from ingress to sensitive storage
                # Using replace semantics when target exists (re-import)
                if target.exists():
                    try:
                        target.unlink()
                    except Exception as e:
                        logger.warning(
                            "Could not remove existing sensitive target %s: %s",
                            target,
                            e,
                        )
                shutil.move(str(source_path), str(target))
                logger.info(f"Moved report to sensitive directory: {target}")

            # Update FileField to reference the file under STORAGE_DIR
            # We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
            try:
                relative_name = str(
                    target.relative_to(path_utils.STORAGE_DIR)
                )  # Point Django FileField to sensitive storage
            except ValueError:
                # Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
                relative_name = str(target)

            # Only update when changed
            if getattr(pdf_file.file, "name", None) != relative_name:
                pdf_file.file.name = relative_name
                pdf_file.save(update_fields=["file"])
                logger.info(
                    "Updated report FileField reference to sensitive path: %s",
                    pdf_file.file.path,
                )
            else:
                logger.debug(
                    "report FileField already points to sensitive path: %s",
                    pdf_file.file.path,
                )

            # Best-effort: if original source still exists (e.g., copy), remove it to avoid re-triggers
            try:
                if source_path.exists() and source_path != target:
                    os.remove(source_path)
                    logger.info(
                        f"Removed original report file at ingress: {source_path}"
                    )
            except OSError as e:
                logger.warning(
                    f"Could not delete original report file {source_path}: {e}"
                )

        except Exception as e:
            # Best-effort operation: log with traceback, never raise to the caller.
            logger.warning(
                f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}",
                exc_info=True,
            )
1386
+
1387
+ def archive_or_quarantine_file(
1388
+ self,
1389
+ pdf_instance: "RawPdfFile",
1390
+ source_file_path: Union[Path, str],
1391
+ quarantine_reason: str,
1392
+ is_pdf_problematic: bool,
1393
+ ) -> bool:
1394
+ """
1395
+ Archive or quarantine file based on the state of the report processing.
1396
+ Uses the central report instance and processing context if parameters not provided.
1397
+
1398
+ Args:
1399
+ pdf_instance: Optional report instance, defaults to self.current_pdf
1400
+ source_file_path: Optional source file path, defaults to processing_context['file_path']
1401
+ quarantine_reason: Optional quarantine reason, defaults to processing_context['error_reason']
1402
+ is_pdf_problematic: Optional override for problematic state
1403
+
1404
+ Returns:
1405
+ bool: True if file was quarantined, False if archived successfully
1406
+ """
1407
+ pdf_file = pdf_instance or self.current_pdf
1408
+ file_path = (
1409
+ Path(source_file_path)
1410
+ if source_file_path
1411
+ else self.processing_context.get("file_path")
1412
+ )
1413
+ quarantine_reason = str(
1414
+ quarantine_reason or self.processing_context.get("error_reason")
1415
+ )
1416
+
1417
+ if not pdf_file:
1418
+ raise ValueError("No report instance available for archiving/quarantine")
1419
+ if not file_path:
1420
+ raise ValueError("No file path available for archiving/quarantine")
1421
+
1422
+ # Determine if the report is problematic
1423
+ pdf_problematic = (
1424
+ is_pdf_problematic
1425
+ if is_pdf_problematic is not None
1426
+ else pdf_file.is_problematic
1427
+ )
1428
+
1429
+ if pdf_problematic:
1430
+ # Quarantine the file
1431
+ logger.warning(
1432
+ f"Quarantining problematic report: {pdf_file.pdf_hash}, reason: {quarantine_reason}"
1433
+ )
1434
+ quarantine_dir = path_utils.REPORT_DIR / "quarantine"
1435
+ os.makedirs(quarantine_dir, exist_ok=True)
1436
+
1437
+ quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
1438
+ try:
1439
+ shutil.move(file_path, quarantine_path)
1440
+ pdf_file.save(update_fields=["quarantine_reason"])
1441
+ logger.info(
1442
+ f"Moved problematic report to quarantine: {quarantine_path}"
1443
+ )
1444
+ return True
1445
+ except Exception as e:
1446
+ logger.error(f"Failed to quarantine report {pdf_file.pdf_hash}: {e}")
1447
+ return (
1448
+ True # Still consider as quarantined to prevent further processing
1449
+ )
1450
+ else:
1451
+ # Archive the file normally
1452
+ logger.info(f"Archiving successfully processed report: {pdf_file.pdf_hash}")
1453
+ archive_dir = path_utils.REPORT_DIR / "processed"
1454
+ os.makedirs(archive_dir, exist_ok=True)
1455
+
1456
+ archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
1457
+ try:
1458
+ shutil.move(file_path, archive_path)
1459
+ logger.info(f"Moved processed report to archive: {archive_path}")
1460
+ return False
1461
+ except Exception as e:
1462
+ logger.error(f"Failed to archive report {pdf_file.pdf_hash}: {e}")
1463
+ return False
1464
+
1465
+ def _is_placeholder_value(self, field_name: str, value) -> bool:
1466
+ """Return True if a SensitiveMeta field still has a dummy/default value."""
1467
+ if value is None:
1468
+ return True
1469
+
1470
+ # String placeholders
1471
+ if isinstance(value, str):
1472
+ if value in {
1473
+ self.DEFAULT_PATIENT_FIRST_NAME,
1474
+ self.DEFAULT_PATIENT_LAST_NAME,
1475
+ }:
1476
+ return True
1477
+
1478
+ # Date placeholders
1479
+ if isinstance(value, date):
1480
+ # Default DOB
1481
+ if field_name == "patient_dob" and value == self.DEFAULT_PATIENT_DOB:
1482
+ return True
1483
+ # "Today" exam date created as fallback – allow anonymizer to override
1484
+ if field_name == "examination_date" and value == date.today():
1485
+ return True
1486
+
1487
+ return False