endoreg-db 0.8.8.0__py3-none-any.whl → 0.8.8.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of endoreg-db might be problematic; see the registry's advisory page for more details.

Files changed (402)
  1. endoreg_db/data/__init__.py +22 -8
  2. endoreg_db/data/ai_model_meta/default_multilabel_classification.yaml +0 -1
  3. endoreg_db/data/examination/examinations/data.yaml +114 -14
  4. endoreg_db/data/examination/time-type/data.yaml +0 -3
  5. endoreg_db/data/examination_indication/endoscopy.yaml +108 -173
  6. endoreg_db/data/examination_indication_classification/endoscopy.yaml +0 -70
  7. endoreg_db/data/examination_indication_classification_choice/endoscopy.yaml +33 -37
  8. endoreg_db/data/finding/00_generic.yaml +35 -0
  9. endoreg_db/data/finding/00_generic_complication.yaml +9 -0
  10. endoreg_db/data/finding/01_gastroscopy_baseline.yaml +88 -0
  11. endoreg_db/data/finding/01_gastroscopy_observation.yaml +113 -0
  12. endoreg_db/data/finding/02_colonoscopy_baseline.yaml +53 -0
  13. endoreg_db/data/finding/02_colonoscopy_hidden.yaml +119 -0
  14. endoreg_db/data/finding/02_colonoscopy_observation.yaml +152 -0
  15. endoreg_db/data/finding_classification/00_generic.yaml +44 -0
  16. endoreg_db/data/finding_classification/00_generic_histology.yaml +28 -0
  17. endoreg_db/data/finding_classification/00_generic_lesion.yaml +52 -0
  18. endoreg_db/data/finding_classification/{colonoscopy_bowel_preparation.yaml → 02_colonoscopy_baseline.yaml} +35 -20
  19. endoreg_db/data/finding_classification/02_colonoscopy_histology.yaml +13 -0
  20. endoreg_db/data/finding_classification/02_colonoscopy_other.yaml +12 -0
  21. endoreg_db/data/finding_classification/02_colonoscopy_polyp.yaml +101 -0
  22. endoreg_db/data/finding_classification_choice/{yes_no_na.yaml → 00_generic.yaml} +5 -1
  23. endoreg_db/data/finding_classification_choice/{examination_setting_generic_types.yaml → 00_generic_baseline.yaml} +10 -2
  24. endoreg_db/data/finding_classification_choice/{complication_generic_types.yaml → 00_generic_complication.yaml} +1 -1
  25. endoreg_db/data/finding_classification_choice/{histology.yaml → 00_generic_histology.yaml} +1 -4
  26. endoreg_db/data/finding_classification_choice/00_generic_lesion.yaml +158 -0
  27. endoreg_db/data/finding_classification_choice/{bowel_preparation.yaml → 02_colonoscopy_bowel_preparation.yaml} +1 -30
  28. endoreg_db/data/{_examples/finding_classification_choice/colonoscopy_not_complete_reason.yaml → finding_classification_choice/02_colonoscopy_generic.yaml} +1 -1
  29. endoreg_db/data/finding_classification_choice/{histology_polyp.yaml → 02_colonoscopy_histology.yaml} +1 -1
  30. endoreg_db/data/{_examples/finding_classification_choice/colonoscopy_location.yaml → finding_classification_choice/02_colonoscopy_location.yaml} +23 -4
  31. endoreg_db/data/finding_classification_choice/02_colonoscopy_other.yaml +34 -0
  32. endoreg_db/data/finding_classification_choice/02_colonoscopy_polyp_advanced_imaging.yaml +76 -0
  33. endoreg_db/data/{_examples/finding_classification_choice/colon_lesion_paris.yaml → finding_classification_choice/02_colonoscopy_polyp_morphology.yaml} +26 -8
  34. endoreg_db/data/finding_classification_choice/02_colonoscopy_size.yaml +27 -0
  35. endoreg_db/data/finding_classification_type/{colonoscopy_basic.yaml → 00_generic.yaml} +18 -13
  36. endoreg_db/data/finding_classification_type/02_colonoscopy.yaml +9 -0
  37. endoreg_db/data/finding_intervention/00_generic_endoscopy.yaml +59 -0
  38. endoreg_db/data/finding_intervention/00_generic_endoscopy_ablation.yaml +44 -0
  39. endoreg_db/data/finding_intervention/00_generic_endoscopy_bleeding.yaml +55 -0
  40. endoreg_db/data/finding_intervention/00_generic_endoscopy_resection.yaml +85 -0
  41. endoreg_db/data/finding_intervention/00_generic_endoscopy_stenosis.yaml +17 -0
  42. endoreg_db/data/finding_intervention/00_generic_endoscopy_stent.yaml +9 -0
  43. endoreg_db/data/finding_intervention/01_gastroscopy.yaml +19 -0
  44. endoreg_db/data/finding_intervention/04_eus.yaml +39 -0
  45. endoreg_db/data/finding_intervention/05_ercp.yaml +3 -0
  46. endoreg_db/data/finding_type/data.yaml +8 -12
  47. endoreg_db/data/requirement/01_patient_data.yaml +93 -0
  48. endoreg_db/data/requirement_operator/new_operators.yaml +36 -0
  49. endoreg_db/data/requirement_set/01_endoscopy_generic.yaml +0 -2
  50. endoreg_db/data/requirement_set/90_coloreg.yaml +20 -8
  51. endoreg_db/exceptions.py +0 -1
  52. endoreg_db/forms/examination_form.py +1 -1
  53. endoreg_db/helpers/data_loader.py +124 -52
  54. endoreg_db/helpers/default_objects.py +116 -81
  55. endoreg_db/import_files/__init__.py +27 -0
  56. endoreg_db/import_files/context/__init__.py +7 -0
  57. endoreg_db/import_files/context/default_sensitive_meta.py +81 -0
  58. endoreg_db/import_files/context/ensure_center.py +17 -0
  59. endoreg_db/import_files/context/file_lock.py +66 -0
  60. endoreg_db/import_files/context/import_context.py +43 -0
  61. endoreg_db/import_files/context/validate_directories.py +56 -0
  62. endoreg_db/import_files/file_storage/__init__.py +15 -0
  63. endoreg_db/import_files/file_storage/create_report_file.py +76 -0
  64. endoreg_db/import_files/file_storage/create_video_file.py +75 -0
  65. endoreg_db/import_files/file_storage/sensitive_meta_storage.py +39 -0
  66. endoreg_db/import_files/file_storage/state_management.py +400 -0
  67. endoreg_db/import_files/file_storage/storage.py +36 -0
  68. endoreg_db/import_files/import_service.md +26 -0
  69. endoreg_db/import_files/processing/__init__.py +11 -0
  70. endoreg_db/import_files/processing/report_processing/report_anonymization.py +94 -0
  71. endoreg_db/import_files/processing/sensitive_meta_adapter.py +51 -0
  72. endoreg_db/import_files/processing/video_processing/video_anonymization.py +107 -0
  73. endoreg_db/import_files/processing/video_processing/video_cleanup_on_error.py +119 -0
  74. endoreg_db/import_files/pseudonymization/fake.py +52 -0
  75. endoreg_db/import_files/pseudonymization/k_anonymity.py +182 -0
  76. endoreg_db/import_files/pseudonymization/k_pseudonymity.py +128 -0
  77. endoreg_db/import_files/report_import_service.py +141 -0
  78. endoreg_db/import_files/video_import_service.py +150 -0
  79. endoreg_db/management/commands/import_report.py +130 -65
  80. endoreg_db/management/commands/import_video_with_classification.py +1 -1
  81. endoreg_db/management/commands/load_ai_model_data.py +5 -5
  82. endoreg_db/management/commands/load_ai_model_label_data.py +9 -7
  83. endoreg_db/management/commands/load_base_db_data.py +5 -134
  84. endoreg_db/management/commands/load_contraindication_data.py +14 -16
  85. endoreg_db/management/commands/load_disease_classification_choices_data.py +15 -18
  86. endoreg_db/management/commands/load_disease_classification_data.py +15 -18
  87. endoreg_db/management/commands/load_disease_data.py +25 -28
  88. endoreg_db/management/commands/load_endoscope_data.py +20 -27
  89. endoreg_db/management/commands/load_event_data.py +14 -16
  90. endoreg_db/management/commands/load_examination_data.py +31 -44
  91. endoreg_db/management/commands/load_examination_indication_data.py +20 -21
  92. endoreg_db/management/commands/load_finding_data.py +52 -80
  93. endoreg_db/management/commands/load_information_source.py +21 -23
  94. endoreg_db/management/commands/load_lab_value_data.py +17 -26
  95. endoreg_db/management/commands/load_medication_data.py +13 -12
  96. endoreg_db/management/commands/load_organ_data.py +15 -19
  97. endoreg_db/management/commands/load_pdf_type_data.py +19 -18
  98. endoreg_db/management/commands/load_profession_data.py +14 -17
  99. endoreg_db/management/commands/load_qualification_data.py +20 -23
  100. endoreg_db/management/commands/load_report_reader_flag_data.py +17 -19
  101. endoreg_db/management/commands/load_requirement_data.py +14 -20
  102. endoreg_db/management/commands/load_risk_data.py +7 -6
  103. endoreg_db/management/commands/load_shift_data.py +20 -23
  104. endoreg_db/management/commands/load_tag_data.py +8 -11
  105. endoreg_db/management/commands/load_unit_data.py +17 -19
  106. endoreg_db/management/commands/start_filewatcher.py +46 -37
  107. endoreg_db/management/commands/validate_video_files.py +1 -5
  108. endoreg_db/migrations/0001_initial.py +1360 -1812
  109. endoreg_db/models/administration/person/patient/patient.py +72 -46
  110. endoreg_db/models/label/__init__.py +2 -2
  111. endoreg_db/models/label/annotation/video_segmentation_annotation.py +18 -26
  112. endoreg_db/models/label/label_video_segment/label_video_segment.py +23 -1
  113. endoreg_db/models/media/pdf/raw_pdf.py +136 -64
  114. endoreg_db/models/media/pdf/report_reader/report_reader_config.py +34 -10
  115. endoreg_db/models/media/processing_history/__init__.py +5 -0
  116. endoreg_db/models/media/processing_history/processing_history.py +96 -0
  117. endoreg_db/models/media/video/create_from_file.py +101 -31
  118. endoreg_db/models/media/video/video_file.py +125 -105
  119. endoreg_db/models/media/video/video_file_io.py +31 -26
  120. endoreg_db/models/medical/contraindication/README.md +1 -0
  121. endoreg_db/models/medical/examination/examination.py +28 -8
  122. endoreg_db/models/medical/examination/examination_indication.py +13 -79
  123. endoreg_db/models/medical/examination/examination_time.py +8 -3
  124. endoreg_db/models/medical/finding/finding.py +5 -12
  125. endoreg_db/models/medical/finding/finding_classification.py +18 -37
  126. endoreg_db/models/medical/finding/finding_intervention.py +7 -9
  127. endoreg_db/models/medical/hardware/endoscope.py +6 -0
  128. endoreg_db/models/medical/patient/medication_examples.py +5 -1
  129. endoreg_db/models/medical/patient/patient_finding.py +1 -1
  130. endoreg_db/models/metadata/pdf_meta.py +22 -10
  131. endoreg_db/models/metadata/sensitive_meta.py +3 -0
  132. endoreg_db/models/metadata/sensitive_meta_logic.py +200 -124
  133. endoreg_db/models/other/information_source.py +27 -6
  134. endoreg_db/models/report/__init__.py +0 -0
  135. endoreg_db/models/report/images.py +0 -0
  136. endoreg_db/models/report/report.py +6 -0
  137. endoreg_db/models/requirement/requirement.py +59 -399
  138. endoreg_db/models/requirement/requirement_operator.py +86 -98
  139. endoreg_db/models/state/audit_ledger.py +4 -5
  140. endoreg_db/models/state/raw_pdf.py +69 -30
  141. endoreg_db/models/state/video.py +64 -49
  142. endoreg_db/models/upload_job.py +33 -9
  143. endoreg_db/models/utils.py +27 -23
  144. endoreg_db/queries/__init__.py +3 -1
  145. endoreg_db/schemas/examination_evaluation.py +1 -1
  146. endoreg_db/serializers/__init__.py +2 -8
  147. endoreg_db/serializers/label_video_segment/label_video_segment.py +2 -29
  148. endoreg_db/serializers/meta/__init__.py +1 -6
  149. endoreg_db/serializers/misc/sensitive_patient_data.py +50 -26
  150. endoreg_db/serializers/patient_examination/patient_examination.py +3 -3
  151. endoreg_db/serializers/pdf/anony_text_validation.py +39 -23
  152. endoreg_db/serializers/video/video_file_list.py +65 -34
  153. endoreg_db/services/__old/pdf_import.py +1487 -0
  154. endoreg_db/services/__old/video_import.py +1306 -0
  155. endoreg_db/services/anonymization.py +63 -26
  156. endoreg_db/services/lookup_service.py +28 -28
  157. endoreg_db/services/lookup_store.py +2 -2
  158. endoreg_db/services/pdf_import.py +0 -1480
  159. endoreg_db/services/report_import.py +10 -0
  160. endoreg_db/services/video_import.py +6 -1165
  161. endoreg_db/tasks/upload_tasks.py +79 -70
  162. endoreg_db/tasks/video_ingest.py +8 -4
  163. endoreg_db/urls/__init__.py +0 -14
  164. endoreg_db/urls/ai.py +32 -0
  165. endoreg_db/urls/media.py +21 -24
  166. endoreg_db/utils/dataloader.py +87 -57
  167. endoreg_db/utils/paths.py +110 -46
  168. endoreg_db/utils/pipelines/Readme.md +1 -1
  169. endoreg_db/utils/requirement_operator_logic/new_operator_logic.py +97 -0
  170. endoreg_db/views/__init__.py +85 -173
  171. endoreg_db/views/ai/__init__.py +8 -0
  172. endoreg_db/views/ai/label.py +155 -0
  173. endoreg_db/views/anonymization/media_management.py +8 -7
  174. endoreg_db/views/anonymization/overview.py +97 -68
  175. endoreg_db/views/anonymization/validate.py +25 -21
  176. endoreg_db/views/media/__init__.py +5 -20
  177. endoreg_db/views/media/pdf_media.py +109 -65
  178. endoreg_db/views/media/sensitive_metadata.py +163 -148
  179. endoreg_db/views/meta/__init__.py +0 -8
  180. endoreg_db/views/misc/__init__.py +1 -7
  181. endoreg_db/views/misc/upload_views.py +94 -93
  182. endoreg_db/views/report/__init__.py +7 -0
  183. endoreg_db/views/{pdf → report}/reimport.py +45 -24
  184. endoreg_db/views/{pdf/pdf_stream.py → report/report_stream.py} +40 -32
  185. endoreg_db/views/requirement/lookup_store.py +22 -90
  186. endoreg_db/views/video/__init__.py +23 -22
  187. endoreg_db/views/video/correction.py +201 -172
  188. endoreg_db/views/video/reimport.py +1 -1
  189. endoreg_db/views/{media/video_segments.py → video/segments_crud.py} +75 -37
  190. endoreg_db/views/video/{video_meta.py → video_meta_stats.py} +2 -2
  191. endoreg_db/views/video/video_stream.py +7 -8
  192. {endoreg_db-0.8.8.0.dist-info → endoreg_db-0.8.8.9.dist-info}/METADATA +2 -2
  193. {endoreg_db-0.8.8.0.dist-info → endoreg_db-0.8.8.9.dist-info}/RECORD +217 -335
  194. {endoreg_db-0.8.8.0.dist-info → endoreg_db-0.8.8.9.dist-info}/WHEEL +1 -1
  195. endoreg_db/data/_examples/disease.yaml +0 -55
  196. endoreg_db/data/_examples/disease_classification.yaml +0 -13
  197. endoreg_db/data/_examples/disease_classification_choice.yaml +0 -62
  198. endoreg_db/data/_examples/event.yaml +0 -64
  199. endoreg_db/data/_examples/examination.yaml +0 -72
  200. endoreg_db/data/_examples/finding/anatomy_colon.yaml +0 -128
  201. endoreg_db/data/_examples/finding/colonoscopy.yaml +0 -40
  202. endoreg_db/data/_examples/finding/colonoscopy_bowel_prep.yaml +0 -56
  203. endoreg_db/data/_examples/finding/complication.yaml +0 -16
  204. endoreg_db/data/_examples/finding/data.yaml +0 -105
  205. endoreg_db/data/_examples/finding/examination_setting.yaml +0 -16
  206. endoreg_db/data/_examples/finding/medication_related.yaml +0 -18
  207. endoreg_db/data/_examples/finding/outcome.yaml +0 -12
  208. endoreg_db/data/_examples/finding_classification/colonoscopy_bowel_preparation.yaml +0 -68
  209. endoreg_db/data/_examples/finding_classification/colonoscopy_jnet.yaml +0 -22
  210. endoreg_db/data/_examples/finding_classification/colonoscopy_kudo.yaml +0 -25
  211. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_circularity.yaml +0 -20
  212. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_planarity.yaml +0 -24
  213. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_size.yaml +0 -68
  214. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_surface.yaml +0 -20
  215. endoreg_db/data/_examples/finding_classification/colonoscopy_location.yaml +0 -80
  216. endoreg_db/data/_examples/finding_classification/colonoscopy_lst.yaml +0 -21
  217. endoreg_db/data/_examples/finding_classification/colonoscopy_nice.yaml +0 -20
  218. endoreg_db/data/_examples/finding_classification/colonoscopy_paris.yaml +0 -26
  219. endoreg_db/data/_examples/finding_classification/colonoscopy_sano.yaml +0 -22
  220. endoreg_db/data/_examples/finding_classification/colonoscopy_summary.yaml +0 -53
  221. endoreg_db/data/_examples/finding_classification/complication_generic.yaml +0 -25
  222. endoreg_db/data/_examples/finding_classification/examination_setting_generic.yaml +0 -40
  223. endoreg_db/data/_examples/finding_classification/histology_colo.yaml +0 -51
  224. endoreg_db/data/_examples/finding_classification/intervention_required.yaml +0 -26
  225. endoreg_db/data/_examples/finding_classification/medication_related.yaml +0 -23
  226. endoreg_db/data/_examples/finding_classification/visualized.yaml +0 -33
  227. endoreg_db/data/_examples/finding_classification_choice/bowel_preparation.yaml +0 -78
  228. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_circularity_default.yaml +0 -32
  229. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_jnet.yaml +0 -15
  230. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_kudo.yaml +0 -23
  231. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_lst.yaml +0 -15
  232. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_nice.yaml +0 -17
  233. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_planarity_default.yaml +0 -49
  234. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_sano.yaml +0 -14
  235. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_surface_intact_default.yaml +0 -36
  236. endoreg_db/data/_examples/finding_classification_choice/colonoscopy_size.yaml +0 -82
  237. endoreg_db/data/_examples/finding_classification_choice/colonoscopy_summary_worst_finding.yaml +0 -15
  238. endoreg_db/data/_examples/finding_classification_choice/complication_generic_types.yaml +0 -15
  239. endoreg_db/data/_examples/finding_classification_choice/examination_setting_generic_types.yaml +0 -15
  240. endoreg_db/data/_examples/finding_classification_choice/histology.yaml +0 -24
  241. endoreg_db/data/_examples/finding_classification_choice/histology_polyp.yaml +0 -20
  242. endoreg_db/data/_examples/finding_classification_choice/outcome.yaml +0 -19
  243. endoreg_db/data/_examples/finding_classification_choice/yes_no_na.yaml +0 -11
  244. endoreg_db/data/_examples/finding_classification_type/colonoscopy_basic.yaml +0 -48
  245. endoreg_db/data/_examples/finding_intervention/endoscopy.yaml +0 -43
  246. endoreg_db/data/_examples/finding_intervention/endoscopy_colonoscopy.yaml +0 -168
  247. endoreg_db/data/_examples/finding_intervention/endoscopy_egd.yaml +0 -128
  248. endoreg_db/data/_examples/finding_intervention/endoscopy_ercp.yaml +0 -32
  249. endoreg_db/data/_examples/finding_intervention/endoscopy_eus_lower.yaml +0 -9
  250. endoreg_db/data/_examples/finding_intervention/endoscopy_eus_upper.yaml +0 -36
  251. endoreg_db/data/_examples/finding_intervention_type/endoscopy.yaml +0 -15
  252. endoreg_db/data/_examples/finding_type/data.yaml +0 -43
  253. endoreg_db/data/_examples/requirement/age.yaml +0 -26
  254. endoreg_db/data/_examples/requirement/gender.yaml +0 -25
  255. endoreg_db/data/_examples/requirement_set/01_endoscopy_generic.yaml +0 -48
  256. endoreg_db/data/_examples/requirement_set/colonoscopy_austria_screening.yaml +0 -57
  257. endoreg_db/data/_examples/requirement_set/endoscopy_bleeding_risk.yaml +0 -52
  258. endoreg_db/data/_examples/yaml_examples.xlsx +0 -0
  259. endoreg_db/data/finding/anatomy_colon.yaml +0 -128
  260. endoreg_db/data/finding/colonoscopy.yaml +0 -40
  261. endoreg_db/data/finding/colonoscopy_bowel_prep.yaml +0 -56
  262. endoreg_db/data/finding/complication.yaml +0 -16
  263. endoreg_db/data/finding/data.yaml +0 -105
  264. endoreg_db/data/finding/examination_setting.yaml +0 -16
  265. endoreg_db/data/finding/medication_related.yaml +0 -18
  266. endoreg_db/data/finding/outcome.yaml +0 -12
  267. endoreg_db/data/finding_classification/colonoscopy_jnet.yaml +0 -22
  268. endoreg_db/data/finding_classification/colonoscopy_kudo.yaml +0 -25
  269. endoreg_db/data/finding_classification/colonoscopy_lesion_circularity.yaml +0 -20
  270. endoreg_db/data/finding_classification/colonoscopy_lesion_planarity.yaml +0 -24
  271. endoreg_db/data/finding_classification/colonoscopy_lesion_size.yaml +0 -38
  272. endoreg_db/data/finding_classification/colonoscopy_lesion_surface.yaml +0 -20
  273. endoreg_db/data/finding_classification/colonoscopy_location.yaml +0 -49
  274. endoreg_db/data/finding_classification/colonoscopy_lst.yaml +0 -21
  275. endoreg_db/data/finding_classification/colonoscopy_nice.yaml +0 -20
  276. endoreg_db/data/finding_classification/colonoscopy_paris.yaml +0 -26
  277. endoreg_db/data/finding_classification/colonoscopy_sano.yaml +0 -22
  278. endoreg_db/data/finding_classification/colonoscopy_summary.yaml +0 -53
  279. endoreg_db/data/finding_classification/complication_generic.yaml +0 -25
  280. endoreg_db/data/finding_classification/examination_setting_generic.yaml +0 -40
  281. endoreg_db/data/finding_classification/histology_colo.yaml +0 -43
  282. endoreg_db/data/finding_classification/intervention_required.yaml +0 -26
  283. endoreg_db/data/finding_classification/medication_related.yaml +0 -23
  284. endoreg_db/data/finding_classification/visualized.yaml +0 -33
  285. endoreg_db/data/finding_classification_choice/colon_lesion_circularity_default.yaml +0 -32
  286. endoreg_db/data/finding_classification_choice/colon_lesion_jnet.yaml +0 -15
  287. endoreg_db/data/finding_classification_choice/colon_lesion_kudo.yaml +0 -23
  288. endoreg_db/data/finding_classification_choice/colon_lesion_lst.yaml +0 -15
  289. endoreg_db/data/finding_classification_choice/colon_lesion_nice.yaml +0 -17
  290. endoreg_db/data/finding_classification_choice/colon_lesion_paris.yaml +0 -57
  291. endoreg_db/data/finding_classification_choice/colon_lesion_planarity_default.yaml +0 -49
  292. endoreg_db/data/finding_classification_choice/colon_lesion_sano.yaml +0 -14
  293. endoreg_db/data/finding_classification_choice/colon_lesion_surface_intact_default.yaml +0 -36
  294. endoreg_db/data/finding_classification_choice/colonoscopy_location.yaml +0 -229
  295. endoreg_db/data/finding_classification_choice/colonoscopy_not_complete_reason.yaml +0 -19
  296. endoreg_db/data/finding_classification_choice/colonoscopy_size.yaml +0 -82
  297. endoreg_db/data/finding_classification_choice/colonoscopy_summary_worst_finding.yaml +0 -15
  298. endoreg_db/data/finding_classification_choice/outcome.yaml +0 -19
  299. endoreg_db/data/finding_intervention/endoscopy.yaml +0 -43
  300. endoreg_db/data/finding_intervention/endoscopy_colonoscopy.yaml +0 -168
  301. endoreg_db/data/finding_intervention/endoscopy_egd.yaml +0 -128
  302. endoreg_db/data/finding_intervention/endoscopy_ercp.yaml +0 -32
  303. endoreg_db/data/finding_intervention/endoscopy_eus_lower.yaml +0 -9
  304. endoreg_db/data/finding_intervention/endoscopy_eus_upper.yaml +0 -36
  305. endoreg_db/data/finding_morphology_classification_type/colonoscopy.yaml +0 -79
  306. endoreg_db/data/requirement/age.yaml +0 -26
  307. endoreg_db/data/requirement/colonoscopy_baseline_austria.yaml +0 -45
  308. endoreg_db/data/requirement/disease_cardiovascular.yaml +0 -79
  309. endoreg_db/data/requirement/disease_classification_choice_cardiovascular.yaml +0 -41
  310. endoreg_db/data/requirement/disease_hepatology.yaml +0 -12
  311. endoreg_db/data/requirement/disease_misc.yaml +0 -12
  312. endoreg_db/data/requirement/disease_renal.yaml +0 -96
  313. endoreg_db/data/requirement/endoscopy_bleeding_risk.yaml +0 -59
  314. endoreg_db/data/requirement/event_cardiology.yaml +0 -251
  315. endoreg_db/data/requirement/event_requirements.yaml +0 -145
  316. endoreg_db/data/requirement/finding_colon_polyp.yaml +0 -50
  317. endoreg_db/data/requirement/gender.yaml +0 -25
  318. endoreg_db/data/requirement/lab_value.yaml +0 -441
  319. endoreg_db/data/requirement/medication.yaml +0 -93
  320. endoreg_db/data/requirement_operator/age.yaml +0 -13
  321. endoreg_db/data/requirement_operator/lab_operators.yaml +0 -129
  322. endoreg_db/data/requirement_operator/model_operators.yaml +0 -96
  323. endoreg_db/management/commands/init_default_ai_model.py +0 -112
  324. endoreg_db/management/commands/reset_celery_schedule.py +0 -9
  325. endoreg_db/management/commands/validate_video.py +0 -204
  326. endoreg_db/migrations/0002_requirementset_depends_on.py +0 -18
  327. endoreg_db/migrations/_old/0001_initial.py +0 -1857
  328. endoreg_db/migrations/_old/0002_add_video_correction_models.py +0 -52
  329. endoreg_db/migrations/_old/0003_add_center_display_name.py +0 -30
  330. endoreg_db/migrations/_old/0004_employee_city_employee_post_code_employee_street_and_more.py +0 -68
  331. endoreg_db/migrations/_old/0004_remove_casetemplate_rules_and_more.py +0 -77
  332. endoreg_db/migrations/_old/0005_merge_20251111_1003.py +0 -14
  333. endoreg_db/migrations/_old/0006_sensitivemeta_anonymized_text_and_more.py +0 -68
  334. endoreg_db/migrations/_old/0007_remove_rule_attribute_dtype_remove_rule_rule_type_and_more.py +0 -89
  335. endoreg_db/migrations/_old/0008_remove_event_event_classification_and_more.py +0 -27
  336. endoreg_db/migrations/_old/0009_alter_modelmeta_options_and_more.py +0 -21
  337. endoreg_db/renames.yml +0 -8
  338. endoreg_db/serializers/_old/raw_pdf_meta_validation.py +0 -223
  339. endoreg_db/serializers/_old/raw_video_meta_validation.py +0 -179
  340. endoreg_db/serializers/_old/video.py +0 -71
  341. endoreg_db/serializers/meta/pdf_file_meta_extraction.py +0 -115
  342. endoreg_db/serializers/meta/report_meta.py +0 -53
  343. endoreg_db/serializers/report/__init__.py +0 -9
  344. endoreg_db/serializers/report/mixins.py +0 -45
  345. endoreg_db/serializers/report/report.py +0 -105
  346. endoreg_db/serializers/report/report_list.py +0 -22
  347. endoreg_db/serializers/report/secure_file_url.py +0 -26
  348. endoreg_db/services/requirements_object.py +0 -147
  349. endoreg_db/services/storage_aware_video_processor.py +0 -370
  350. endoreg_db/urls/files.py +0 -6
  351. endoreg_db/urls/label_video_segment_validate.py +0 -33
  352. endoreg_db/urls/label_video_segments.py +0 -46
  353. endoreg_db/views/label/__init__.py +0 -5
  354. endoreg_db/views/label/label.py +0 -15
  355. endoreg_db/views/label_video_segment/__init__.py +0 -16
  356. endoreg_db/views/label_video_segment/create_lvs_from_annotation.py +0 -44
  357. endoreg_db/views/label_video_segment/get_lvs_by_name_and_video.py +0 -50
  358. endoreg_db/views/label_video_segment/label_video_segment.py +0 -77
  359. endoreg_db/views/label_video_segment/label_video_segment_by_label.py +0 -174
  360. endoreg_db/views/label_video_segment/label_video_segment_detail.py +0 -73
  361. endoreg_db/views/label_video_segment/update_lvs_from_annotation.py +0 -46
  362. endoreg_db/views/label_video_segment/validate.py +0 -226
  363. endoreg_db/views/media/segments.py +0 -71
  364. endoreg_db/views/meta/available_files_list.py +0 -146
  365. endoreg_db/views/meta/report_meta.py +0 -53
  366. endoreg_db/views/meta/sensitive_meta_detail.py +0 -85
  367. endoreg_db/views/misc/secure_file_serving_view.py +0 -80
  368. endoreg_db/views/misc/secure_file_url_view.py +0 -84
  369. endoreg_db/views/misc/secure_url_validate.py +0 -79
  370. endoreg_db/views/patient_examination/DEPRECATED_video_backup.py +0 -164
  371. endoreg_db/views/patient_finding_location/__init__.py +0 -5
  372. endoreg_db/views/patient_finding_location/pfl_create.py +0 -70
  373. endoreg_db/views/patient_finding_morphology/__init__.py +0 -5
  374. endoreg_db/views/patient_finding_morphology/pfm_create.py +0 -70
  375. endoreg_db/views/pdf/__init__.py +0 -8
  376. endoreg_db/views/video/segmentation.py +0 -274
  377. endoreg_db/views/video/task_status.py +0 -49
  378. endoreg_db/views/video/timeline.py +0 -46
  379. endoreg_db/views/video/video_analyze.py +0 -52
  380. /endoreg_db/data/requirement/{colon_polyp_intervention.yaml → old/colon_polyp_intervention.yaml} +0 -0
  381. /endoreg_db/data/{_examples/requirement → requirement/old}/colonoscopy_baseline_austria.yaml +0 -0
  382. /endoreg_db/data/requirement/{coloreg_colon_polyp.yaml → old/coloreg_colon_polyp.yaml} +0 -0
  383. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_cardiovascular.yaml +0 -0
  384. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_classification_choice_cardiovascular.yaml +0 -0
  385. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_hepatology.yaml +0 -0
  386. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_misc.yaml +0 -0
  387. /endoreg_db/data/{_examples/requirement → requirement/old}/disease_renal.yaml +0 -0
  388. /endoreg_db/data/{_examples/requirement → requirement/old}/endoscopy_bleeding_risk.yaml +0 -0
  389. /endoreg_db/data/{_examples/requirement → requirement/old}/event_cardiology.yaml +0 -0
  390. /endoreg_db/data/{_examples/requirement → requirement/old}/event_requirements.yaml +0 -0
  391. /endoreg_db/data/{_examples/requirement → requirement/old}/finding_colon_polyp.yaml +0 -0
  392. /endoreg_db/{migrations/__init__.py → data/requirement/old/gender.yaml} +0 -0
  393. /endoreg_db/data/{_examples/requirement → requirement/old}/lab_value.yaml +0 -0
  394. /endoreg_db/data/{_examples/requirement → requirement/old}/medication.yaml +0 -0
  395. /endoreg_db/data/{_examples/requirement_operator → requirement_operator/_old}/age.yaml +0 -0
  396. /endoreg_db/data/{_examples/requirement_operator → requirement_operator/_old}/lab_operators.yaml +0 -0
  397. /endoreg_db/data/{_examples/requirement_operator → requirement_operator/_old}/model_operators.yaml +0 -0
  398. /endoreg_db/{urls/sensitive_meta.py → import_files/pseudonymization/__init__.py} +0 -0
  399. /endoreg_db/{views/pdf/pdf_stream_views.py → import_files/pseudonymization/pseudonymize.py} +0 -0
  400. /endoreg_db/utils/requirement_operator_logic/{lab_value_operators.py → _old/lab_value_operators.py} +0 -0
  401. /endoreg_db/utils/requirement_operator_logic/{model_evaluators.py → _old/model_evaluators.py} +0 -0
  402. {endoreg_db-0.8.8.0.dist-info → endoreg_db-0.8.8.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,1480 +0,0 @@
1
- """
2
- PDF import service module.
3
-
4
- Provides high-level functions for importing and anonymizing PDF files,
5
- combining RawPdfFile creation with text extraction and anonymization using lx anonymizer.
6
-
7
- All Fields should be overwritten from anonymizer defaults except for the center which is given.
8
- """
9
-
10
- import errno
11
- import hashlib
12
- import logging
13
- import os
14
- import shutil
15
- import sys
16
- import time
17
- from contextlib import contextmanager
18
- from datetime import date, datetime
19
- from pathlib import Path
20
- from typing import TYPE_CHECKING, Union
21
- import subprocess
22
- from django.db import transaction
23
- from django.core.exceptions import ObjectDoesNotExist
24
- import lx_anonymizer
25
-
26
- from endoreg_db.models import SensitiveMeta
27
- from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
28
- from endoreg_db.models.state.raw_pdf import RawPdfState
29
- from endoreg_db.utils import paths as path_utils
30
-
31
- logger = logging.getLogger(__name__)
32
-
33
- # Treat lock files older than this as stale and reclaim them (in seconds)
34
- STALE_LOCK_SECONDS = 600
35
-
36
- if TYPE_CHECKING:
37
- pass # RawPdfFile already imported above
38
-
39
-
40
- class PdfImportService:
41
- """
42
- Service class for importing and processing PDF files with text extraction and anonymization.
43
- Uses a central PDF instance pattern for cleaner state management.
44
-
45
- Supports two processing modes:
46
- - 'blackening': Simple PDF masking with black rectangles over sensitive areas
47
- - 'cropping': Advanced mode that crops sensitive regions to separate images
48
- """
49
-
50
- def __init__(
51
- self, allow_meta_overwrite: bool = True, processing_mode: str = "blackening"
52
- ):
53
- """
54
- Initialize the PDF import service.
55
-
56
- Args:
57
- allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
58
- processing_mode: Processing mode - 'blackening' for simple masking, 'cropping' for advanced cropping
59
- """
60
- self.processed_files = set()
61
- self._report_reader_available = None
62
- self._report_reader_class = None
63
- self.allow_meta_overwrite = allow_meta_overwrite
64
-
65
- # Validate and set processing mode
66
- valid_modes = ["blackening", "cropping"]
67
- if processing_mode not in valid_modes:
68
- raise ValueError(
69
- f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}"
70
- )
71
- self.processing_mode = processing_mode
72
-
73
- # Central PDF instance management
74
- self.current_pdf = None
75
- self.current_pdf_state = None
76
- self.processing_context = {}
77
- self.original_path = None
78
-
79
- self.DEFAULT_PATIENT_FIRST_NAME = "Patient"
80
- self.DEFAULT_PATIENT_LAST_NAME = "Unknown"
81
- self.DEFAULT_PATIENT_DOB = date(1990, 1, 1)
82
- self.DEFAULT_CENTER_NAME = "university_hospital_wuerzburg"
83
-
84
- @classmethod
85
- def with_blackening(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
86
- """
87
- Create a PdfImportService configured for simple PDF blackening mode.
88
-
89
- Args:
90
- allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
91
-
92
- Returns:
93
- PdfImportService instance configured for blackening mode
94
- """
95
- return cls(
96
- allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening"
97
- )
98
-
99
- @classmethod
100
- def with_cropping(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
101
- """
102
- Create a PdfImportService configured for advanced cropping mode.
103
-
104
- Args:
105
- allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
106
-
107
- Returns:
108
- PdfImportService instance configured for cropping mode
109
- """
110
- return cls(
111
- allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping"
112
- )
113
-
114
- @contextmanager
115
- def _file_lock(self, path: Path):
116
- """Create a file lock to prevent duplicate processing.
117
- Handles stale lock files by reclaiming after STALE_LOCK_SECONDS.
118
- """
119
- lock_path = Path(str(path) + ".lock")
120
- fd = None
121
- try:
122
- try:
123
- # atomic create; fail if exists
124
- fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
125
- except FileExistsError:
126
- # Check for stale lock
127
- age = None
128
- try:
129
- st = os.stat(lock_path)
130
- age = time.time() - st.st_mtime
131
- except FileNotFoundError:
132
- # race: lock removed between exists and stat; just retry acquiring below
133
- pass
134
-
135
- if age is not None and age > STALE_LOCK_SECONDS:
136
- try:
137
- logger.warning(
138
- "Stale lock detected for %s (age %.0fs). Reclaiming lock...",
139
- path,
140
- age,
141
- )
142
- lock_path.unlink()
143
- except Exception as e:
144
- logger.warning(
145
- "Failed to remove stale lock %s: %s", lock_path, e
146
- )
147
- # retry acquire
148
- fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
149
- else:
150
- # Another worker is processing this file
151
-
152
- raise ValueError(f"File already being processed: {path}")
153
-
154
- os.write(fd, b"lock")
155
- os.close(fd)
156
- fd = None
157
- yield
158
- finally:
159
- try:
160
- if fd is not None:
161
- os.close(fd)
162
- if lock_path.exists():
163
- lock_path.unlink()
164
- except OSError:
165
- pass
166
-
167
- def _sha256(self, path: Path, chunk: int = 1024 * 1024) -> str:
168
- """Compute SHA256 hash of a file."""
169
- h = hashlib.sha256()
170
- with open(path, "rb") as f:
171
- while True:
172
- b = f.read(chunk)
173
- if not b:
174
- break
175
- h.update(b)
176
- return h.hexdigest()
177
-
178
    def _get_pdf_dir(self) -> Path | None:
        """Resolve the configured PDF directory to a concrete ``Path``.

        ``path_utils.PDF_DIR`` may be a real ``Path``, some other path-like
        object, or missing entirely; this normalizes all cases to a ``Path``
        or ``None`` without ever raising.
        """
        candidate = getattr(path_utils, "PDF_DIR", None)
        if isinstance(candidate, Path):
            return candidate
        if candidate is None:
            return None
        try:
            # Probe for path-like behavior: only objects supporting the "/"
            # operator (os.PathLike-style) survive this.
            derived = candidate / "."
        except Exception:
            derived = None

        if derived is not None:
            try:
                return Path(derived)
            except Exception:
                return None

        # Last resort: stringify whatever PDF_DIR is and treat it as a path.
        try:
            return Path(str(candidate))
        except Exception:
            return None
201
- def _quarantine(self, source: Path) -> Path:
202
- """Move file to quarantine directory to prevent re-processing."""
203
- qdir = path_utils.PDF_DIR / "_processing"
204
- qdir.mkdir(parents=True, exist_ok=True)
205
- target = qdir / source.name
206
- try:
207
- # Try atomic rename first (fastest when on same filesystem)
208
- source.rename(target)
209
- except OSError as exc:
210
- if exc.errno == errno.EXDEV:
211
- # Cross-device move, fall back to shutil.move which copies+removes
212
- shutil.move(str(source), str(target))
213
- else:
214
- raise
215
- lock_path = Path(str(source) + ".lock")
216
- if lock_path.exists():
217
- lock_path.unlink()
218
-
219
- return target
220
-
221
    def _ensure_state(self, pdf_file: "RawPdfFile"):
        """Return the state object for *pdf_file*, creating it when missing.

        NOTE(review): if the file has no existing state AND lacks a
        ``get_or_create_state`` method, this falls off the end and implicitly
        returns ``None`` — callers guard with ``if state:`` accordingly. Also,
        ``self.current_pdf_state`` is only updated on the creation path, not
        when an existing state is returned.
        """
        if getattr(pdf_file, "state", None):
            return pdf_file.state
        if hasattr(pdf_file, "get_or_create_state"):
            state = pdf_file.get_or_create_state()
            pdf_file.state = state
            self.current_pdf_state = state
            # Sanity check only; stripped when Python runs with -O.
            assert isinstance(self.current_pdf_state, RawPdfState)
            return state
233
    def _ensure_report_reading_available(self):
        """Resolve the lx_anonymizer ``ReportReader`` class, caching the result.

        Tries a direct import first; on ImportError, honors the
        ``LX_ANONYMIZER_PATH`` environment variable by temporarily adding it
        to ``sys.path``. The outcome (found or not) is cached on the instance
        so the import machinery runs at most once per service.

        Returns:
            Tuple of (availability_flag, ReportReader_class_or_None).
        """
        # Cached result from a previous call — None means "not resolved yet".
        if self._report_reader_available is not None:
            return self._report_reader_available, self._report_reader_class

        try:
            # Try direct import first
            from lx_anonymizer import ReportReader

            logger.info("Successfully imported lx_anonymizer ReportReader module")
            self._report_reader_available = True
            self._report_reader_class = ReportReader
            return True, ReportReader

        except ImportError:
            # Optional: honor LX_ANONYMIZER_PATH=/abs/path/to/src
            import importlib

            extra = os.getenv("LX_ANONYMIZER_PATH")
            if extra and extra not in sys.path and Path(extra).exists():
                sys.path.insert(0, extra)
                try:
                    mod = importlib.import_module("lx_anonymizer")
                    ReportReader = getattr(mod, "ReportReader")
                    logger.info(
                        "Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH"
                    )
                    self._report_reader_available = True
                    self._report_reader_class = ReportReader
                    return True, ReportReader
                except Exception as e:
                    logger.warning(
                        "Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e
                    )
                finally:
                    # Keep path for future imports if it worked; otherwise remove.
                    # The locals() probe detects whether the import above bound
                    # ReportReader before failing.
                    if "ReportReader" not in locals() and extra in sys.path:
                        sys.path.remove(extra)

            self._report_reader_available = False
            self._report_reader_class = None
            return False, None
281
    def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile | None") -> None:
        """
        Ensure the PDF has minimum required patient data in SensitiveMeta.

        Creates a SensitiveMeta populated with the service's placeholder
        defaults when none exists yet. Falls back to ``self.current_pdf``
        when *pdf_instance* is None.

        Args:
            pdf_instance: Optional specific PDF instance; defaults to
                ``self.current_pdf``.
        """
        pdf_file = pdf_instance or self.current_pdf
        if not pdf_file:
            logger.warning(
                "No PDF instance available for ensuring default patient data"
            )
            return

        if not pdf_file.sensitive_meta:
            logger.info(
                f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default"
            )

            # Create default SensitiveMeta with placeholder data
            default_data = {
                "patient_first_name": self.DEFAULT_PATIENT_FIRST_NAME,
                "patient_last_name": self.DEFAULT_PATIENT_LAST_NAME,
                "patient_dob": self.DEFAULT_PATIENT_DOB,
                "examination_date": date.today(),  # today is intentionally *not* a constant
                "center_name": (
                    pdf_file.center.name
                    if pdf_file.center
                    else self.DEFAULT_CENTER_NAME
                ),
            }

            # Creation is best-effort: a failure is logged, not raised, so the
            # import pipeline can continue without metadata.
            try:
                sensitive_meta = SensitiveMeta.create_from_dict(default_data)
                pdf_file.sensitive_meta = sensitive_meta
                pdf_file.save(update_fields=["sensitive_meta"])
                logger.info(
                    f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}"
                )
            except Exception as e:
                logger.error(
                    f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}"
                )
328
    def import_and_anonymize(
        self,
        file_path: Union[Path, str],
        center_name: str,
        delete_source: bool = False,
        retry: bool = False,
    ) -> "RawPdfFile | None":
        """
        Import a PDF file and anonymize it using ReportReader.

        Orchestrates the full pipeline: context init, file validation, model
        creation/retrieval, environment setup, text/metadata processing, and
        finalization. The processing mode ('blackening' or 'cropping') comes
        from service initialization.

        Args:
            file_path: Path to the PDF file to import.
            center_name: Name of the center to associate with the PDF.
            delete_source: Whether to delete the source file after import.
            retry: Whether this is a retry attempt for an existing record.

        Returns:
            RawPdfFile instance after import and processing, or None when the
            file is already being processed by another worker.

        Raises:
            ObjectDoesNotExist: If no PDF instance could be created.
            Exception: Any other failure during import or processing.
        """
        try:
            # Initialize processing context
            self._initialize_processing_context(
                file_path, center_name, delete_source, retry
            )

            # Step 1: Validate and prepare file
            self._validate_and_prepare_file()

            # Step 2: Create or retrieve PDF instance
            self._create_or_retrieve_pdf_instance()

            # NOTE(review): despite the log text, this raises a bare
            # ObjectDoesNotExist rather than returning None.
            if not self.current_pdf:
                logger.warning(
                    f"No PDF instance created for {file_path}, returning None"
                )
                raise ObjectDoesNotExist
            # Step 3: Setup processing environment
            self._setup_processing_environment()

            # Step 4: Process text and metadata
            self._process_text_and_metadata()

            # Step 5: Finalize processing
            self._finalize_processing()

            return self.current_pdf

        except ValueError as e:
            # "File already being processed" is an expected collision with a
            # concurrent worker: skip quietly (returns None).
            if "already being processed" in str(e):
                logger.info(f"Skipping file {file_path}: {e}")
                return
            else:
                logger.error(f"PDF import failed for {file_path}: {e}")
                self._cleanup_on_error()
                raise
        except Exception as e:
            logger.error(f"PDF import failed for {file_path}: {e}")
            # Cleanup on error
            self._cleanup_on_error()
            raise
        finally:
            # Always cleanup context, even on success or early return.
            self._cleanup_processing_context()
402
- def _initialize_processing_context(
403
- self,
404
- file_path: Union[Path, str],
405
- center_name: str,
406
- delete_source: bool,
407
- retry: bool,
408
- ):
409
- """Initialize the processing context for the current PDF."""
410
- self.processing_context = {
411
- "file_path": Path(file_path),
412
- "original_file_path": Path(file_path),
413
- "center_name": center_name,
414
- "delete_source": delete_source,
415
- "retry": retry,
416
- "file_hash": None,
417
- "processing_started": False,
418
- "text_extracted": False,
419
- "metadata_processed": False,
420
- "anonymization_completed": False,
421
- }
422
- self.original_path = Path(file_path)
423
-
424
- # Check if already processed (only during current session to prevent race conditions)
425
- if str(file_path) in self.processed_files:
426
- logger.info(
427
- f"File {file_path} already being processed in current session, skipping"
428
- )
429
- raise ValueError("File already being processed")
430
-
431
- logger.info(f"Starting import and processing for: {file_path}")
432
-
433
- def _validate_and_prepare_file(self):
434
- """Validate file existence and calculate hash."""
435
- file_path = self.processing_context["file_path"]
436
-
437
- if not file_path.exists():
438
- raise FileNotFoundError(f"PDF file not found: {file_path}")
439
-
440
- try:
441
- self.processing_context["file_hash"] = self._sha256(file_path)
442
- except Exception as e:
443
- logger.warning(f"Could not calculate file hash: {e}")
444
- self.processing_context["file_hash"] = None
445
-
446
    def _create_or_retrieve_pdf_instance(self):
        """Create a new RawPdfFile or retrieve an existing one by hash.

        On a fresh import the file lock guards the duplicate check; fully
        processed duplicates short-circuit, unprocessed ones are re-queued via
        ``_retry_existing_pdf``. The result is stored in ``self.current_pdf``.
        """
        file_path = self.processing_context["file_path"]
        center_name = self.processing_context["center_name"]
        delete_source = self.processing_context["delete_source"]
        retry = self.processing_context["retry"]
        file_hash = self.processing_context["file_hash"]

        if not retry:
            # Check for existing PDF and handle duplicates under the file lock.
            with self._file_lock(file_path):
                existing = None
                # NOTE(review): exists()+get() issues two queries; the hash
                # guard keeps this off the path when hashing failed.
                if file_hash and RawPdfFile.objects.filter(pdf_hash=file_hash).exists():
                    existing = RawPdfFile.objects.get(pdf_hash=file_hash)

                if existing:
                    logger.info(f"Found existing RawPdfFile {existing.pdf_hash}")
                    if existing.text:
                        # Already processed: nothing more to do.
                        logger.info(
                            f"Existing PDF {existing.pdf_hash} already processed - returning"
                        )
                        self.current_pdf = existing
                        return
                    else:
                        # Exists but never finished processing: retry it.
                        logger.info(f"Reprocessing existing PDF {existing.pdf_hash}")
                        return self._retry_existing_pdf(existing)

        # Create new PDF instance (or fetch the existing one on retry).
        logger.info("Creating new RawPdfFile instance...")
        from django.db import IntegrityError

        try:
            if not retry:
                self.current_pdf = RawPdfFile.create_from_file_initialized(
                    file_path=file_path,
                    center_name=center_name,
                    delete_source=delete_source,
                )
            else:
                # Retrieve existing for retry
                self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
                logger.info(
                    f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}"
                )

                # Check if retry is actually needed
                if self.current_pdf.text:
                    logger.info(
                        f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning"
                    )
                    return

            if not self.current_pdf:
                raise RuntimeError("Failed to create RawPdfFile instance")

            logger.info(f"PDF instance ready: {self.current_pdf.pdf_hash}")

        except IntegrityError:
            # Race condition - another worker created the row between our
            # duplicate check and the insert; adopt that row instead.
            if file_hash:
                self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
                logger.info("Race condition detected, using existing RawPdfFile")
            else:
                raise
512
    def _setup_processing_environment(self):
        """Prepare the sensitive working copy and mark processing as started.

        Recovers the PDF instance by hash if it is missing, copies the source
        into the sensitive storage area, points the context's file_path at the
        copy, and initializes the RawPdfState.

        Raises:
            RuntimeError: If neither a context path nor a stored PDF record
                can be resolved.
        """
        original_path = self.processing_context.get("file_path")
        if not original_path or not self.current_pdf:
            try:
                # Fallback: re-resolve the model row from the recorded hash.
                self.current_pdf = RawPdfFile.objects.get(pdf_hash=self.processing_context["file_hash"])
                self.original_path = Path(str(self.current_pdf.file.path))

            except RawPdfFile.DoesNotExist:
                raise RuntimeError("Processing environment setup failed")
        # Create sensitive file copy.
        # NOTE(review): create_sensitive_file is defined elsewhere in this
        # module — presumably it copies the source into protected storage.
        if original_path is None or not isinstance(original_path, (str, Path)):
            logger.error(f"No original path: {original_path!r}")
            return
        self.create_sensitive_file(self.current_pdf, original_path)

        # Update file path to point to sensitive copy; all further processing
        # reads from that copy, not the watched inbox.
        self.processing_context["file_path"] = self.current_pdf.file.path
        self.processing_context["sensitive_copy_created"] = True
        try:
            self.processing_context["sensitive_file_path"] = Path(
                self.current_pdf.file.path
            )
        except Exception:
            self.processing_context["sensitive_file_path"] = None

        # Ensure state exists and record that processing has begun.
        state = self.current_pdf.get_or_create_state()
        state.mark_processing_started()
        self.processing_context["processing_started"] = True

        # Mark as processed to prevent duplicates within this session.
        self.processed_files.add(str(self.processing_context["file_path"]))

        # Ensure default patient data
        logger.info("Ensuring default patient data...")
        self._ensure_default_patient_data(self.current_pdf)
550
- def _process_text_and_metadata(self):
551
- """Process text extraction and metadata using ReportReader."""
552
- report_reading_available, ReportReaderCls = self._ensure_report_reading_available()
553
- try:
554
- assert ReportReaderCls is not None and report_reading_available
555
- assert self.current_pdf is not None
556
- except AssertionError as e:
557
- logger.error(f"PDF Import failed on Error:{e} Ensure the pdf was passed correctly and report reading is available in function _process_text_and_metadata() ")
558
- if not report_reading_available:
559
- logger.warning("Report reading not available (lx_anonymizer not found)")
560
- self._mark_processing_incomplete("no_report_reader")
561
- return
562
- assert self.current_pdf is not None
563
- if not self.current_pdf.file:
564
- logger.warning("No file available for text processing")
565
- self._mark_processing_incomplete("no_file")
566
- return
567
-
568
- try:
569
- logger.info(
570
- f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})..."
571
- )
572
- ReportReaderCls = lx_anonymizer.ReportReader
573
-
574
- # Initialize ReportReader
575
- report_reader = ReportReaderCls(
576
- report_root_path=str(path_utils.STORAGE_DIR),
577
- locale="de_DE",
578
- text_date_format="%d.%m.%Y",
579
- )
580
-
581
- if self.processing_mode == "cropping":
582
- # Use advanced cropping method (existing implementation)
583
- self._process_with_cropping(report_reader)
584
- else: # blackening mode
585
- # Use enhanced process_report with PDF masking
586
- self._process_with_blackening(report_reader)
587
-
588
- except Exception as e:
589
- logger.warning(f"Text processing failed: {e}")
590
- self._mark_processing_incomplete("text_processing_failed")
591
-
592
    def _process_with_blackening(self, report_reader):
        """Process the current PDF in simple blackening/masking mode.

        Runs ``report_reader.process_report`` with PDF masking enabled, stores
        the results in the processing context, and applies text, metadata and
        anonymized-file results to the model.

        Args:
            report_reader: An initialized lx_anonymizer ReportReader instance.
        """
        logger.info("Using simple PDF blackening mode...")

        # Setup anonymized output directory
        anonymized_dir = path_utils.PDF_DIR / "anonymized"
        anonymized_dir.mkdir(parents=True, exist_ok=True)
        assert self.current_pdf is not None
        # Output file is keyed by the PDF hash for uniqueness.
        pdf_hash = self.current_pdf.pdf_hash
        anonymized_output_path = anonymized_dir / f"{pdf_hash}_anonymized.pdf"

        # Process with enhanced process_report method (returns 4-tuple now)
        original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = (
            report_reader.process_report(
                pdf_path=self.processing_context["file_path"],
                create_anonymized_pdf=True,
                anonymized_pdf_output_path=str(anonymized_output_path),
            )
        )

        # Store results in context for the _apply_* helpers below.
        self.processing_context.update(
            {
                "original_text": original_text,
                "anonymized_text": anonymized_text,
                "extracted_metadata": extracted_metadata,
                "cropped_regions": None,  # Not available in blackening mode
                "anonymized_pdf_path": anonymized_pdf_path,
            }
        )

        # Apply results and flip the matching progress flags.
        if original_text:
            self._apply_text_results()
            self.processing_context["text_extracted"] = True

        if extracted_metadata:
            self._apply_metadata_results()
            self.processing_context["metadata_processed"] = True

        if anonymized_pdf_path:
            self._apply_anonymized_pdf()
            self.processing_context["anonymization_completed"] = True

        logger.info("PDF blackening processing completed")
639
    def _process_with_cropping(self, report_reader):
        """Process the current PDF in advanced cropping mode.

        Runs ``report_reader.process_report_with_cropping`` (which also yields
        the cropped sensitive regions), stores the results in the processing
        context, and applies text, metadata and anonymized-file results.

        Args:
            report_reader: An initialized lx_anonymizer ReportReader instance.
        """
        logger.info("Using advanced cropping mode...")

        # Setup output directories for crops and the anonymized PDF.
        crops_dir = path_utils.PDF_DIR / "cropped_regions"
        anonymized_dir = path_utils.PDF_DIR / "anonymized"
        crops_dir.mkdir(parents=True, exist_ok=True)
        anonymized_dir.mkdir(parents=True, exist_ok=True)

        # Process with cropping (returns 5-tuple, one more than blackening).
        (
            original_text,
            anonymized_text,
            extracted_metadata,
            cropped_regions,
            anonymized_pdf_path,
        ) = report_reader.process_report_with_cropping(
            pdf_path=self.processing_context["file_path"],
            crop_sensitive_regions=True,
            crop_output_dir=str(crops_dir),
            anonymization_output_dir=str(anonymized_dir),
        )

        # Store results in context for the _apply_* helpers below.
        self.processing_context.update(
            {
                "original_text": original_text,
                "anonymized_text": anonymized_text,
                "extracted_metadata": extracted_metadata,
                "cropped_regions": cropped_regions,
                "anonymized_pdf_path": anonymized_pdf_path,
            }
        )

        # Apply results and flip the matching progress flags.
        if original_text:
            self._apply_text_results()
            self.processing_context["text_extracted"] = True

        if extracted_metadata:
            self._apply_metadata_results()
            self.processing_context["metadata_processed"] = True

        if anonymized_pdf_path:
            self._apply_anonymized_pdf()
            self.processing_context["anonymization_completed"] = True

        logger.info("PDF cropping processing completed")
689
- def _apply_text_results(self):
690
- """Apply text extraction results to the PDF instance."""
691
- if not self.current_pdf:
692
- logger.warning("Cannot apply text results - no PDF instance available")
693
- return
694
-
695
- original_text = self.processing_context.get("original_text")
696
- anonymized_text = self.processing_context.get("anonymized_text")
697
-
698
- if not original_text:
699
- logger.warning("No original text available to apply")
700
- return
701
-
702
- # Store extracted text
703
- self.current_pdf.text = original_text
704
- logger.info(f"Extracted {len(original_text)} characters of text from PDF")
705
-
706
- # Handle anonymized text
707
- if anonymized_text and anonymized_text != original_text:
708
- self.current_pdf.anonymized = True
709
- logger.info("PDF text anonymization completed")
710
-
711
    def _apply_metadata_results(self):
        """Merge extracted report metadata into the PDF's SensitiveMeta.

        For each known metadata key, the value is validated (field-name echoes
        are skipped, dates are parsed) and written only when the overwrite
        policy allows it: either ``allow_meta_overwrite`` is set or the
        current value is a placeholder.
        """
        if not self.current_pdf:
            logger.warning("Cannot apply metadata results - no PDF instance available")
            return

        extracted_metadata = self.processing_context.get("extracted_metadata")

        if not self.current_pdf.sensitive_meta or not extracted_metadata:
            logger.debug("No sensitive meta or extracted metadata available")
            return

        sm = self.current_pdf.sensitive_meta

        # Map ReportReader metadata keys to SensitiveMeta field names
        # (currently 1:1, but kept explicit so renames stay localized).
        metadata_mapping = {
            "patient_first_name": "patient_first_name",
            "patient_last_name": "patient_last_name",
            "patient_dob": "patient_dob",
            "examination_date": "examination_date",
            "examiner_first_name": "examiner_first_name",
            "examiner_last_name": "examiner_last_name",
            "endoscope_type": "endoscope_type",
            "casenumber": "casenumber",
            "center_name": "center_name",
        }

        # Update fields with extracted information
        updated_fields = []
        for meta_key, sm_field in metadata_mapping.items():
            if extracted_metadata.get(meta_key) and hasattr(sm, sm_field):
                old_value = getattr(sm, sm_field)
                raw_value = extracted_metadata[meta_key]

                # Skip if we just got the field name as a string (indicates no actual data)
                if isinstance(raw_value, str) and raw_value == meta_key:
                    continue

                # Handle date fields specially: parse failures skip the field.
                if sm_field in ["patient_dob", "examination_date"]:
                    new_value = self._parse_date_field(raw_value, meta_key, sm_field)
                    if new_value is None:
                        continue
                else:
                    new_value = raw_value

                # Configurable overwrite policy.
                # NOTE(review): _is_placeholder_value is defined elsewhere in
                # this module — presumably it recognizes the DEFAULT_* values.
                should_overwrite = (
                    self.allow_meta_overwrite
                    or self._is_placeholder_value(sm_field, old_value)
                )

                if new_value and should_overwrite:
                    setattr(sm, sm_field, new_value)
                    updated_fields.append(sm_field)

        # Single save at the end to avoid per-field writes.
        if updated_fields:
            sm.save()
            logger.info(f"Updated SensitiveMeta fields: {updated_fields}")
771
- def _parse_date_field(self, raw_value, meta_key, sm_field):
772
- """Parse date field with error handling."""
773
- try:
774
- if isinstance(raw_value, str):
775
- # Skip if the value is just the field name itself
776
- if raw_value == meta_key:
777
- logger.warning(
778
- "Skipping date field %s - got field name '%s' instead of actual date",
779
- sm_field,
780
- raw_value,
781
- )
782
- return None
783
-
784
- # Try common date formats
785
- date_formats = ["%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y"]
786
- for fmt in date_formats:
787
- try:
788
- return datetime.strptime(raw_value, fmt).date()
789
- except ValueError:
790
- continue
791
-
792
- logger.warning(
793
- "Could not parse date '%s' for field %s", raw_value, sm_field
794
- )
795
- return None
796
-
797
- elif hasattr(raw_value, "date"):
798
- return raw_value.date()
799
- else:
800
- return raw_value
801
-
802
- except (ValueError, AttributeError) as e:
803
- logger.warning("Date parsing failed for %s: %s", sm_field, e)
804
- return None
805
-
806
    # from gc-08
    def _apply_anonymized_pdf(self):
        """
        Attach the already-generated anonymized PDF without copying bytes.

        We do NOT re-upload or re-save file bytes via Django storage (which would
        place a new file under upload_to='raw_pdfs' and retrigger the watcher).
        Instead, we point the FileField to the path that the anonymizer already
        wrote (ideally relative to STORAGE_DIR). Additionally, we make sure the
        model/state reflect that anonymization is done even if text didn't change.
        """
        if not self.current_pdf:
            logger.warning("Cannot apply anonymized PDF - no PDF instance available")
            return

        anonymized_pdf_path = self.processing_context.get("anonymized_pdf_path")
        if not anonymized_pdf_path:
            logger.debug("No anonymized_pdf_path present in processing context")
            return

        anonymized_path = Path(anonymized_pdf_path)
        if not anonymized_path.exists():
            logger.warning(
                "Anonymized PDF path returned but file does not exist: %s",
                anonymized_path,
            )
            return

        logger.info("Anonymized PDF created by ReportReader at: %s", anonymized_path)

        try:
            # Prefer storing a path relative to STORAGE_DIR so Django serves it correctly
            try:
                relative_name = str(anonymized_path.relative_to(path_utils.STORAGE_DIR))
            except ValueError:
                # Fallback to absolute path if the file lives outside STORAGE_DIR
                relative_name = str(anonymized_path)

            # Only update if something actually changed (avoids no-op saves).
            if getattr(self.current_pdf.anonymized_file, "name", None) != relative_name:
                self.current_pdf.anonymized_file.name = relative_name

            # Ensure model/state reflect anonymization even if text didn't differ
            if not getattr(self.current_pdf, "anonymized", False):
                self.current_pdf.anonymized = True

            # Persist cropped regions info somewhere useful (optional & non-breaking)
            # If your model has a field for this, persist there; otherwise we just log.
            cropped_regions = self.processing_context.get("cropped_regions")
            if cropped_regions:
                logger.debug(
                    "Cropped regions recorded (%d regions).", len(cropped_regions)
                )

            # Save model changes; 'anonymized' is only included when the
            # instance actually carries that attribute in its __dict__.
            update_fields = ["anonymized_file"]
            if "anonymized" in self.current_pdf.__dict__:
                update_fields.append("anonymized")
            self.current_pdf.save(update_fields=update_fields)

            # Mark state as anonymized immediately; this keeps downstream flows working
            state = self._ensure_state(self.current_pdf)

            if state and not state.processing_started:
                state.mark_processing_started()

            logger.info(
                "Updated anonymized_file reference to: %s",
                self.current_pdf.anonymized_file.name,
            )

        except Exception as e:
            # Best-effort: a failed reference update must not abort the import.
            logger.warning("Could not set anonymized file reference: %s", e)
880
    def _finalize_processing(self):
        """Persist the processed PDF and advance its state machine.

        Marks the state anonymized when text was extracted and
        sensitive-meta-processed when anonymization completed, then saves the
        model and state atomically. Failures are logged, not raised.
        """
        if not self.current_pdf:
            logger.warning("Cannot finalize processing - no PDF instance available")
            return

        try:
            # Update state based on processing results
            state = self._ensure_state(self.current_pdf)

            if self.processing_context.get("text_extracted") and state:
                state.mark_anonymized()

            # Mark as ready for validation after successful anonymization
            if self.processing_context.get("anonymization_completed") and state:
                state.mark_sensitive_meta_processed()
                logger.info(
                    f"PDF {self.current_pdf.pdf_hash} processing completed - "
                    f"ready for validation (status: {state.anonymization_status})"
                )

            # Save all changes in one transaction so model and state agree.
            with transaction.atomic():
                self.current_pdf.save()
                if state:
                    state.save()

            logger.info("PDF processing completed successfully")
        except Exception as e:
            logger.warning(f"Failed to finalize processing: {e}")
911
- def _mark_processing_incomplete(self, reason: str):
912
- """Mark processing as incomplete with reason."""
913
- if not self.current_pdf:
914
- logger.warning(
915
- f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}"
916
- )
917
- return
918
-
919
- try:
920
- state = self._ensure_state(self.current_pdf)
921
- if state:
922
- state.text_meta_extracted = False
923
- state.pdf_meta_extracted = False
924
- state.sensitive_meta_processed = False
925
- state.save()
926
- logger.info(f"Set PDF state: processed=False due to {reason}")
927
-
928
- # Save changes
929
- with transaction.atomic():
930
- self.current_pdf.save()
931
- except Exception as e:
932
- logger.warning(f"Failed to mark processing incomplete: {e}")
933
-
934
- def _retry_existing_pdf(self, existing_pdf):
935
- """
936
- Retry processing for existing PDF.
937
-
938
- Uses get_raw_file_path() to find the original raw file instead of
939
- relying on the file field which may point to a deleted sensitive file.
940
- """
941
- try:
942
- # ✅ FIX: Use get_raw_file_path() to find original file
943
- raw_file_path = existing_pdf.get_raw_file_path()
944
-
945
- if not raw_file_path or not raw_file_path.exists():
946
- logger.error(
947
- f"Cannot retry PDF {existing_pdf.pdf_hash}: Raw file not found. "
948
- f"Please re-upload the original PDF file."
949
- )
950
- self.current_pdf = existing_pdf
951
- return existing_pdf
952
-
953
- logger.info(f"Found raw file for retry at: {raw_file_path}")
954
-
955
- # Remove from processed files to allow retry
956
- file_path_str = str(raw_file_path)
957
- if file_path_str in self.processed_files:
958
- self.processed_files.remove(file_path_str)
959
- logger.debug(f"Removed {file_path_str} from processed files for retry")
960
-
961
- return self.import_and_anonymize(
962
- file_path=raw_file_path, # ✅ Use raw file path, not sensitive path
963
- center_name=existing_pdf.center.name
964
- if existing_pdf.center
965
- else "unknown_center",
966
- delete_source=False, # Never delete during retry
967
- retry=True,
968
- )
969
- except Exception as e:
970
- logger.error(
971
- f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}"
972
- )
973
- self.current_pdf = existing_pdf
974
- return existing_pdf
975
-
976
- def _cleanup_on_error(self):
977
- """Cleanup processing context on error."""
978
- original_path = self.original_path
979
- try:
980
- if self.current_pdf and hasattr(self.current_pdf, "state"):
981
- state = self._ensure_state(self.current_pdf)
982
- raw_file_path = self.current_pdf.get_raw_file_path()
983
- if raw_file_path is not None and original_path is not None:
984
- # Ensure reprocessing for next attempt by restoring original file
985
- shutil.copy2(str(raw_file_path), str(original_path))
986
-
987
- # Ensure no two files can remain
988
- if raw_file_path == original_path and raw_file_path is not None and original_path is not None:
989
- os.remove(str(raw_file_path))
990
-
991
-
992
- # Remove Lock file also
993
- lock_path = Path(str(path_utils.PDF_DIR) + ".lock")
994
- try:
995
- if lock_path.exists():
996
- lock_path.unlink()
997
- logger.info("Removed lock file during quarantine: %s", lock_path)
998
- except Exception as e:
999
- logger.warning("Could not remove lock file during quarantine: %s", e)
1000
-
1001
-
1002
- if state and self.processing_context.get("processing_started"):
1003
- state.text_meta_extracted = False
1004
- state.pdf_meta_extracted = False
1005
- state.sensitive_meta_processed = False
1006
- state.anonymized = False
1007
- state.save()
1008
- logger.debug("Updated PDF state to indicate processing failure")
1009
- else:
1010
- # 🔧 Early failure: no current_pdf (or no state).
1011
- # In this case we want to make sure we don't leave stray files
1012
- # under PDF_DIR or PDF_DIR/sensitive.
1013
-
1014
- pdf_dir = self._get_pdf_dir()
1015
- if pdf_dir and pdf_dir.exists():
1016
- for candidate_dir in (pdf_dir, pdf_dir / "sensitive"):
1017
- if candidate_dir.exists():
1018
- for candidate in candidate_dir.glob("*.pdf"):
1019
- # Don't delete the original ingress file
1020
- if (
1021
- original_path is not None
1022
- and candidate.resolve() == Path(original_path).resolve()
1023
- ):
1024
- continue
1025
- try:
1026
- candidate.unlink()
1027
- logger.debug(
1028
- "Removed stray PDF during early error cleanup: %s",
1029
- candidate,
1030
- )
1031
- except Exception as e:
1032
- logger.warning(
1033
- "Failed to remove stray PDF %s: %s",
1034
- candidate,
1035
- e,
1036
- )
1037
-
1038
- except Exception as e:
1039
- logger.warning(f"Error during cleanup: {e}")
1040
- finally:
1041
- # Remove any sensitive copy created during this processing run
1042
- sensitive_created = self.processing_context.get("sensitive_copy_created")
1043
- if sensitive_created:
1044
- pdf_obj = self.current_pdf
1045
- try:
1046
- if pdf_obj:
1047
- file_field = getattr(pdf_obj, "file", None)
1048
- if file_field and getattr(file_field, "name", None):
1049
- storage_name = file_field.name
1050
- file_field.delete(save=False)
1051
- logger.debug(
1052
- "Deleted sensitive copy %s during error cleanup",
1053
- storage_name,
1054
- )
1055
- except Exception as cleanup_exc:
1056
- logger.warning(
1057
- "Failed to remove sensitive copy during error cleanup: %s",
1058
- cleanup_exc,
1059
- )
1060
- pdf_dir = self._get_pdf_dir()
1061
- if original_path and pdf_dir:
1062
- # Try to remove any extra file that was created during import
1063
- # Simplest heuristic: same basename as original, but in pdf dir or pdf/sensitive dir
1064
- for candidate_dir in (pdf_dir, pdf_dir / "sensitive"):
1065
- candidate = candidate_dir / original_path.name
1066
- if candidate.exists() and candidate != original_path:
1067
- try:
1068
- candidate.unlink()
1069
- logger.debug(
1070
- "Removed stray PDF copy during early error cleanup: %s",
1071
- candidate,
1072
- )
1073
- except Exception as e:
1074
- logger.warning(
1075
- "Failed to remove stray PDF copy %s: %s",
1076
- candidate,
1077
- e,
1078
- )
1079
-
1080
- # Always clean up processed files set to prevent blocks
1081
- file_path = self.processing_context.get("file_path")
1082
- if file_path and str(file_path) in self.processed_files:
1083
- self.processed_files.remove(str(file_path))
1084
- logger.debug(
1085
- f"Removed {file_path} from processed files during error cleanup"
1086
- )
1087
-
1088
- try:
1089
- raw_dir = (
1090
- original_path.parent if isinstance(original_path, Path) else None
1091
- )
1092
-
1093
- pdf_dir = self._get_pdf_dir()
1094
- if not pdf_dir and raw_dir:
1095
- base_dir = raw_dir.parent
1096
- dir_name = getattr(path_utils, "PDF_DIR_NAME", "pdfs")
1097
- fallback_pdf_dir = base_dir / dir_name
1098
- logger.debug(
1099
- "PDF cleanup fallback resolution - base: %s, dir_name: %s, exists: %s",
1100
- base_dir,
1101
- dir_name,
1102
- fallback_pdf_dir.exists(),
1103
- )
1104
- if fallback_pdf_dir.exists():
1105
- pdf_dir = fallback_pdf_dir
1106
-
1107
- # Remove empty PDF subdirectories that might have been created during setup
1108
- if pdf_dir and pdf_dir.exists():
1109
- for subdir_name in (
1110
- "sensitive",
1111
- "cropped_regions",
1112
- "anonymized",
1113
- "_processing",
1114
- ):
1115
- subdir_path = pdf_dir / subdir_name
1116
- if subdir_path.exists() and subdir_path.is_dir():
1117
- try:
1118
- next(subdir_path.iterdir())
1119
- except StopIteration:
1120
- try:
1121
- subdir_path.rmdir()
1122
- logger.debug(
1123
- "Removed empty directory %s during error cleanup",
1124
- subdir_path,
1125
- )
1126
- except OSError as rm_err:
1127
- logger.debug(
1128
- "Could not remove directory %s: %s",
1129
- subdir_path,
1130
- rm_err,
1131
- )
1132
- except Exception as iter_err:
1133
- logger.debug(
1134
- "Could not inspect directory %s: %s",
1135
- subdir_path,
1136
- iter_err,
1137
- )
1138
-
1139
- raw_count = (
1140
- len(list(raw_dir.glob("*")))
1141
- if raw_dir and raw_dir.exists()
1142
- else None
1143
- )
1144
- pdf_count = (
1145
- len(list(pdf_dir.glob("*")))
1146
- if pdf_dir and pdf_dir.exists()
1147
- else None
1148
- )
1149
-
1150
- sensitive_path = self.processing_context.get("sensitive_file_path")
1151
- if sensitive_path:
1152
- sensitive_parent = Path(sensitive_path).parent
1153
- sensitive_count = (
1154
- len(list(sensitive_parent.glob("*")))
1155
- if sensitive_parent.exists()
1156
- else None
1157
- )
1158
- else:
1159
- sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
1160
- sensitive_count = (
1161
- len(list(sensitive_dir.glob("*")))
1162
- if sensitive_dir and sensitive_dir.exists()
1163
- else None
1164
- )
1165
-
1166
- logger.info(
1167
- "PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
1168
- raw_count,
1169
- pdf_count,
1170
- sensitive_count,
1171
- )
1172
- except Exception:
1173
- pass
1174
-
1175
- def _cleanup_processing_context(self):
1176
- """Cleanup processing context."""
1177
- try:
1178
- # Clean up temporary directories
1179
- if self.processing_context.get("text_extracted"):
1180
- crops_dir = path_utils.PDF_DIR / "cropped_regions"
1181
- if crops_dir.exists() and not any(crops_dir.iterdir()):
1182
- crops_dir.rmdir()
1183
-
1184
- # Always remove from processed files set after processing attempt
1185
- file_path = self.processing_context.get("file_path")
1186
- if file_path and str(file_path) in self.processed_files:
1187
- self.processed_files.remove(str(file_path))
1188
- logger.debug(f"Removed {file_path} from processed files set")
1189
-
1190
- except Exception as e:
1191
- logger.warning(f"Error during context cleanup: {e}")
1192
- finally:
1193
- # Reset context
1194
- self.current_pdf = None
1195
- self.processing_context = {}
1196
-
1197
- def import_simple(
1198
- self, file_path: Union[Path, str], center_name: str, delete_source: bool = False
1199
- ) -> "RawPdfFile":
1200
- """
1201
- Simple PDF import without text processing or anonymization.
1202
- Uses centralized PDF instance management pattern.
1203
-
1204
- Args:
1205
- file_path: Path to the PDF file to import
1206
- center_name: Name of the center to associate with PDF
1207
- delete_source: Whether to delete the source file after import
1208
-
1209
- Returns:
1210
- RawPdfFile instance after basic import
1211
- """
1212
- try:
1213
- # Initialize simple processing context
1214
- self._initialize_processing_context(
1215
- file_path, center_name, delete_source, False
1216
- )
1217
-
1218
- # Validate file
1219
- self._validate_and_prepare_file()
1220
-
1221
- # Create PDF instance
1222
- logger.info("Starting simple import - creating RawPdfFile instance...")
1223
- self.current_pdf = RawPdfFile.create_from_file_initialized(
1224
- file_path=self.processing_context["file_path"],
1225
- center_name=center_name,
1226
- delete_source=delete_source,
1227
- )
1228
-
1229
- if not self.current_pdf:
1230
- raise RuntimeError("Failed to create RawPdfFile instance")
1231
-
1232
- # Mark as processed
1233
- self.processed_files.add(str(self.processing_context["file_path"]))
1234
-
1235
- # Set basic state for simple import
1236
- state = self._ensure_state(self.current_pdf)
1237
- if state:
1238
- state.text_meta_extracted = False
1239
- state.pdf_meta_extracted = False
1240
- state.sensitive_meta_processed = False
1241
- state.save()
1242
- logger.info("Set PDF state: processed=False for simple import")
1243
-
1244
- # Save changes
1245
- with transaction.atomic():
1246
- self.current_pdf.save()
1247
-
1248
- logger.info(
1249
- "Simple import completed for RawPdfFile hash: %s",
1250
- self.current_pdf.pdf_hash,
1251
- )
1252
- return self.current_pdf
1253
-
1254
- except Exception as e:
1255
- logger.error(f"Simple PDF import failed for {file_path}: {e}")
1256
- self._cleanup_on_error()
1257
- raise
1258
- finally:
1259
- self._cleanup_processing_context()
1260
-
1261
- def check_storage_capacity(
1262
- self, file_path: Union[Path, str], storage_root, min_required_space
1263
- ) -> bool:
1264
- """
1265
- Check if there is sufficient storage capacity for the PDF file.
1266
-
1267
- Args:
1268
- file_path: Path to the PDF file to check
1269
-
1270
- Raises:
1271
- InsufficientStorageError: If there is not enough space
1272
- """
1273
- import shutil
1274
-
1275
- from endoreg_db.exceptions import InsufficientStorageError
1276
-
1277
- file_path = Path(file_path)
1278
- if not file_path.exists():
1279
- raise FileNotFoundError(f"File not found for storage check: {file_path}")
1280
-
1281
- # Get the size of the file
1282
- file_size = file_path.stat().st_size
1283
-
1284
- # Get available space in the storage directory
1285
-
1286
- total, used, free = shutil.disk_usage(storage_root)
1287
-
1288
- if file_size:
1289
- min_required_space = file_size if isinstance(min_required_space, int) else 0
1290
-
1291
- # Check if there is enough space
1292
- if file_size > free:
1293
- raise InsufficientStorageError(
1294
- f"Not enough space to store PDF file: {file_path}"
1295
- )
1296
- logger.info(
1297
- f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available"
1298
- )
1299
-
1300
- return True
1301
-
1302
    def create_sensitive_file(
        self, pdf_instance: "RawPdfFile", file_path: Union[Path, str]
    ) -> None:
        """
        Create a copy of the PDF file in the sensitive directory and update the file reference.
        Delete the source path to avoid duplicates.
        Uses the central PDF instance and processing context if parameters not provided.

        Ensures the FileField points to the file under STORAGE_DIR/pdfs/sensitive and never back to raw_pdfs.
        """
        # Fall back to the service-level context when the arguments are falsy.
        pdf_file = pdf_instance or self.current_pdf
        source_path = (
            Path(file_path) if file_path else self.processing_context.get("file_path")
        )

        if not pdf_file:
            raise ValueError("No PDF instance available for creating sensitive file")
        if not source_path:
            raise ValueError("No file path available for creating sensitive file")

        # Sensitive copies are named by content hash, keeping paths stable
        # across repeated imports of the same document.
        SENSITIVE_DIR = path_utils.PDF_DIR / "sensitive"
        target = SENSITIVE_DIR / f"{pdf_file.pdf_hash}.pdf"

        try:
            os.makedirs(SENSITIVE_DIR, exist_ok=True)

            # If source already is the target, just ensure FileField points correctly
            if source_path.resolve() == target.resolve():
                pass
            else:
                # Move the file from ingress to sensitive storage
                # Using replace semantics when target exists (re-import)
                if target.exists():
                    try:
                        target.unlink()
                    except Exception as e:
                        logger.warning(
                            "Could not remove existing sensitive target %s: %s",
                            target,
                            e,
                        )
                shutil.move(str(source_path), str(target))
                logger.info(f"Moved PDF to sensitive directory: {target}")

            # Update FileField to reference the file under STORAGE_DIR
            # We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
            try:
                relative_name = str(
                    target.relative_to(path_utils.STORAGE_DIR)
                )  # Point Django FileField to sensitive storage
            except ValueError:
                # Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
                relative_name = str(target)

            # Only update when changed
            if getattr(pdf_file.file, "name", None) != relative_name:
                pdf_file.file.name = relative_name
                # update_fields limits the write to the file reference column.
                pdf_file.save(update_fields=["file"])
                logger.info(
                    "Updated PDF FileField reference to sensitive path: %s",
                    pdf_file.file.path,
                )
            else:
                logger.debug(
                    "PDF FileField already points to sensitive path: %s",
                    pdf_file.file.path,
                )

            # Best-effort: if original source still exists (e.g., copy), remove it to avoid re-triggers
            try:
                if source_path.exists() and source_path != target:
                    os.remove(source_path)
                    logger.info(f"Removed original PDF file at ingress: {source_path}")
            except OSError as e:
                logger.warning(f"Could not delete original PDF file {source_path}: {e}")

        except Exception as e:
            # Best-effort overall: log with traceback but never fail the import.
            logger.warning(
                f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}",
                exc_info=True,
            )
- def archive_or_quarantine_file(
1385
- self,
1386
- pdf_instance: "RawPdfFile",
1387
- source_file_path: Union[Path, str],
1388
- quarantine_reason: str,
1389
- is_pdf_problematic: bool,
1390
- ) -> bool:
1391
- """
1392
- Archive or quarantine file based on the state of the PDF processing.
1393
- Uses the central PDF instance and processing context if parameters not provided.
1394
-
1395
- Args:
1396
- pdf_instance: Optional PDF instance, defaults to self.current_pdf
1397
- source_file_path: Optional source file path, defaults to processing_context['file_path']
1398
- quarantine_reason: Optional quarantine reason, defaults to processing_context['error_reason']
1399
- is_pdf_problematic: Optional override for problematic state
1400
-
1401
- Returns:
1402
- bool: True if file was quarantined, False if archived successfully
1403
- """
1404
- pdf_file = pdf_instance or self.current_pdf
1405
- file_path = (
1406
- Path(source_file_path)
1407
- if source_file_path
1408
- else self.processing_context.get("file_path")
1409
- )
1410
- quarantine_reason = str(quarantine_reason or self.processing_context.get(
1411
- "error_reason"
1412
- ))
1413
-
1414
- if not pdf_file:
1415
- raise ValueError("No PDF instance available for archiving/quarantine")
1416
- if not file_path:
1417
- raise ValueError("No file path available for archiving/quarantine")
1418
-
1419
- # Determine if the PDF is problematic
1420
- pdf_problematic = (
1421
- is_pdf_problematic
1422
- if is_pdf_problematic is not None
1423
- else pdf_file.is_problematic
1424
- )
1425
-
1426
- if pdf_problematic:
1427
- # Quarantine the file
1428
- logger.warning(
1429
- f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}"
1430
- )
1431
- quarantine_dir = path_utils.PDF_DIR / "quarantine"
1432
- os.makedirs(quarantine_dir, exist_ok=True)
1433
-
1434
- quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
1435
- try:
1436
- shutil.move(file_path, quarantine_path)
1437
- pdf_file.save(update_fields=["quarantine_reason"])
1438
- logger.info(f"Moved problematic PDF to quarantine: {quarantine_path}")
1439
- return True
1440
- except Exception as e:
1441
- logger.error(f"Failed to quarantine PDF {pdf_file.pdf_hash}: {e}")
1442
- return (
1443
- True # Still consider as quarantined to prevent further processing
1444
- )
1445
- else:
1446
- # Archive the file normally
1447
- logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
1448
- archive_dir = path_utils.PDF_DIR / "processed"
1449
- os.makedirs(archive_dir, exist_ok=True)
1450
-
1451
- archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
1452
- try:
1453
- shutil.move(file_path, archive_path)
1454
- logger.info(f"Moved processed PDF to archive: {archive_path}")
1455
- return False
1456
- except Exception as e:
1457
- logger.error(f"Failed to archive PDF {pdf_file.pdf_hash}: {e}")
1458
- return False
1459
-
1460
- def _is_placeholder_value(self, field_name: str, value) -> bool:
1461
- """Return True if a SensitiveMeta field still has a dummy/default value."""
1462
- if value is None:
1463
- return True
1464
-
1465
- # String placeholders
1466
- if isinstance(value, str):
1467
- if value in {self.DEFAULT_PATIENT_FIRST_NAME, self.DEFAULT_PATIENT_LAST_NAME}:
1468
- return True
1469
-
1470
- # Date placeholders
1471
- if isinstance(value, date):
1472
- # Default DOB
1473
- if field_name == "patient_dob" and value == self.DEFAULT_PATIENT_DOB:
1474
- return True
1475
- # "Today" exam date created as fallback – allow anonymizer to override
1476
- if field_name == "examination_date" and value == date.today():
1477
- return True
1478
-
1479
- return False
1480
-