endoreg-db 0.8.4.4__py3-none-any.whl → 0.8.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of endoreg-db might be problematic. Click here for more details.

Files changed (372):
  1. endoreg_db/authz/auth.py +74 -0
  2. endoreg_db/authz/backends.py +168 -0
  3. endoreg_db/authz/management/commands/list_routes.py +18 -0
  4. endoreg_db/authz/middleware.py +83 -0
  5. endoreg_db/authz/permissions.py +127 -0
  6. endoreg_db/authz/policy.py +218 -0
  7. endoreg_db/authz/views_auth.py +66 -0
  8. endoreg_db/config/env.py +13 -8
  9. endoreg_db/data/__init__.py +8 -31
  10. endoreg_db/data/_examples/disease.yaml +55 -0
  11. endoreg_db/data/_examples/disease_classification.yaml +13 -0
  12. endoreg_db/data/_examples/disease_classification_choice.yaml +62 -0
  13. endoreg_db/data/_examples/event.yaml +64 -0
  14. endoreg_db/data/_examples/examination.yaml +72 -0
  15. endoreg_db/data/_examples/finding/anatomy_colon.yaml +128 -0
  16. endoreg_db/data/_examples/finding/colonoscopy.yaml +40 -0
  17. endoreg_db/data/_examples/finding/colonoscopy_bowel_prep.yaml +56 -0
  18. endoreg_db/data/_examples/finding/complication.yaml +16 -0
  19. endoreg_db/data/_examples/finding/data.yaml +105 -0
  20. endoreg_db/data/_examples/finding/examination_setting.yaml +16 -0
  21. endoreg_db/data/_examples/finding/medication_related.yaml +18 -0
  22. endoreg_db/data/_examples/finding/outcome.yaml +12 -0
  23. endoreg_db/data/_examples/finding_classification/colonoscopy_bowel_preparation.yaml +68 -0
  24. endoreg_db/data/_examples/finding_classification/colonoscopy_jnet.yaml +22 -0
  25. endoreg_db/data/_examples/finding_classification/colonoscopy_kudo.yaml +25 -0
  26. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_circularity.yaml +20 -0
  27. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_planarity.yaml +24 -0
  28. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_size.yaml +68 -0
  29. endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_surface.yaml +20 -0
  30. endoreg_db/data/_examples/finding_classification/colonoscopy_location.yaml +80 -0
  31. endoreg_db/data/_examples/finding_classification/colonoscopy_lst.yaml +21 -0
  32. endoreg_db/data/_examples/finding_classification/colonoscopy_nice.yaml +20 -0
  33. endoreg_db/data/_examples/finding_classification/colonoscopy_paris.yaml +26 -0
  34. endoreg_db/data/_examples/finding_classification/colonoscopy_sano.yaml +22 -0
  35. endoreg_db/data/_examples/finding_classification/colonoscopy_summary.yaml +53 -0
  36. endoreg_db/data/_examples/finding_classification/complication_generic.yaml +25 -0
  37. endoreg_db/data/_examples/finding_classification/examination_setting_generic.yaml +40 -0
  38. endoreg_db/data/_examples/finding_classification/histology_colo.yaml +51 -0
  39. endoreg_db/data/_examples/finding_classification/intervention_required.yaml +26 -0
  40. endoreg_db/data/_examples/finding_classification/medication_related.yaml +23 -0
  41. endoreg_db/data/_examples/finding_classification/visualized.yaml +33 -0
  42. endoreg_db/data/_examples/finding_classification_choice/bowel_preparation.yaml +78 -0
  43. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_circularity_default.yaml +32 -0
  44. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_jnet.yaml +15 -0
  45. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_kudo.yaml +23 -0
  46. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_lst.yaml +15 -0
  47. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_nice.yaml +17 -0
  48. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_paris.yaml +57 -0
  49. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_planarity_default.yaml +49 -0
  50. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_sano.yaml +14 -0
  51. endoreg_db/data/_examples/finding_classification_choice/colon_lesion_surface_intact_default.yaml +36 -0
  52. endoreg_db/data/_examples/finding_classification_choice/colonoscopy_location.yaml +229 -0
  53. endoreg_db/data/_examples/finding_classification_choice/colonoscopy_not_complete_reason.yaml +19 -0
  54. endoreg_db/data/_examples/finding_classification_choice/colonoscopy_size.yaml +82 -0
  55. endoreg_db/data/_examples/finding_classification_choice/colonoscopy_summary_worst_finding.yaml +15 -0
  56. endoreg_db/data/_examples/finding_classification_choice/complication_generic_types.yaml +15 -0
  57. endoreg_db/data/_examples/finding_classification_choice/examination_setting_generic_types.yaml +15 -0
  58. endoreg_db/data/_examples/finding_classification_choice/histology.yaml +24 -0
  59. endoreg_db/data/_examples/finding_classification_choice/histology_polyp.yaml +20 -0
  60. endoreg_db/data/_examples/finding_classification_choice/outcome.yaml +19 -0
  61. endoreg_db/data/_examples/finding_classification_choice/yes_no_na.yaml +11 -0
  62. endoreg_db/data/_examples/finding_classification_type/colonoscopy_basic.yaml +48 -0
  63. endoreg_db/data/_examples/finding_intervention/endoscopy.yaml +43 -0
  64. endoreg_db/data/_examples/finding_intervention/endoscopy_colonoscopy.yaml +168 -0
  65. endoreg_db/data/_examples/finding_intervention/endoscopy_egd.yaml +128 -0
  66. endoreg_db/data/_examples/finding_intervention/endoscopy_ercp.yaml +32 -0
  67. endoreg_db/data/_examples/finding_intervention/endoscopy_eus_lower.yaml +9 -0
  68. endoreg_db/data/_examples/finding_intervention/endoscopy_eus_upper.yaml +36 -0
  69. endoreg_db/data/_examples/finding_intervention_type/endoscopy.yaml +15 -0
  70. endoreg_db/data/_examples/finding_type/data.yaml +43 -0
  71. endoreg_db/data/_examples/requirement/age.yaml +26 -0
  72. endoreg_db/data/_examples/requirement/colonoscopy_baseline_austria.yaml +45 -0
  73. endoreg_db/data/_examples/requirement/disease_cardiovascular.yaml +79 -0
  74. endoreg_db/data/_examples/requirement/disease_classification_choice_cardiovascular.yaml +41 -0
  75. endoreg_db/data/_examples/requirement/disease_hepatology.yaml +12 -0
  76. endoreg_db/data/_examples/requirement/disease_misc.yaml +12 -0
  77. endoreg_db/data/_examples/requirement/disease_renal.yaml +96 -0
  78. endoreg_db/data/_examples/requirement/endoscopy_bleeding_risk.yaml +59 -0
  79. endoreg_db/data/_examples/requirement/event_cardiology.yaml +251 -0
  80. endoreg_db/data/_examples/requirement/event_requirements.yaml +145 -0
  81. endoreg_db/data/_examples/requirement/finding_colon_polyp.yaml +50 -0
  82. endoreg_db/data/_examples/requirement/gender.yaml +25 -0
  83. endoreg_db/data/_examples/requirement/lab_value.yaml +441 -0
  84. endoreg_db/data/_examples/requirement/medication.yaml +93 -0
  85. endoreg_db/data/_examples/requirement_operator/age.yaml +13 -0
  86. endoreg_db/data/_examples/requirement_operator/lab_operators.yaml +129 -0
  87. endoreg_db/data/_examples/requirement_operator/model_operators.yaml +96 -0
  88. endoreg_db/data/_examples/requirement_set/01_endoscopy_generic.yaml +48 -0
  89. endoreg_db/data/_examples/requirement_set/colonoscopy_austria_screening.yaml +57 -0
  90. endoreg_db/data/_examples/yaml_examples.xlsx +0 -0
  91. endoreg_db/data/ai_model_meta/default_multilabel_classification.yaml +4 -3
  92. endoreg_db/data/event_classification/data.yaml +4 -0
  93. endoreg_db/data/event_classification_choice/data.yaml +9 -0
  94. endoreg_db/data/finding_classification/colonoscopy_bowel_preparation.yaml +43 -70
  95. endoreg_db/data/finding_classification/colonoscopy_lesion_size.yaml +22 -52
  96. endoreg_db/data/finding_classification/colonoscopy_location.yaml +31 -62
  97. endoreg_db/data/finding_classification/histology_colo.yaml +28 -36
  98. endoreg_db/data/requirement/colon_polyp_intervention.yaml +49 -0
  99. endoreg_db/data/requirement/coloreg_colon_polyp.yaml +49 -0
  100. endoreg_db/data/requirement_set/01_endoscopy_generic.yaml +31 -12
  101. endoreg_db/data/requirement_set/01_laboratory.yaml +13 -0
  102. endoreg_db/data/requirement_set/02_endoscopy_bleeding_risk.yaml +46 -0
  103. endoreg_db/data/requirement_set/90_coloreg.yaml +178 -0
  104. endoreg_db/data/requirement_set/_old_ +109 -0
  105. endoreg_db/data/requirement_set_type/data.yaml +21 -0
  106. endoreg_db/data/setup_config.yaml +4 -4
  107. endoreg_db/data/tag/requirement_set_tags.yaml +21 -0
  108. endoreg_db/exceptions.py +5 -2
  109. endoreg_db/helpers/data_loader.py +1 -1
  110. endoreg_db/management/commands/create_model_meta_from_huggingface.py +21 -10
  111. endoreg_db/management/commands/create_multilabel_model_meta.py +299 -129
  112. endoreg_db/management/commands/import_video.py +9 -10
  113. endoreg_db/management/commands/import_video_with_classification.py +1 -1
  114. endoreg_db/management/commands/init_default_ai_model.py +1 -1
  115. endoreg_db/management/commands/list_routes.py +18 -0
  116. endoreg_db/management/commands/load_ai_model_data.py +2 -1
  117. endoreg_db/management/commands/load_center_data.py +12 -12
  118. endoreg_db/management/commands/load_requirement_data.py +60 -31
  119. endoreg_db/management/commands/load_requirement_set_tags.py +95 -0
  120. endoreg_db/management/commands/setup_endoreg_db.py +14 -10
  121. endoreg_db/management/commands/storage_management.py +271 -203
  122. endoreg_db/migrations/0001_initial.py +1799 -1300
  123. endoreg_db/migrations/0002_requirementset_depends_on.py +18 -0
  124. endoreg_db/migrations/_old/0001_initial.py +1857 -0
  125. endoreg_db/migrations/_old/0004_employee_city_employee_post_code_employee_street_and_more.py +68 -0
  126. endoreg_db/migrations/_old/0004_remove_casetemplate_rules_and_more.py +77 -0
  127. endoreg_db/migrations/_old/0005_merge_20251111_1003.py +14 -0
  128. endoreg_db/migrations/_old/0006_sensitivemeta_anonymized_text_and_more.py +68 -0
  129. endoreg_db/migrations/_old/0007_remove_rule_attribute_dtype_remove_rule_rule_type_and_more.py +89 -0
  130. endoreg_db/migrations/_old/0008_remove_event_event_classification_and_more.py +27 -0
  131. endoreg_db/migrations/_old/0009_alter_modelmeta_options_and_more.py +21 -0
  132. endoreg_db/models/__init__.py +78 -123
  133. endoreg_db/models/administration/__init__.py +21 -42
  134. endoreg_db/models/administration/ai/active_model.py +2 -2
  135. endoreg_db/models/administration/ai/ai_model.py +7 -6
  136. endoreg_db/models/administration/case/__init__.py +1 -15
  137. endoreg_db/models/administration/case/case.py +3 -3
  138. endoreg_db/models/administration/case/case_template/__init__.py +2 -14
  139. endoreg_db/models/administration/case/case_template/case_template.py +2 -124
  140. endoreg_db/models/administration/case/case_template/case_template_rule.py +2 -268
  141. endoreg_db/models/administration/case/case_template/case_template_rule_value.py +2 -85
  142. endoreg_db/models/administration/case/case_template/case_template_type.py +2 -25
  143. endoreg_db/models/administration/center/center.py +33 -19
  144. endoreg_db/models/administration/center/center_product.py +12 -9
  145. endoreg_db/models/administration/center/center_resource.py +25 -19
  146. endoreg_db/models/administration/center/center_shift.py +21 -17
  147. endoreg_db/models/administration/center/center_waste.py +16 -8
  148. endoreg_db/models/administration/person/__init__.py +2 -0
  149. endoreg_db/models/administration/person/employee/employee.py +10 -5
  150. endoreg_db/models/administration/person/employee/employee_qualification.py +9 -4
  151. endoreg_db/models/administration/person/employee/employee_type.py +12 -6
  152. endoreg_db/models/administration/person/examiner/examiner.py +13 -11
  153. endoreg_db/models/administration/person/patient/__init__.py +2 -0
  154. endoreg_db/models/administration/person/patient/patient.py +103 -100
  155. endoreg_db/models/administration/person/patient/patient_external_id.py +37 -0
  156. endoreg_db/models/administration/person/person.py +4 -0
  157. endoreg_db/models/administration/person/profession/__init__.py +8 -4
  158. endoreg_db/models/administration/person/user/portal_user_information.py +11 -7
  159. endoreg_db/models/administration/product/product.py +20 -15
  160. endoreg_db/models/administration/product/product_material.py +17 -18
  161. endoreg_db/models/administration/product/product_weight.py +12 -8
  162. endoreg_db/models/administration/product/reference_product.py +23 -55
  163. endoreg_db/models/administration/qualification/qualification.py +7 -3
  164. endoreg_db/models/administration/qualification/qualification_type.py +7 -3
  165. endoreg_db/models/administration/shift/scheduled_days.py +8 -5
  166. endoreg_db/models/administration/shift/shift.py +16 -12
  167. endoreg_db/models/administration/shift/shift_type.py +23 -31
  168. endoreg_db/models/label/__init__.py +7 -8
  169. endoreg_db/models/label/annotation/image_classification.py +10 -9
  170. endoreg_db/models/label/annotation/video_segmentation_annotation.py +8 -5
  171. endoreg_db/models/label/label.py +15 -15
  172. endoreg_db/models/label/label_set.py +19 -6
  173. endoreg_db/models/label/label_type.py +1 -1
  174. endoreg_db/models/label/label_video_segment/_create_from_video.py +5 -8
  175. endoreg_db/models/label/label_video_segment/label_video_segment.py +76 -102
  176. endoreg_db/models/label/video_segmentation_label.py +4 -0
  177. endoreg_db/models/label/video_segmentation_labelset.py +4 -3
  178. endoreg_db/models/media/frame/frame.py +22 -22
  179. endoreg_db/models/media/pdf/raw_pdf.py +249 -177
  180. endoreg_db/models/media/pdf/report_file.py +25 -29
  181. endoreg_db/models/media/pdf/report_reader/report_reader_config.py +30 -46
  182. endoreg_db/models/media/pdf/report_reader/report_reader_flag.py +23 -7
  183. endoreg_db/models/media/video/__init__.py +1 -0
  184. endoreg_db/models/media/video/create_from_file.py +48 -56
  185. endoreg_db/models/media/video/pipe_1.py +30 -33
  186. endoreg_db/models/media/video/pipe_2.py +8 -9
  187. endoreg_db/models/media/video/video_file.py +359 -204
  188. endoreg_db/models/media/video/video_file_ai.py +288 -74
  189. endoreg_db/models/media/video/video_file_anonymize.py +38 -38
  190. endoreg_db/models/media/video/video_file_frames/__init__.py +3 -1
  191. endoreg_db/models/media/video/video_file_frames/_bulk_create_frames.py +6 -8
  192. endoreg_db/models/media/video/video_file_frames/_create_frame_object.py +7 -9
  193. endoreg_db/models/media/video/video_file_frames/_delete_frames.py +9 -8
  194. endoreg_db/models/media/video/video_file_frames/_extract_frames.py +38 -45
  195. endoreg_db/models/media/video/video_file_frames/_get_frame.py +6 -8
  196. endoreg_db/models/media/video/video_file_frames/_get_frame_number.py +4 -18
  197. endoreg_db/models/media/video/video_file_frames/_get_frame_path.py +4 -3
  198. endoreg_db/models/media/video/video_file_frames/_get_frame_paths.py +7 -6
  199. endoreg_db/models/media/video/video_file_frames/_get_frame_range.py +6 -8
  200. endoreg_db/models/media/video/video_file_frames/_get_frames.py +6 -8
  201. endoreg_db/models/media/video/video_file_frames/_initialize_frames.py +15 -25
  202. endoreg_db/models/media/video/video_file_frames/_manage_frame_range.py +26 -23
  203. endoreg_db/models/media/video/video_file_frames/_mark_frames_extracted_status.py +23 -14
  204. endoreg_db/models/media/video/video_file_io.py +109 -62
  205. endoreg_db/models/media/video/video_file_meta/get_crop_template.py +3 -3
  206. endoreg_db/models/media/video/video_file_meta/get_endo_roi.py +5 -3
  207. endoreg_db/models/media/video/video_file_meta/get_fps.py +37 -34
  208. endoreg_db/models/media/video/video_file_meta/initialize_video_specs.py +19 -25
  209. endoreg_db/models/media/video/video_file_meta/text_meta.py +41 -38
  210. endoreg_db/models/media/video/video_file_meta/video_meta.py +14 -7
  211. endoreg_db/models/media/video/video_file_segments.py +24 -17
  212. endoreg_db/models/media/video/video_metadata.py +19 -35
  213. endoreg_db/models/media/video/video_processing.py +96 -95
  214. endoreg_db/models/medical/contraindication/__init__.py +13 -3
  215. endoreg_db/models/medical/disease.py +22 -16
  216. endoreg_db/models/medical/event.py +31 -18
  217. endoreg_db/models/medical/examination/__init__.py +13 -6
  218. endoreg_db/models/medical/examination/examination.py +17 -18
  219. endoreg_db/models/medical/examination/examination_indication.py +26 -25
  220. endoreg_db/models/medical/examination/examination_time.py +16 -6
  221. endoreg_db/models/medical/examination/examination_time_type.py +9 -6
  222. endoreg_db/models/medical/examination/examination_type.py +3 -4
  223. endoreg_db/models/medical/finding/finding.py +38 -39
  224. endoreg_db/models/medical/finding/finding_classification.py +37 -48
  225. endoreg_db/models/medical/finding/finding_intervention.py +27 -22
  226. endoreg_db/models/medical/finding/finding_type.py +13 -12
  227. endoreg_db/models/medical/hardware/endoscope.py +20 -26
  228. endoreg_db/models/medical/hardware/endoscopy_processor.py +2 -2
  229. endoreg_db/models/medical/laboratory/lab_value.py +62 -91
  230. endoreg_db/models/medical/medication/medication.py +22 -10
  231. endoreg_db/models/medical/medication/medication_indication.py +29 -3
  232. endoreg_db/models/medical/medication/medication_indication_type.py +25 -14
  233. endoreg_db/models/medical/medication/medication_intake_time.py +31 -19
  234. endoreg_db/models/medical/medication/medication_schedule.py +27 -16
  235. endoreg_db/models/medical/organ/__init__.py +15 -12
  236. endoreg_db/models/medical/patient/medication_examples.py +1 -5
  237. endoreg_db/models/medical/patient/patient_disease.py +20 -23
  238. endoreg_db/models/medical/patient/patient_event.py +19 -22
  239. endoreg_db/models/medical/patient/patient_examination.py +48 -54
  240. endoreg_db/models/medical/patient/patient_examination_indication.py +16 -14
  241. endoreg_db/models/medical/patient/patient_finding.py +122 -139
  242. endoreg_db/models/medical/patient/patient_finding_classification.py +44 -49
  243. endoreg_db/models/medical/patient/patient_finding_intervention.py +8 -19
  244. endoreg_db/models/medical/patient/patient_lab_sample.py +28 -23
  245. endoreg_db/models/medical/patient/patient_lab_value.py +82 -89
  246. endoreg_db/models/medical/patient/patient_medication.py +27 -38
  247. endoreg_db/models/medical/patient/patient_medication_schedule.py +28 -36
  248. endoreg_db/models/medical/risk/risk.py +7 -6
  249. endoreg_db/models/medical/risk/risk_type.py +8 -5
  250. endoreg_db/models/metadata/model_meta.py +60 -29
  251. endoreg_db/models/metadata/model_meta_logic.py +139 -18
  252. endoreg_db/models/metadata/pdf_meta.py +19 -24
  253. endoreg_db/models/metadata/sensitive_meta.py +102 -85
  254. endoreg_db/models/metadata/sensitive_meta_logic.py +383 -43
  255. endoreg_db/models/metadata/video_meta.py +51 -31
  256. endoreg_db/models/metadata/video_prediction_logic.py +16 -23
  257. endoreg_db/models/metadata/video_prediction_meta.py +29 -33
  258. endoreg_db/models/other/distribution/date_value_distribution.py +89 -29
  259. endoreg_db/models/other/distribution/multiple_categorical_value_distribution.py +21 -5
  260. endoreg_db/models/other/distribution/numeric_value_distribution.py +114 -53
  261. endoreg_db/models/other/distribution/single_categorical_value_distribution.py +4 -3
  262. endoreg_db/models/other/emission/emission_factor.py +18 -8
  263. endoreg_db/models/other/gender.py +10 -5
  264. endoreg_db/models/other/information_source.py +25 -25
  265. endoreg_db/models/other/material.py +9 -5
  266. endoreg_db/models/other/resource.py +6 -4
  267. endoreg_db/models/other/tag.py +10 -5
  268. endoreg_db/models/other/transport_route.py +13 -8
  269. endoreg_db/models/other/unit.py +10 -6
  270. endoreg_db/models/other/waste.py +6 -5
  271. endoreg_db/models/requirement/requirement.py +580 -272
  272. endoreg_db/models/requirement/requirement_error.py +85 -0
  273. endoreg_db/models/requirement/requirement_evaluation/evaluate_with_dependencies.py +268 -0
  274. endoreg_db/models/requirement/requirement_evaluation/operator_evaluation_models.py +3 -6
  275. endoreg_db/models/requirement/requirement_evaluation/requirement_type_parser.py +90 -64
  276. endoreg_db/models/requirement/requirement_operator.py +36 -33
  277. endoreg_db/models/requirement/requirement_set.py +74 -57
  278. endoreg_db/models/state/__init__.py +4 -4
  279. endoreg_db/models/state/abstract.py +2 -2
  280. endoreg_db/models/state/anonymization.py +12 -0
  281. endoreg_db/models/state/audit_ledger.py +46 -47
  282. endoreg_db/models/state/label_video_segment.py +9 -0
  283. endoreg_db/models/state/raw_pdf.py +40 -46
  284. endoreg_db/models/state/sensitive_meta.py +6 -2
  285. endoreg_db/models/state/video.py +58 -53
  286. endoreg_db/models/upload_job.py +32 -55
  287. endoreg_db/models/utils.py +1 -2
  288. endoreg_db/root_urls.py +21 -2
  289. endoreg_db/serializers/__init__.py +26 -57
  290. endoreg_db/serializers/anonymization.py +18 -10
  291. endoreg_db/serializers/meta/report_meta.py +1 -1
  292. endoreg_db/serializers/meta/sensitive_meta_detail.py +63 -118
  293. endoreg_db/serializers/misc/__init__.py +1 -1
  294. endoreg_db/serializers/misc/file_overview.py +33 -91
  295. endoreg_db/serializers/misc/{vop_patient_data.py → sensitive_patient_data.py} +1 -1
  296. endoreg_db/serializers/requirements/requirement_sets.py +92 -22
  297. endoreg_db/serializers/video/segmentation.py +2 -1
  298. endoreg_db/serializers/video/video_processing_history.py +20 -5
  299. endoreg_db/serializers/video_examination.py +198 -0
  300. endoreg_db/services/anonymization.py +75 -73
  301. endoreg_db/services/lookup_service.py +256 -73
  302. endoreg_db/services/lookup_store.py +174 -30
  303. endoreg_db/services/pdf_import.py +711 -310
  304. endoreg_db/services/storage_aware_video_processor.py +140 -114
  305. endoreg_db/services/video_import.py +266 -117
  306. endoreg_db/urls/__init__.py +27 -27
  307. endoreg_db/urls/label_video_segments.py +2 -0
  308. endoreg_db/urls/media.py +108 -66
  309. endoreg_db/urls/root_urls.py +29 -0
  310. endoreg_db/utils/__init__.py +15 -5
  311. endoreg_db/utils/ai/multilabel_classification_net.py +116 -20
  312. endoreg_db/utils/case_generator/__init__.py +3 -0
  313. endoreg_db/utils/dataloader.py +88 -16
  314. endoreg_db/utils/defaults/set_default_center.py +32 -0
  315. endoreg_db/utils/names.py +22 -16
  316. endoreg_db/utils/permissions.py +2 -1
  317. endoreg_db/utils/pipelines/process_video_dir.py +1 -1
  318. endoreg_db/utils/requirement_operator_logic/model_evaluators.py +414 -127
  319. endoreg_db/utils/setup_config.py +8 -5
  320. endoreg_db/utils/storage.py +115 -0
  321. endoreg_db/utils/validate_endo_roi.py +8 -2
  322. endoreg_db/utils/video/ffmpeg_wrapper.py +184 -188
  323. endoreg_db/views/__init__.py +5 -12
  324. endoreg_db/views/anonymization/media_management.py +198 -163
  325. endoreg_db/views/anonymization/overview.py +4 -1
  326. endoreg_db/views/anonymization/validate.py +174 -40
  327. endoreg_db/views/media/__init__.py +2 -0
  328. endoreg_db/views/media/pdf_media.py +131 -150
  329. endoreg_db/views/media/sensitive_metadata.py +46 -6
  330. endoreg_db/views/media/video_media.py +89 -82
  331. endoreg_db/views/media/video_segments.py +187 -260
  332. endoreg_db/views/meta/sensitive_meta_detail.py +0 -63
  333. endoreg_db/views/patient/patient.py +5 -4
  334. endoreg_db/views/pdf/__init__.py +5 -8
  335. endoreg_db/views/pdf/pdf_stream.py +186 -0
  336. endoreg_db/views/pdf/pdf_stream_views.py +0 -127
  337. endoreg_db/views/pdf/reimport.py +86 -91
  338. endoreg_db/views/requirement/evaluate.py +188 -187
  339. endoreg_db/views/requirement/lookup.py +186 -288
  340. endoreg_db/views/requirement/requirement_utils.py +89 -0
  341. endoreg_db/views/video/__init__.py +0 -4
  342. endoreg_db/views/video/correction.py +2 -2
  343. endoreg_db/views/video/video_examination_viewset.py +202 -289
  344. {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.8.0.dist-info}/METADATA +7 -3
  345. {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.8.0.dist-info}/RECORD +350 -255
  346. endoreg_db/models/administration/permissions/__init__.py +0 -44
  347. endoreg_db/models/media/video/refactor_plan.md +0 -0
  348. endoreg_db/models/media/video/video_file_frames.py +0 -0
  349. endoreg_db/models/metadata/frame_ocr_result.py +0 -0
  350. endoreg_db/models/rule/__init__.py +0 -13
  351. endoreg_db/models/rule/rule.py +0 -27
  352. endoreg_db/models/rule/rule_applicator.py +0 -224
  353. endoreg_db/models/rule/rule_attribute_dtype.py +0 -17
  354. endoreg_db/models/rule/rule_type.py +0 -20
  355. endoreg_db/models/rule/ruleset.py +0 -17
  356. endoreg_db/serializers/video/video_metadata.py +0 -105
  357. endoreg_db/urls/report.py +0 -48
  358. endoreg_db/urls/video.py +0 -61
  359. endoreg_db/utils/case_generator/case_generator.py +0 -159
  360. endoreg_db/utils/case_generator/utils.py +0 -30
  361. endoreg_db/views/pdf/pdf_media.py +0 -239
  362. endoreg_db/views/report/__init__.py +0 -9
  363. endoreg_db/views/report/report_list.py +0 -112
  364. endoreg_db/views/report/report_with_secure_url.py +0 -28
  365. endoreg_db/views/report/start_examination.py +0 -7
  366. endoreg_db/views/video/video_media.py +0 -158
  367. endoreg_db/views.py +0 -0
  368. /endoreg_db/data/{requirement_set → _examples/requirement_set}/endoscopy_bleeding_risk.yaml +0 -0
  369. /endoreg_db/migrations/{0002_add_video_correction_models.py → _old/0002_add_video_correction_models.py} +0 -0
  370. /endoreg_db/migrations/{0003_add_center_display_name.py → _old/0003_add_center_display_name.py} +0 -0
  371. {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.8.0.dist-info}/WHEEL +0 -0
  372. {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,24 +2,31 @@
2
2
  PDF import service module.
3
3
 
4
4
  Provides high-level functions for importing and anonymizing PDF files,
5
- combining RawPdfFile creation with text extraction and anonymization.
5
+ combining RawPdfFile creation with text extraction and anonymization using lx anonymizer.
6
+
7
+ All Fields should be overwritten from anonymizer defaults except for the center which is given.
6
8
  """
7
- from datetime import date, datetime
9
+
8
10
  import errno
11
+ import hashlib
9
12
  import logging
13
+ import os
10
14
  import shutil
11
15
  import sys
12
- import os
13
- import hashlib
16
+ import time
17
+ from contextlib import contextmanager
18
+ from datetime import date, datetime
14
19
  from pathlib import Path
15
20
  from typing import TYPE_CHECKING, Union
16
- from contextlib import contextmanager
21
+ import subprocess
17
22
  from django.db import transaction
23
+ from django.core.exceptions import ObjectDoesNotExist
24
+ import lx_anonymizer
25
+
26
+ from endoreg_db.models import SensitiveMeta
18
27
  from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
19
28
  from endoreg_db.models.state.raw_pdf import RawPdfState
20
- from endoreg_db.models import SensitiveMeta
21
29
  from endoreg_db.utils import paths as path_utils
22
- import time
23
30
 
24
31
  logger = logging.getLogger(__name__)
25
32
 
@@ -34,24 +41,76 @@ class PdfImportService:
34
41
  """
35
42
  Service class for importing and processing PDF files with text extraction and anonymization.
36
43
  Uses a central PDF instance pattern for cleaner state management.
44
+
45
+ Supports two processing modes:
46
+ - 'blackening': Simple PDF masking with black rectangles over sensitive areas
47
+ - 'cropping': Advanced mode that crops sensitive regions to separate images
37
48
  """
38
-
39
- def __init__(self, allow_meta_overwrite: bool = False):
49
+
50
+ def __init__(
51
+ self, allow_meta_overwrite: bool = True, processing_mode: str = "blackening"
52
+ ):
40
53
  """
41
54
  Initialize the PDF import service.
42
-
55
+
43
56
  Args:
44
57
  allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
58
+ processing_mode: Processing mode - 'blackening' for simple masking, 'cropping' for advanced cropping
45
59
  """
46
60
  self.processed_files = set()
47
61
  self._report_reader_available = None
48
62
  self._report_reader_class = None
49
63
  self.allow_meta_overwrite = allow_meta_overwrite
50
-
64
+
65
+ # Validate and set processing mode
66
+ valid_modes = ["blackening", "cropping"]
67
+ if processing_mode not in valid_modes:
68
+ raise ValueError(
69
+ f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}"
70
+ )
71
+ self.processing_mode = processing_mode
72
+
51
73
  # Central PDF instance management
52
74
  self.current_pdf = None
75
+ self.current_pdf_state = None
53
76
  self.processing_context = {}
77
+ self.original_path = None
54
78
 
79
+ self.DEFAULT_PATIENT_FIRST_NAME = "Patient"
80
+ self.DEFAULT_PATIENT_LAST_NAME = "Unknown"
81
+ self.DEFAULT_PATIENT_DOB = date(1990, 1, 1)
82
+ self.DEFAULT_CENTER_NAME = "university_hospital_wuerzburg"
83
+
84
+ @classmethod
85
+ def with_blackening(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
86
+ """
87
+ Create a PdfImportService configured for simple PDF blackening mode.
88
+
89
+ Args:
90
+ allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
91
+
92
+ Returns:
93
+ PdfImportService instance configured for blackening mode
94
+ """
95
+ return cls(
96
+ allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening"
97
+ )
98
+
99
+ @classmethod
100
+ def with_cropping(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
101
+ """
102
+ Create a PdfImportService configured for advanced cropping mode.
103
+
104
+ Args:
105
+ allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
106
+
107
+ Returns:
108
+ PdfImportService instance configured for cropping mode
109
+ """
110
+ return cls(
111
+ allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping"
112
+ )
113
+
55
114
  @contextmanager
56
115
  def _file_lock(self, path: Path):
57
116
  """Create a file lock to prevent duplicate processing.
@@ -77,15 +136,19 @@ class PdfImportService:
77
136
  try:
78
137
  logger.warning(
79
138
  "Stale lock detected for %s (age %.0fs). Reclaiming lock...",
80
- path, age
139
+ path,
140
+ age,
81
141
  )
82
142
  lock_path.unlink()
83
143
  except Exception as e:
84
- logger.warning("Failed to remove stale lock %s: %s", lock_path, e)
144
+ logger.warning(
145
+ "Failed to remove stale lock %s: %s", lock_path, e
146
+ )
85
147
  # retry acquire
86
148
  fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
87
149
  else:
88
150
  # Another worker is processing this file
151
+
89
152
  raise ValueError(f"File already being processed: {path}")
90
153
 
91
154
  os.write(fd, b"lock")
@@ -100,7 +163,7 @@ class PdfImportService:
100
163
  lock_path.unlink()
101
164
  except OSError:
102
165
  pass
103
-
166
+
104
167
  def _sha256(self, path: Path, chunk: int = 1024 * 1024) -> str:
105
168
  """Compute SHA256 hash of a file."""
106
169
  h = hashlib.sha256()
@@ -134,7 +197,7 @@ class PdfImportService:
134
197
  return Path(str(candidate))
135
198
  except Exception:
136
199
  return None
137
-
200
+
138
201
  def _quarantine(self, source: Path) -> Path:
139
202
  """Move file to quarantine directory to prevent re-processing."""
140
203
  qdir = path_utils.PDF_DIR / "_processing"
@@ -149,8 +212,12 @@ class PdfImportService:
149
212
  shutil.move(str(source), str(target))
150
213
  else:
151
214
  raise
215
+ lock_path = Path(str(source) + ".lock")
216
+ if lock_path.exists():
217
+ lock_path.unlink()
218
+
152
219
  return target
153
-
220
+
154
221
  def _ensure_state(self, pdf_file: "RawPdfFile"):
155
222
  """Ensure PDF file has a state object."""
156
223
  if getattr(pdf_file, "state", None):
@@ -158,147 +225,167 @@ class PdfImportService:
158
225
  if hasattr(pdf_file, "get_or_create_state"):
159
226
  state = pdf_file.get_or_create_state()
160
227
  pdf_file.state = state
228
+ self.current_pdf_state = state
229
+ assert isinstance(self.current_pdf_state, RawPdfState)
161
230
  return state
162
- # Very defensive fallback
163
- try:
164
- state, _ = pdf_file.get_or_create_state(raw_pdf_file=pdf_file)
165
- pdf_file.state = state
166
- return state
167
- except Exception:
168
- return None
169
-
231
+
232
+
170
233
  def _ensure_report_reading_available(self):
171
234
  """
172
235
  Ensure report reading modules are available by adding lx-anonymizer to path.
173
-
236
+
174
237
  Returns:
175
238
  Tuple of (availability_flag, ReportReader_class)
176
239
  """
177
240
  if self._report_reader_available is not None:
178
241
  return self._report_reader_available, self._report_reader_class
179
-
242
+
180
243
  try:
181
244
  # Try direct import first
182
245
  from lx_anonymizer import ReportReader
183
-
246
+
184
247
  logger.info("Successfully imported lx_anonymizer ReportReader module")
185
248
  self._report_reader_available = True
186
249
  self._report_reader_class = ReportReader
187
250
  return True, ReportReader
188
-
251
+
189
252
  except ImportError:
190
253
  # Optional: honor LX_ANONYMIZER_PATH=/abs/path/to/src
191
254
  import importlib
255
+
192
256
  extra = os.getenv("LX_ANONYMIZER_PATH")
193
257
  if extra and extra not in sys.path and Path(extra).exists():
194
258
  sys.path.insert(0, extra)
195
259
  try:
196
260
  mod = importlib.import_module("lx_anonymizer")
197
261
  ReportReader = getattr(mod, "ReportReader")
198
- logger.info("Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH")
262
+ logger.info(
263
+ "Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH"
264
+ )
199
265
  self._report_reader_available = True
200
266
  self._report_reader_class = ReportReader
201
267
  return True, ReportReader
202
268
  except Exception as e:
203
- logger.warning("Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e)
269
+ logger.warning(
270
+ "Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e
271
+ )
204
272
  finally:
205
273
  # Keep path for future imports if it worked; otherwise remove.
206
274
  if "ReportReader" not in locals() and extra in sys.path:
207
275
  sys.path.remove(extra)
208
-
276
+
209
277
  self._report_reader_available = False
210
278
  self._report_reader_class = None
211
279
  return False, None
212
280
 
213
-
214
- def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile" = None) -> None:
281
+ def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile") -> None:
215
282
  """
216
283
  Ensure PDF has minimum required patient data in SensitiveMeta.
217
284
  Creates default values if data is missing after text processing.
218
285
  Uses the central PDF instance if no specific instance provided.
219
-
286
+
220
287
  Args:
221
288
  pdf_instance: Optional specific PDF instance, defaults to self.current_pdf
222
289
  """
223
290
  pdf_file = pdf_instance or self.current_pdf
224
291
  if not pdf_file:
225
- logger.warning("No PDF instance available for ensuring default patient data")
292
+ logger.warning(
293
+ "No PDF instance available for ensuring default patient data"
294
+ )
226
295
  return
227
-
296
+
228
297
  if not pdf_file.sensitive_meta:
229
- logger.info(f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default")
230
-
298
+ logger.info(
299
+ f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default"
300
+ )
301
+
231
302
  # Create default SensitiveMeta with placeholder data
232
303
  default_data = {
233
- "patient_first_name": "Patient",
234
- "patient_last_name": "Unknown",
235
- "patient_dob": date(1990, 1, 1), # Default DOB
236
- "examination_date": date.today(),
237
- "center_name": pdf_file.center.name if pdf_file.center else "university_hospital_wuerzburg"
304
+ "patient_first_name": self.DEFAULT_PATIENT_FIRST_NAME,
305
+ "patient_last_name": self.DEFAULT_PATIENT_LAST_NAME,
306
+ "patient_dob": self.DEFAULT_PATIENT_DOB,
307
+ "examination_date": date.today(), # today is intentionally *not* a constant
308
+ "center_name": (
309
+ pdf_file.center.name
310
+ if pdf_file.center
311
+ else self.DEFAULT_CENTER_NAME
312
+ ),
238
313
  }
239
-
314
+
315
+
240
316
  try:
241
317
  sensitive_meta = SensitiveMeta.create_from_dict(default_data)
242
318
  pdf_file.sensitive_meta = sensitive_meta
243
- pdf_file.save(update_fields=['sensitive_meta'])
244
- logger.info(f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}")
319
+ pdf_file.save(update_fields=["sensitive_meta"])
320
+ logger.info(
321
+ f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}"
322
+ )
245
323
  except Exception as e:
246
- logger.error(f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}")
324
+ logger.error(
325
+ f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}"
326
+ )
247
327
 
248
328
  def import_and_anonymize(
249
- self,
250
- file_path: Union[Path, str],
251
- center_name: str,
329
+ self,
330
+ file_path: Union[Path, str],
331
+ center_name: str,
252
332
  delete_source: bool = False,
253
333
  retry: bool = False,
254
- ) -> "RawPdfFile":
334
+ ) -> "RawPdfFile | None":
255
335
  """
256
336
  Import a PDF file and anonymize it using ReportReader.
257
337
  Uses centralized PDF instance management pattern.
258
-
338
+
339
+ The processing mode is determined by the service initialization:
340
+ - 'blackening': Creates an anonymized PDF with black rectangles over sensitive regions
341
+ - 'cropping': Advanced mode that crops sensitive regions to separate images
342
+
259
343
  Args:
260
344
  file_path: Path to the PDF file to import
261
345
  center_name: Name of the center to associate with PDF
262
346
  delete_source: Whether to delete the source file after import
263
347
  retry: Whether this is a retry attempt
264
-
348
+
265
349
  Returns:
266
350
  RawPdfFile instance after import and processing
267
-
351
+
268
352
  Raises:
269
353
  Exception: On any failure during import or processing
270
354
  """
271
355
  try:
272
356
  # Initialize processing context
273
- self._initialize_processing_context(file_path, center_name, delete_source, retry)
274
-
357
+ self._initialize_processing_context(
358
+ file_path, center_name, delete_source, retry
359
+ )
360
+
275
361
  # Step 1: Validate and prepare file
276
362
  self._validate_and_prepare_file()
277
-
363
+
278
364
  # Step 2: Create or retrieve PDF instance
279
365
  self._create_or_retrieve_pdf_instance()
280
-
366
+
281
367
  # Early return check - if no PDF instance was created, return None
282
368
  if not self.current_pdf:
283
- logger.warning(f"No PDF instance created for {file_path}, returning None")
284
- return None
285
-
369
+ logger.warning(
370
+ f"No PDF instance created for {file_path}, returning None"
371
+ )
372
+ raise ObjectDoesNotExist
286
373
  # Step 3: Setup processing environment
287
374
  self._setup_processing_environment()
288
-
375
+
289
376
  # Step 4: Process text and metadata
290
377
  self._process_text_and_metadata()
291
-
378
+
292
379
  # Step 5: Finalize processing
293
380
  self._finalize_processing()
294
-
381
+
295
382
  return self.current_pdf
296
-
383
+
297
384
  except ValueError as e:
298
385
  # Handle "File already being processed" case specifically
299
386
  if "already being processed" in str(e):
300
387
  logger.info(f"Skipping file {file_path}: {e}")
301
- return None
388
+ return
302
389
  else:
303
390
  logger.error(f"PDF import failed for {file_path}: {e}")
304
391
  self._cleanup_on_error()
@@ -312,50 +399,58 @@ class PdfImportService:
312
399
  # Always cleanup context
313
400
  self._cleanup_processing_context()
314
401
 
315
- def _initialize_processing_context(self, file_path: Union[Path, str], center_name: str,
316
- delete_source: bool, retry: bool):
402
+ def _initialize_processing_context(
403
+ self,
404
+ file_path: Union[Path, str],
405
+ center_name: str,
406
+ delete_source: bool,
407
+ retry: bool,
408
+ ):
317
409
  """Initialize the processing context for the current PDF."""
318
410
  self.processing_context = {
319
- 'file_path': Path(file_path),
320
- 'original_file_path': Path(file_path),
321
- 'center_name': center_name,
322
- 'delete_source': delete_source,
323
- 'retry': retry,
324
- 'file_hash': None,
325
- 'processing_started': False,
326
- 'text_extracted': False,
327
- 'metadata_processed': False,
328
- 'anonymization_completed': False
411
+ "file_path": Path(file_path),
412
+ "original_file_path": Path(file_path),
413
+ "center_name": center_name,
414
+ "delete_source": delete_source,
415
+ "retry": retry,
416
+ "file_hash": None,
417
+ "processing_started": False,
418
+ "text_extracted": False,
419
+ "metadata_processed": False,
420
+ "anonymization_completed": False,
329
421
  }
330
-
422
+ self.original_path = Path(file_path)
423
+
331
424
  # Check if already processed (only during current session to prevent race conditions)
332
425
  if str(file_path) in self.processed_files:
333
- logger.info(f"File {file_path} already being processed in current session, skipping")
426
+ logger.info(
427
+ f"File {file_path} already being processed in current session, skipping"
428
+ )
334
429
  raise ValueError("File already being processed")
335
-
430
+
336
431
  logger.info(f"Starting import and processing for: {file_path}")
337
432
 
338
433
  def _validate_and_prepare_file(self):
339
434
  """Validate file existence and calculate hash."""
340
- file_path = self.processing_context['file_path']
341
-
435
+ file_path = self.processing_context["file_path"]
436
+
342
437
  if not file_path.exists():
343
438
  raise FileNotFoundError(f"PDF file not found: {file_path}")
344
-
439
+
345
440
  try:
346
- self.processing_context['file_hash'] = self._sha256(file_path)
441
+ self.processing_context["file_hash"] = self._sha256(file_path)
347
442
  except Exception as e:
348
443
  logger.warning(f"Could not calculate file hash: {e}")
349
- self.processing_context['file_hash'] = None
444
+ self.processing_context["file_hash"] = None
350
445
 
351
446
  def _create_or_retrieve_pdf_instance(self):
352
447
  """Create new or retrieve existing PDF instance."""
353
- file_path = self.processing_context['file_path']
354
- center_name = self.processing_context['center_name']
355
- delete_source = self.processing_context['delete_source']
356
- retry = self.processing_context['retry']
357
- file_hash = self.processing_context['file_hash']
358
-
448
+ file_path = self.processing_context["file_path"]
449
+ center_name = self.processing_context["center_name"]
450
+ delete_source = self.processing_context["delete_source"]
451
+ retry = self.processing_context["retry"]
452
+ file_hash = self.processing_context["file_hash"]
453
+
359
454
  if not retry:
360
455
  # Check for existing PDF and handle duplicates
361
456
  with self._file_lock(file_path):
@@ -366,18 +461,20 @@ class PdfImportService:
366
461
  if existing:
367
462
  logger.info(f"Found existing RawPdfFile {existing.pdf_hash}")
368
463
  if existing.text:
369
- logger.info(f"Existing PDF {existing.pdf_hash} already processed - returning")
464
+ logger.info(
465
+ f"Existing PDF {existing.pdf_hash} already processed - returning"
466
+ )
370
467
  self.current_pdf = existing
371
468
  return
372
469
  else:
373
470
  # Retry processing
374
471
  logger.info(f"Reprocessing existing PDF {existing.pdf_hash}")
375
472
  return self._retry_existing_pdf(existing)
376
-
473
+
377
474
  # Create new PDF instance
378
475
  logger.info("Creating new RawPdfFile instance...")
379
476
  from django.db import IntegrityError
380
-
477
+
381
478
  try:
382
479
  if not retry:
383
480
  self.current_pdf = RawPdfFile.create_from_file_initialized(
@@ -388,18 +485,22 @@ class PdfImportService:
388
485
  else:
389
486
  # Retrieve existing for retry
390
487
  self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
391
- logger.info(f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}")
392
-
488
+ logger.info(
489
+ f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}"
490
+ )
491
+
393
492
  # Check if retry is actually needed
394
493
  if self.current_pdf.text:
395
- logger.info(f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning")
494
+ logger.info(
495
+ f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning"
496
+ )
396
497
  return
397
-
498
+
398
499
  if not self.current_pdf:
399
500
  raise RuntimeError("Failed to create RawPdfFile instance")
400
-
501
+
401
502
  logger.info(f"PDF instance ready: {self.current_pdf.pdf_hash}")
402
-
503
+
403
504
  except IntegrityError:
404
505
  # Race condition - another worker created it
405
506
  if file_hash:
@@ -410,111 +511,198 @@ class PdfImportService:
410
511
 
411
512
  def _setup_processing_environment(self):
412
513
  """Setup processing environment and state."""
413
- original_path = self.processing_context.get('file_path')
414
-
514
+ original_path = self.processing_context.get("file_path")
515
+ if not original_path or not self.current_pdf:
516
+ try:
517
+ self.current_pdf = RawPdfFile.objects.get(pdf_hash=self.processing_context["file_hash"])
518
+ self.original_path = Path(str(self.current_pdf.file.path))
519
+
520
+ except RawPdfFile.DoesNotExist:
521
+ raise RuntimeError("Processing environment setup failed")
415
522
  # Create sensitive file copy
523
+ if original_path is None or not isinstance(original_path, (str, Path)):
524
+ logger.error(f"No original path: {original_path!r}")
525
+ return
416
526
  self.create_sensitive_file(self.current_pdf, original_path)
417
-
527
+
418
528
  # Update file path to point to sensitive copy
419
- self.processing_context['file_path'] = self.current_pdf.file.path
420
- self.processing_context['sensitive_copy_created'] = True
529
+ self.processing_context["file_path"] = self.current_pdf.file.path
530
+ self.processing_context["sensitive_copy_created"] = True
421
531
  try:
422
- self.processing_context['sensitive_file_path'] = Path(self.current_pdf.file.path)
532
+ self.processing_context["sensitive_file_path"] = Path(
533
+ self.current_pdf.file.path
534
+ )
423
535
  except Exception:
424
- self.processing_context['sensitive_file_path'] = None
425
-
536
+ self.processing_context["sensitive_file_path"] = None
537
+
426
538
  # Ensure state exists
427
539
  state = self.current_pdf.get_or_create_state()
428
540
  state.mark_processing_started()
429
- self.processing_context['processing_started'] = True
430
-
541
+ self.processing_context["processing_started"] = True
542
+
431
543
  # Mark as processed to prevent duplicates
432
- self.processed_files.add(str(self.processing_context['file_path']))
433
-
544
+ self.processed_files.add(str(self.processing_context["file_path"]))
545
+
434
546
  # Ensure default patient data
435
547
  logger.info("Ensuring default patient data...")
436
548
  self._ensure_default_patient_data(self.current_pdf)
437
549
 
438
550
  def _process_text_and_metadata(self):
439
551
  """Process text extraction and metadata using ReportReader."""
440
- report_reading_available, ReportReader = self._ensure_report_reading_available()
441
-
552
+ report_reading_available, ReportReaderCls = self._ensure_report_reading_available()
553
+ try:
554
+ assert ReportReaderCls is not None and report_reading_available
555
+ assert self.current_pdf is not None
556
+ except AssertionError as e:
557
+ logger.error(f"PDF Import failed on Error:{e} Ensure the pdf was passed correctly and report reading is available in function _process_text_and_metadata() ")
442
558
  if not report_reading_available:
443
559
  logger.warning("Report reading not available (lx_anonymizer not found)")
444
560
  self._mark_processing_incomplete("no_report_reader")
445
- return
446
-
561
+ return
562
+ assert self.current_pdf is not None
447
563
  if not self.current_pdf.file:
448
564
  logger.warning("No file available for text processing")
449
565
  self._mark_processing_incomplete("no_file")
450
566
  return
451
-
567
+
452
568
  try:
453
- logger.info("Starting text extraction and metadata processing with ReportReader...")
454
-
455
- # Setup output directories
456
- crops_dir = path_utils.PDF_DIR / 'cropped_regions'
457
- anonymized_dir = path_utils.PDF_DIR / 'anonymized'
458
- crops_dir.mkdir(parents=True, exist_ok=True)
459
- anonymized_dir.mkdir(parents=True, exist_ok=True)
569
+ logger.info(
570
+ f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})..."
571
+ )
572
+ ReportReaderCls = lx_anonymizer.ReportReader
460
573
 
461
574
  # Initialize ReportReader
462
- report_reader = ReportReader(
575
+ report_reader = ReportReaderCls(
463
576
  report_root_path=str(path_utils.STORAGE_DIR),
464
577
  locale="de_DE",
465
- text_date_format="%d.%m.%Y"
578
+ text_date_format="%d.%m.%Y",
466
579
  )
467
580
 
468
- # Process with cropping
469
- original_text, anonymized_text, extracted_metadata, cropped_regions, anonymized_pdf_path = report_reader.process_report_with_cropping(
470
- pdf_path=self.processing_context['file_path'],
471
- crop_sensitive_regions=True,
472
- crop_output_dir=str(crops_dir),
473
- anonymization_output_dir=str(anonymized_dir)
474
- )
475
-
476
- # Store results in context
477
- self.processing_context.update({
478
- 'original_text': original_text,
479
- 'anonymized_text': anonymized_text,
480
- 'extracted_metadata': extracted_metadata,
481
- 'cropped_regions': cropped_regions,
482
- 'anonymized_pdf_path': anonymized_pdf_path
483
- })
484
-
485
- if original_text:
486
- self._apply_text_results()
487
- self.processing_context['text_extracted'] = True
488
-
489
- if extracted_metadata:
490
- self._apply_metadata_results()
491
- self.processing_context['metadata_processed'] = True
492
-
493
- if anonymized_pdf_path:
494
- self._apply_anonymized_pdf()
495
- self.processing_context['anonymization_completed'] = True
496
-
581
+ if self.processing_mode == "cropping":
582
+ # Use advanced cropping method (existing implementation)
583
+ self._process_with_cropping(report_reader)
584
+ else: # blackening mode
585
+ # Use enhanced process_report with PDF masking
586
+ self._process_with_blackening(report_reader)
587
+
497
588
  except Exception as e:
498
589
  logger.warning(f"Text processing failed: {e}")
499
590
  self._mark_processing_incomplete("text_processing_failed")
500
591
 
592
+ def _process_with_blackening(self, report_reader):
593
+ """Process PDF using simple blackening/masking mode."""
594
+ logger.info("Using simple PDF blackening mode...")
595
+
596
+ # Setup anonymized directory
597
+ anonymized_dir = path_utils.PDF_DIR / "anonymized"
598
+ anonymized_dir.mkdir(parents=True, exist_ok=True)
599
+ assert self.current_pdf is not None
600
+ # Generate output path for anonymized PDF
601
+ pdf_hash = self.current_pdf.pdf_hash
602
+ anonymized_output_path = anonymized_dir / f"{pdf_hash}_anonymized.pdf"
603
+
604
+ # Process with enhanced process_report method (returns 4-tuple now)
605
+ original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = (
606
+ report_reader.process_report(
607
+ pdf_path=self.processing_context["file_path"],
608
+ create_anonymized_pdf=True,
609
+ anonymized_pdf_output_path=str(anonymized_output_path),
610
+ )
611
+ )
612
+
613
+ # Store results in context
614
+ self.processing_context.update(
615
+ {
616
+ "original_text": original_text,
617
+ "anonymized_text": anonymized_text,
618
+ "extracted_metadata": extracted_metadata,
619
+ "cropped_regions": None, # Not available in blackening mode
620
+ "anonymized_pdf_path": anonymized_pdf_path,
621
+ }
622
+ )
623
+
624
+ # Apply results
625
+ if original_text:
626
+ self._apply_text_results()
627
+ self.processing_context["text_extracted"] = True
628
+
629
+ if extracted_metadata:
630
+ self._apply_metadata_results()
631
+ self.processing_context["metadata_processed"] = True
632
+
633
+ if anonymized_pdf_path:
634
+ self._apply_anonymized_pdf()
635
+ self.processing_context["anonymization_completed"] = True
636
+
637
+ logger.info("PDF blackening processing completed")
638
+
639
+ def _process_with_cropping(self, report_reader):
640
+ """Process PDF using advanced cropping mode (existing implementation)."""
641
+ logger.info("Using advanced cropping mode...")
642
+
643
+ # Setup output directories
644
+ crops_dir = path_utils.PDF_DIR / "cropped_regions"
645
+ anonymized_dir = path_utils.PDF_DIR / "anonymized"
646
+ crops_dir.mkdir(parents=True, exist_ok=True)
647
+ anonymized_dir.mkdir(parents=True, exist_ok=True)
648
+
649
+ # Process with cropping (returns 5-tuple)
650
+ (
651
+ original_text,
652
+ anonymized_text,
653
+ extracted_metadata,
654
+ cropped_regions,
655
+ anonymized_pdf_path,
656
+ ) = report_reader.process_report_with_cropping(
657
+ pdf_path=self.processing_context["file_path"],
658
+ crop_sensitive_regions=True,
659
+ crop_output_dir=str(crops_dir),
660
+ anonymization_output_dir=str(anonymized_dir),
661
+ )
662
+
663
+ # Store results in context
664
+ self.processing_context.update(
665
+ {
666
+ "original_text": original_text,
667
+ "anonymized_text": anonymized_text,
668
+ "extracted_metadata": extracted_metadata,
669
+ "cropped_regions": cropped_regions,
670
+ "anonymized_pdf_path": anonymized_pdf_path,
671
+ }
672
+ )
673
+
674
+ # Apply results
675
+ if original_text:
676
+ self._apply_text_results()
677
+ self.processing_context["text_extracted"] = True
678
+
679
+ if extracted_metadata:
680
+ self._apply_metadata_results()
681
+ self.processing_context["metadata_processed"] = True
682
+
683
+ if anonymized_pdf_path:
684
+ self._apply_anonymized_pdf()
685
+ self.processing_context["anonymization_completed"] = True
686
+
687
+ logger.info("PDF cropping processing completed")
688
+
501
689
  def _apply_text_results(self):
502
690
  """Apply text extraction results to the PDF instance."""
503
691
  if not self.current_pdf:
504
692
  logger.warning("Cannot apply text results - no PDF instance available")
505
693
  return
506
-
507
- original_text = self.processing_context.get('original_text')
508
- anonymized_text = self.processing_context.get('anonymized_text')
509
-
694
+
695
+ original_text = self.processing_context.get("original_text")
696
+ anonymized_text = self.processing_context.get("anonymized_text")
697
+
510
698
  if not original_text:
511
699
  logger.warning("No original text available to apply")
512
700
  return
513
-
701
+
514
702
  # Store extracted text
515
703
  self.current_pdf.text = original_text
516
704
  logger.info(f"Extracted {len(original_text)} characters of text from PDF")
517
-
705
+
518
706
  # Handle anonymized text
519
707
  if anonymized_text and anonymized_text != original_text:
520
708
  self.current_pdf.anonymized = True
@@ -525,56 +713,57 @@ class PdfImportService:
525
713
  if not self.current_pdf:
526
714
  logger.warning("Cannot apply metadata results - no PDF instance available")
527
715
  return
528
-
529
- extracted_metadata = self.processing_context.get('extracted_metadata')
530
-
716
+
717
+ extracted_metadata = self.processing_context.get("extracted_metadata")
718
+
531
719
  if not self.current_pdf.sensitive_meta or not extracted_metadata:
532
720
  logger.debug("No sensitive meta or extracted metadata available")
533
721
  return
534
-
722
+
535
723
  sm = self.current_pdf.sensitive_meta
536
-
724
+
537
725
  # Map ReportReader metadata to SensitiveMeta fields
538
726
  metadata_mapping = {
539
- 'patient_first_name': 'patient_first_name',
540
- 'patient_last_name': 'patient_last_name',
541
- 'patient_dob': 'patient_dob',
542
- 'examination_date': 'examination_date',
543
- 'examiner_first_name': 'examiner_first_name',
544
- 'examiner_last_name': 'examiner_last_name',
545
- 'endoscope_type': 'endoscope_type',
546
- 'casenumber': 'case_number'
727
+ "patient_first_name": "patient_first_name",
728
+ "patient_last_name": "patient_last_name",
729
+ "patient_dob": "patient_dob",
730
+ "examination_date": "examination_date",
731
+ "examiner_first_name": "examiner_first_name",
732
+ "examiner_last_name": "examiner_last_name",
733
+ "endoscope_type": "endoscope_type",
734
+ "casenumber": "casenumber",
735
+ "center_name": "center_name",
547
736
  }
548
-
737
+
549
738
  # Update fields with extracted information
550
739
  updated_fields = []
551
740
  for meta_key, sm_field in metadata_mapping.items():
552
741
  if extracted_metadata.get(meta_key) and hasattr(sm, sm_field):
553
742
  old_value = getattr(sm, sm_field)
554
743
  raw_value = extracted_metadata[meta_key]
555
-
744
+
556
745
  # Skip if we just got the field name as a string (indicates no actual data)
557
746
  if isinstance(raw_value, str) and raw_value == meta_key:
558
747
  continue
559
-
748
+
560
749
  # Handle date fields specially
561
- if sm_field in ['patient_dob', 'examination_date']:
750
+ if sm_field in ["patient_dob", "examination_date"]:
562
751
  new_value = self._parse_date_field(raw_value, meta_key, sm_field)
563
752
  if new_value is None:
564
753
  continue
565
754
  else:
566
755
  new_value = raw_value
567
-
756
+
568
757
  # Configurable overwrite policy
569
758
  should_overwrite = (
570
759
  self.allow_meta_overwrite
571
- or not old_value
572
- or old_value in ['Patient', 'Unknown']
760
+ or self._is_placeholder_value(sm_field, old_value)
573
761
  )
762
+
574
763
  if new_value and should_overwrite:
575
764
  setattr(sm, sm_field, new_value)
576
765
  updated_fields.append(sm_field)
577
-
766
+
578
767
  if updated_fields:
579
768
  sm.save()
580
769
  logger.info(f"Updated SensitiveMeta fields: {updated_fields}")
@@ -587,26 +776,29 @@ class PdfImportService:
587
776
  if raw_value == meta_key:
588
777
  logger.warning(
589
778
  "Skipping date field %s - got field name '%s' instead of actual date",
590
- sm_field, raw_value
779
+ sm_field,
780
+ raw_value,
591
781
  )
592
782
  return None
593
-
783
+
594
784
  # Try common date formats
595
- date_formats = ['%Y-%m-%d', '%d.%m.%Y', '%d/%m/%Y', '%m/%d/%Y']
785
+ date_formats = ["%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y"]
596
786
  for fmt in date_formats:
597
787
  try:
598
788
  return datetime.strptime(raw_value, fmt).date()
599
789
  except ValueError:
600
790
  continue
601
-
602
- logger.warning("Could not parse date '%s' for field %s", raw_value, sm_field)
791
+
792
+ logger.warning(
793
+ "Could not parse date '%s' for field %s", raw_value, sm_field
794
+ )
603
795
  return None
604
-
605
- elif hasattr(raw_value, 'date'):
796
+
797
+ elif hasattr(raw_value, "date"):
606
798
  return raw_value.date()
607
799
  else:
608
800
  return raw_value
609
-
801
+
610
802
  except (ValueError, AttributeError) as e:
611
803
  logger.warning("Date parsing failed for %s: %s", sm_field, e)
612
804
  return None
@@ -626,14 +818,17 @@ class PdfImportService:
626
818
  logger.warning("Cannot apply anonymized PDF - no PDF instance available")
627
819
  return
628
820
 
629
- anonymized_pdf_path = self.processing_context.get('anonymized_pdf_path')
821
+ anonymized_pdf_path = self.processing_context.get("anonymized_pdf_path")
630
822
  if not anonymized_pdf_path:
631
823
  logger.debug("No anonymized_pdf_path present in processing context")
632
824
  return
633
825
 
634
826
  anonymized_path = Path(anonymized_pdf_path)
635
827
  if not anonymized_path.exists():
636
- logger.warning("Anonymized PDF path returned but file does not exist: %s", anonymized_path)
828
+ logger.warning(
829
+ "Anonymized PDF path returned but file does not exist: %s",
830
+ anonymized_path,
831
+ )
637
832
  return
638
833
 
639
834
  logger.info("Anonymized PDF created by ReportReader at: %s", anonymized_path)
@@ -647,7 +842,7 @@ class PdfImportService:
647
842
  relative_name = str(anonymized_path)
648
843
 
649
844
  # Only update if something actually changed
650
- if getattr(self.current_pdf.anonymized_file, 'name', None) != relative_name:
845
+ if getattr(self.current_pdf.anonymized_file, "name", None) != relative_name:
651
846
  self.current_pdf.anonymized_file.name = relative_name
652
847
 
653
848
  # Ensure model/state reflect anonymization even if text didn't differ
@@ -656,46 +851,59 @@ class PdfImportService:
656
851
 
657
852
  # Persist cropped regions info somewhere useful (optional & non-breaking)
658
853
  # If your model has a field for this, persist there; otherwise we just log.
659
- cropped_regions = self.processing_context.get('cropped_regions')
854
+ cropped_regions = self.processing_context.get("cropped_regions")
660
855
  if cropped_regions:
661
- logger.debug("Cropped regions recorded (%d regions).", len(cropped_regions))
856
+ logger.debug(
857
+ "Cropped regions recorded (%d regions).", len(cropped_regions)
858
+ )
662
859
 
663
860
  # Save model changes
664
- update_fields = ['anonymized_file']
665
- if 'anonymized' in self.current_pdf.__dict__:
666
- update_fields.append('anonymized')
861
+ update_fields = ["anonymized_file"]
862
+ if "anonymized" in self.current_pdf.__dict__:
863
+ update_fields.append("anonymized")
667
864
  self.current_pdf.save(update_fields=update_fields)
668
865
 
669
866
  # Mark state as anonymized immediately; this keeps downstream flows working
670
867
  state = self._ensure_state(self.current_pdf)
671
- if state and not state.anonymized:
672
- state.mark_anonymized(save=True)
868
+
869
+ if state and not state.processing_started:
870
+ state.mark_processing_started()
673
871
 
674
- logger.info("Updated anonymized_file reference to: %s", self.current_pdf.anonymized_file.name)
872
+ logger.info(
873
+ "Updated anonymized_file reference to: %s",
874
+ self.current_pdf.anonymized_file.name,
875
+ )
675
876
 
676
877
  except Exception as e:
677
878
  logger.warning("Could not set anonymized file reference: %s", e)
678
879
 
679
-
680
880
  def _finalize_processing(self):
681
881
  """Finalize processing and update state."""
682
882
  if not self.current_pdf:
683
883
  logger.warning("Cannot finalize processing - no PDF instance available")
684
884
  return
685
-
885
+
686
886
  try:
687
887
  # Update state based on processing results
688
888
  state = self._ensure_state(self.current_pdf)
689
-
690
- if self.processing_context.get('text_extracted') and state:
889
+
890
+ if self.processing_context.get("text_extracted") and state:
691
891
  state.mark_anonymized()
692
-
892
+
893
+ # Mark as ready for validation after successful anonymization
894
+ if self.processing_context.get("anonymization_completed") and state:
895
+ state.mark_sensitive_meta_processed()
896
+ logger.info(
897
+ f"PDF {self.current_pdf.pdf_hash} processing completed - "
898
+ f"ready for validation (status: {state.anonymization_status})"
899
+ )
900
+
693
901
  # Save all changes
694
902
  with transaction.atomic():
695
903
  self.current_pdf.save()
696
904
  if state:
697
905
  state.save()
698
-
906
+
699
907
  logger.info("PDF processing completed successfully")
700
908
  except Exception as e:
701
909
  logger.warning(f"Failed to finalize processing: {e}")
@@ -703,9 +911,11 @@ class PdfImportService:
703
911
  def _mark_processing_incomplete(self, reason: str):
704
912
  """Mark processing as incomplete with reason."""
705
913
  if not self.current_pdf:
706
- logger.warning(f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}")
914
+ logger.warning(
915
+ f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}"
916
+ )
707
917
  return
708
-
918
+
709
919
  try:
710
920
  state = self._ensure_state(self.current_pdf)
711
921
  if state:
@@ -714,7 +924,7 @@ class PdfImportService:
714
924
  state.sensitive_meta_processed = False
715
925
  state.save()
716
926
  logger.info(f"Set PDF state: processed=False due to {reason}")
717
-
927
+
718
928
  # Save changes
719
929
  with transaction.atomic():
720
930
  self.current_pdf.save()
@@ -722,41 +932,114 @@ class PdfImportService:
722
932
  logger.warning(f"Failed to mark processing incomplete: {e}")
723
933
 
724
934
  def _retry_existing_pdf(self, existing_pdf):
725
- """Retry processing for existing PDF."""
935
+ """
936
+ Retry processing for existing PDF.
937
+
938
+ Uses get_raw_file_path() to find the original raw file instead of
939
+ relying on the file field which may point to a deleted sensitive file.
940
+ """
726
941
  try:
942
+ # ✅ FIX: Use get_raw_file_path() to find original file
943
+ raw_file_path = existing_pdf.get_raw_file_path()
944
+
945
+ if not raw_file_path or not raw_file_path.exists():
946
+ logger.error(
947
+ f"Cannot retry PDF {existing_pdf.pdf_hash}: Raw file not found. "
948
+ f"Please re-upload the original PDF file."
949
+ )
950
+ self.current_pdf = existing_pdf
951
+ return existing_pdf
952
+
953
+ logger.info(f"Found raw file for retry at: {raw_file_path}")
954
+
727
955
  # Remove from processed files to allow retry
728
- file_path_str = str(existing_pdf.file.path) if existing_pdf.file else None
729
- if file_path_str and file_path_str in self.processed_files:
956
+ file_path_str = str(raw_file_path)
957
+ if file_path_str in self.processed_files:
730
958
  self.processed_files.remove(file_path_str)
731
959
  logger.debug(f"Removed {file_path_str} from processed files for retry")
732
-
960
+
733
961
  return self.import_and_anonymize(
734
- file_path=existing_pdf.file.path,
735
- center_name=existing_pdf.center.name if existing_pdf.center else "unknown_center",
736
- delete_source=False,
737
- retry=True
962
+ file_path=raw_file_path, # ✅ Use raw file path, not sensitive path
963
+ center_name=existing_pdf.center.name
964
+ if existing_pdf.center
965
+ else "unknown_center",
966
+ delete_source=False, # Never delete during retry
967
+ retry=True,
738
968
  )
739
969
  except Exception as e:
740
- logger.error(f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}")
970
+ logger.error(
971
+ f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}"
972
+ )
741
973
  self.current_pdf = existing_pdf
742
974
  return existing_pdf
743
975
 
744
976
  def _cleanup_on_error(self):
745
977
  """Cleanup processing context on error."""
978
+ original_path = self.original_path
746
979
  try:
747
- if self.current_pdf and hasattr(self.current_pdf, 'state'):
980
+ if self.current_pdf and hasattr(self.current_pdf, "state"):
748
981
  state = self._ensure_state(self.current_pdf)
749
- if state and self.processing_context.get('processing_started'):
982
+ raw_file_path = self.current_pdf.get_raw_file_path()
983
+ if raw_file_path is not None and original_path is not None:
984
+ # Ensure reprocessing for next attempt by restoring original file
985
+ shutil.copy2(str(raw_file_path), str(original_path))
986
+
987
+ # Ensure no two files can remain
988
+ if raw_file_path == original_path and raw_file_path is not None and original_path is not None:
989
+ os.remove(str(raw_file_path))
990
+
991
+
992
+ # Remove Lock file also
993
+ lock_path = Path(str(path_utils.PDF_DIR) + ".lock")
994
+ try:
995
+ if lock_path.exists():
996
+ lock_path.unlink()
997
+ logger.info("Removed lock file during quarantine: %s", lock_path)
998
+ except Exception as e:
999
+ logger.warning("Could not remove lock file during quarantine: %s", e)
1000
+
1001
+
1002
+ if state and self.processing_context.get("processing_started"):
750
1003
  state.text_meta_extracted = False
751
1004
  state.pdf_meta_extracted = False
752
1005
  state.sensitive_meta_processed = False
1006
+ state.anonymized = False
753
1007
  state.save()
754
1008
  logger.debug("Updated PDF state to indicate processing failure")
1009
+ else:
1010
+ # 🔧 Early failure: no current_pdf (or no state).
1011
+ # In this case we want to make sure we don't leave stray files
1012
+ # under PDF_DIR or PDF_DIR/sensitive.
1013
+
1014
+ pdf_dir = self._get_pdf_dir()
1015
+ if pdf_dir and pdf_dir.exists():
1016
+ for candidate_dir in (pdf_dir, pdf_dir / "sensitive"):
1017
+ if candidate_dir.exists():
1018
+ for candidate in candidate_dir.glob("*.pdf"):
1019
+ # Don't delete the original ingress file
1020
+ if (
1021
+ original_path is not None
1022
+ and candidate.resolve() == Path(original_path).resolve()
1023
+ ):
1024
+ continue
1025
+ try:
1026
+ candidate.unlink()
1027
+ logger.debug(
1028
+ "Removed stray PDF during early error cleanup: %s",
1029
+ candidate,
1030
+ )
1031
+ except Exception as e:
1032
+ logger.warning(
1033
+ "Failed to remove stray PDF %s: %s",
1034
+ candidate,
1035
+ e,
1036
+ )
1037
+
755
1038
  except Exception as e:
756
1039
  logger.warning(f"Error during cleanup: {e}")
757
1040
  finally:
758
1041
  # Remove any sensitive copy created during this processing run
759
- sensitive_created = self.processing_context.get('sensitive_copy_created')
1042
+ sensitive_created = self.processing_context.get("sensitive_copy_created")
760
1043
  if sensitive_created:
761
1044
  pdf_obj = self.current_pdf
762
1045
  try:
@@ -765,30 +1048,48 @@ class PdfImportService:
765
1048
  if file_field and getattr(file_field, "name", None):
766
1049
  storage_name = file_field.name
767
1050
  file_field.delete(save=False)
768
- logger.debug("Deleted sensitive copy %s during error cleanup", storage_name)
1051
+ logger.debug(
1052
+ "Deleted sensitive copy %s during error cleanup",
1053
+ storage_name,
1054
+ )
769
1055
  except Exception as cleanup_exc:
770
- logger.warning("Failed to remove sensitive copy during error cleanup: %s", cleanup_exc)
1056
+ logger.warning(
1057
+ "Failed to remove sensitive copy during error cleanup: %s",
1058
+ cleanup_exc,
1059
+ )
1060
+ pdf_dir = self._get_pdf_dir()
1061
+ if original_path and pdf_dir:
1062
+ # Try to remove any extra file that was created during import
1063
+ # Simplest heuristic: same basename as original, but in pdf dir or pdf/sensitive dir
1064
+ for candidate_dir in (pdf_dir, pdf_dir / "sensitive"):
1065
+ candidate = candidate_dir / original_path.name
1066
+ if candidate.exists() and candidate != original_path:
1067
+ try:
1068
+ candidate.unlink()
1069
+ logger.debug(
1070
+ "Removed stray PDF copy during early error cleanup: %s",
1071
+ candidate,
1072
+ )
1073
+ except Exception as e:
1074
+ logger.warning(
1075
+ "Failed to remove stray PDF copy %s: %s",
1076
+ candidate,
1077
+ e,
1078
+ )
771
1079
 
772
1080
  # Always clean up processed files set to prevent blocks
773
- file_path = self.processing_context.get('file_path')
1081
+ file_path = self.processing_context.get("file_path")
774
1082
  if file_path and str(file_path) in self.processed_files:
775
1083
  self.processed_files.remove(str(file_path))
776
- logger.debug(f"Removed {file_path} from processed files during error cleanup")
1084
+ logger.debug(
1085
+ f"Removed {file_path} from processed files during error cleanup"
1086
+ )
777
1087
 
778
1088
  try:
779
- original_path = self.processing_context.get('original_file_path')
780
- logger.debug("PDF cleanup original path: %s (%s)", original_path, type(original_path))
781
- raw_dir = original_path.parent if isinstance(original_path, Path) else None
782
- if (
783
- isinstance(original_path, Path)
784
- and original_path.exists()
785
- and not self.processing_context.get('sensitive_copy_created')
786
- ):
787
- try:
788
- original_path.unlink()
789
- logger.info("Removed original file %s during error cleanup", original_path)
790
- except Exception as remove_exc:
791
- logger.warning("Could not remove original file %s during error cleanup: %s", original_path, remove_exc)
1089
+ raw_dir = (
1090
+ original_path.parent if isinstance(original_path, Path) else None
1091
+ )
1092
+
792
1093
  pdf_dir = self._get_pdf_dir()
793
1094
  if not pdf_dir and raw_dir:
794
1095
  base_dir = raw_dir.parent
@@ -805,7 +1106,12 @@ class PdfImportService:
805
1106
 
806
1107
  # Remove empty PDF subdirectories that might have been created during setup
807
1108
  if pdf_dir and pdf_dir.exists():
808
- for subdir_name in ("sensitive", "cropped_regions", "anonymized", "_processing"):
1109
+ for subdir_name in (
1110
+ "sensitive",
1111
+ "cropped_regions",
1112
+ "anonymized",
1113
+ "_processing",
1114
+ ):
809
1115
  subdir_path = pdf_dir / subdir_name
810
1116
  if subdir_path.exists() and subdir_path.is_dir():
811
1117
  try:
@@ -813,22 +1119,49 @@ class PdfImportService:
813
1119
  except StopIteration:
814
1120
  try:
815
1121
  subdir_path.rmdir()
816
- logger.debug("Removed empty directory %s during error cleanup", subdir_path)
1122
+ logger.debug(
1123
+ "Removed empty directory %s during error cleanup",
1124
+ subdir_path,
1125
+ )
817
1126
  except OSError as rm_err:
818
- logger.debug("Could not remove directory %s: %s", subdir_path, rm_err)
1127
+ logger.debug(
1128
+ "Could not remove directory %s: %s",
1129
+ subdir_path,
1130
+ rm_err,
1131
+ )
819
1132
  except Exception as iter_err:
820
- logger.debug("Could not inspect directory %s: %s", subdir_path, iter_err)
821
-
822
- raw_count = len(list(raw_dir.glob("*"))) if raw_dir and raw_dir.exists() else None
823
- pdf_count = len(list(pdf_dir.glob("*"))) if pdf_dir and pdf_dir.exists() else None
1133
+ logger.debug(
1134
+ "Could not inspect directory %s: %s",
1135
+ subdir_path,
1136
+ iter_err,
1137
+ )
1138
+
1139
+ raw_count = (
1140
+ len(list(raw_dir.glob("*")))
1141
+ if raw_dir and raw_dir.exists()
1142
+ else None
1143
+ )
1144
+ pdf_count = (
1145
+ len(list(pdf_dir.glob("*")))
1146
+ if pdf_dir and pdf_dir.exists()
1147
+ else None
1148
+ )
824
1149
 
825
- sensitive_path = self.processing_context.get('sensitive_file_path')
1150
+ sensitive_path = self.processing_context.get("sensitive_file_path")
826
1151
  if sensitive_path:
827
1152
  sensitive_parent = Path(sensitive_path).parent
828
- sensitive_count = len(list(sensitive_parent.glob("*"))) if sensitive_parent.exists() else None
1153
+ sensitive_count = (
1154
+ len(list(sensitive_parent.glob("*")))
1155
+ if sensitive_parent.exists()
1156
+ else None
1157
+ )
829
1158
  else:
830
1159
  sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
831
- sensitive_count = len(list(sensitive_dir.glob("*"))) if sensitive_dir and sensitive_dir.exists() else None
1160
+ sensitive_count = (
1161
+ len(list(sensitive_dir.glob("*")))
1162
+ if sensitive_dir and sensitive_dir.exists()
1163
+ else None
1164
+ )
832
1165
 
833
1166
  logger.info(
834
1167
  "PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
@@ -843,17 +1176,17 @@ class PdfImportService:
843
1176
  """Cleanup processing context."""
844
1177
  try:
845
1178
  # Clean up temporary directories
846
- if self.processing_context.get('text_extracted'):
847
- crops_dir = path_utils.PDF_DIR / 'cropped_regions'
1179
+ if self.processing_context.get("text_extracted"):
1180
+ crops_dir = path_utils.PDF_DIR / "cropped_regions"
848
1181
  if crops_dir.exists() and not any(crops_dir.iterdir()):
849
1182
  crops_dir.rmdir()
850
-
1183
+
851
1184
  # Always remove from processed files set after processing attempt
852
- file_path = self.processing_context.get('file_path')
1185
+ file_path = self.processing_context.get("file_path")
853
1186
  if file_path and str(file_path) in self.processed_files:
854
1187
  self.processed_files.remove(str(file_path))
855
1188
  logger.debug(f"Removed {file_path} from processed files set")
856
-
1189
+
857
1190
  except Exception as e:
858
1191
  logger.warning(f"Error during context cleanup: {e}")
859
1192
  finally:
@@ -862,44 +1195,43 @@ class PdfImportService:
862
1195
  self.processing_context = {}
863
1196
 
864
1197
  def import_simple(
865
- self,
866
- file_path: Union[Path, str],
867
- center_name: str,
868
- delete_source: bool = False
1198
+ self, file_path: Union[Path, str], center_name: str, delete_source: bool = False
869
1199
  ) -> "RawPdfFile":
870
1200
  """
871
1201
  Simple PDF import without text processing or anonymization.
872
1202
  Uses centralized PDF instance management pattern.
873
-
1203
+
874
1204
  Args:
875
1205
  file_path: Path to the PDF file to import
876
1206
  center_name: Name of the center to associate with PDF
877
1207
  delete_source: Whether to delete the source file after import
878
-
1208
+
879
1209
  Returns:
880
1210
  RawPdfFile instance after basic import
881
1211
  """
882
1212
  try:
883
1213
  # Initialize simple processing context
884
- self._initialize_processing_context(file_path, center_name, delete_source, False)
885
-
1214
+ self._initialize_processing_context(
1215
+ file_path, center_name, delete_source, False
1216
+ )
1217
+
886
1218
  # Validate file
887
1219
  self._validate_and_prepare_file()
888
-
1220
+
889
1221
  # Create PDF instance
890
1222
  logger.info("Starting simple import - creating RawPdfFile instance...")
891
1223
  self.current_pdf = RawPdfFile.create_from_file_initialized(
892
- file_path=self.processing_context['file_path'],
1224
+ file_path=self.processing_context["file_path"],
893
1225
  center_name=center_name,
894
1226
  delete_source=delete_source,
895
1227
  )
896
-
1228
+
897
1229
  if not self.current_pdf:
898
1230
  raise RuntimeError("Failed to create RawPdfFile instance")
899
-
1231
+
900
1232
  # Mark as processed
901
- self.processed_files.add(str(self.processing_context['file_path']))
902
-
1233
+ self.processed_files.add(str(self.processing_context["file_path"]))
1234
+
903
1235
  # Set basic state for simple import
904
1236
  state = self._ensure_state(self.current_pdf)
905
1237
  if state:
@@ -908,56 +1240,68 @@ class PdfImportService:
908
1240
  state.sensitive_meta_processed = False
909
1241
  state.save()
910
1242
  logger.info("Set PDF state: processed=False for simple import")
911
-
1243
+
912
1244
  # Save changes
913
1245
  with transaction.atomic():
914
1246
  self.current_pdf.save()
915
-
916
- logger.info("Simple import completed for RawPdfFile hash: %s", self.current_pdf.pdf_hash)
1247
+
1248
+ logger.info(
1249
+ "Simple import completed for RawPdfFile hash: %s",
1250
+ self.current_pdf.pdf_hash,
1251
+ )
917
1252
  return self.current_pdf
918
-
1253
+
919
1254
  except Exception as e:
920
1255
  logger.error(f"Simple PDF import failed for {file_path}: {e}")
921
1256
  self._cleanup_on_error()
922
1257
  raise
923
1258
  finally:
924
1259
  self._cleanup_processing_context()
925
-
926
- def check_storage_capacity(self, file_path: Union[Path, str], storage_root, min_required_space) -> None:
1260
+
1261
+ def check_storage_capacity(
1262
+ self, file_path: Union[Path, str], storage_root, min_required_space
1263
+ ) -> bool:
927
1264
  """
928
1265
  Check if there is sufficient storage capacity for the PDF file.
929
-
1266
+
930
1267
  Args:
931
1268
  file_path: Path to the PDF file to check
932
-
1269
+
933
1270
  Raises:
934
1271
  InsufficientStorageError: If there is not enough space
935
1272
  """
936
1273
  import shutil
1274
+
937
1275
  from endoreg_db.exceptions import InsufficientStorageError
938
-
1276
+
939
1277
  file_path = Path(file_path)
940
1278
  if not file_path.exists():
941
1279
  raise FileNotFoundError(f"File not found for storage check: {file_path}")
942
-
1280
+
943
1281
  # Get the size of the file
944
1282
  file_size = file_path.stat().st_size
945
-
1283
+
946
1284
  # Get available space in the storage directory
947
1285
 
948
1286
  total, used, free = shutil.disk_usage(storage_root)
949
-
1287
+
950
1288
  if file_size:
951
1289
  min_required_space = file_size if isinstance(min_required_space, int) else 0
952
1290
 
953
1291
  # Check if there is enough space
954
1292
  if file_size > free:
955
- raise InsufficientStorageError(f"Not enough space to store PDF file: {file_path}")
956
- logger.info(f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available")
957
-
1293
+ raise InsufficientStorageError(
1294
+ f"Not enough space to store PDF file: {file_path}"
1295
+ )
1296
+ logger.info(
1297
+ f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available"
1298
+ )
1299
+
958
1300
  return True
959
-
960
- def create_sensitive_file(self, pdf_instance: "RawPdfFile" = None, file_path: Union[Path, str] = None) -> None:
1301
+
1302
+ def create_sensitive_file(
1303
+ self, pdf_instance: "RawPdfFile", file_path: Union[Path, str]
1304
+ ) -> None:
961
1305
  """
962
1306
  Create a copy of the PDF file in the sensitive directory and update the file reference.
963
1307
  Delete the source path to avoid duplicates.
@@ -966,7 +1310,9 @@ class PdfImportService:
966
1310
  Ensures the FileField points to the file under STORAGE_DIR/pdfs/sensitive and never back to raw_pdfs.
967
1311
  """
968
1312
  pdf_file = pdf_instance or self.current_pdf
969
- source_path = Path(file_path) if file_path else self.processing_context.get('file_path')
1313
+ source_path = (
1314
+ Path(file_path) if file_path else self.processing_context.get("file_path")
1315
+ )
970
1316
 
971
1317
  if not pdf_file:
972
1318
  raise ValueError("No PDF instance available for creating sensitive file")
@@ -989,25 +1335,37 @@ class PdfImportService:
989
1335
  try:
990
1336
  target.unlink()
991
1337
  except Exception as e:
992
- logger.warning("Could not remove existing sensitive target %s: %s", target, e)
1338
+ logger.warning(
1339
+ "Could not remove existing sensitive target %s: %s",
1340
+ target,
1341
+ e,
1342
+ )
993
1343
  shutil.move(str(source_path), str(target))
994
1344
  logger.info(f"Moved PDF to sensitive directory: {target}")
995
1345
 
996
1346
  # Update FileField to reference the file under STORAGE_DIR
997
1347
  # We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
998
1348
  try:
999
- relative_name = str(target.relative_to(path_utils.STORAGE_DIR)) # Point Django FileField to sensitive storage
1349
+ relative_name = str(
1350
+ target.relative_to(path_utils.STORAGE_DIR)
1351
+ ) # Point Django FileField to sensitive storage
1000
1352
  except ValueError:
1001
1353
  # Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
1002
1354
  relative_name = str(target)
1003
1355
 
1004
1356
  # Only update when changed
1005
- if getattr(pdf_file.file, 'name', None) != relative_name:
1357
+ if getattr(pdf_file.file, "name", None) != relative_name:
1006
1358
  pdf_file.file.name = relative_name
1007
- pdf_file.save(update_fields=['file'])
1008
- logger.info("Updated PDF FileField reference to sensitive path: %s", pdf_file.file.path)
1359
+ pdf_file.save(update_fields=["file"])
1360
+ logger.info(
1361
+ "Updated PDF FileField reference to sensitive path: %s",
1362
+ pdf_file.file.path,
1363
+ )
1009
1364
  else:
1010
- logger.debug("PDF FileField already points to sensitive path: %s", pdf_file.file.path)
1365
+ logger.debug(
1366
+ "PDF FileField already points to sensitive path: %s",
1367
+ pdf_file.file.path,
1368
+ )
1011
1369
 
1012
1370
  # Best-effort: if original source still exists (e.g., copy), remove it to avoid re-triggers
1013
1371
  try:
@@ -1018,57 +1376,78 @@ class PdfImportService:
1018
1376
  logger.warning(f"Could not delete original PDF file {source_path}: {e}")
1019
1377
 
1020
1378
  except Exception as e:
1021
- logger.warning(f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}", exc_info=True)
1379
+ logger.warning(
1380
+ f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}",
1381
+ exc_info=True,
1382
+ )
1022
1383
 
1023
- def archive_or_quarantine_file(self, pdf_instance: "RawPdfFile" = None, source_file_path: Union[Path, str] = None,
1024
- quarantine_reason: str = None, is_pdf_problematic: bool = None) -> bool:
1384
+ def archive_or_quarantine_file(
1385
+ self,
1386
+ pdf_instance: "RawPdfFile",
1387
+ source_file_path: Union[Path, str],
1388
+ quarantine_reason: str,
1389
+ is_pdf_problematic: bool,
1390
+ ) -> bool:
1025
1391
  """
1026
1392
  Archive or quarantine file based on the state of the PDF processing.
1027
1393
  Uses the central PDF instance and processing context if parameters not provided.
1028
-
1394
+
1029
1395
  Args:
1030
1396
  pdf_instance: Optional PDF instance, defaults to self.current_pdf
1031
1397
  source_file_path: Optional source file path, defaults to processing_context['file_path']
1032
1398
  quarantine_reason: Optional quarantine reason, defaults to processing_context['error_reason']
1033
1399
  is_pdf_problematic: Optional override for problematic state
1034
-
1400
+
1035
1401
  Returns:
1036
1402
  bool: True if file was quarantined, False if archived successfully
1037
1403
  """
1038
1404
  pdf_file = pdf_instance or self.current_pdf
1039
- file_path = Path(source_file_path) if source_file_path else self.processing_context.get('file_path')
1040
- quarantine_reason = quarantine_reason or self.processing_context.get('error_reason')
1041
-
1405
+ file_path = (
1406
+ Path(source_file_path)
1407
+ if source_file_path
1408
+ else self.processing_context.get("file_path")
1409
+ )
1410
+ quarantine_reason = str(quarantine_reason or self.processing_context.get(
1411
+ "error_reason"
1412
+ ))
1413
+
1042
1414
  if not pdf_file:
1043
1415
  raise ValueError("No PDF instance available for archiving/quarantine")
1044
1416
  if not file_path:
1045
1417
  raise ValueError("No file path available for archiving/quarantine")
1046
-
1418
+
1047
1419
  # Determine if the PDF is problematic
1048
- pdf_problematic = is_pdf_problematic if is_pdf_problematic is not None else pdf_file.is_problematic
1049
-
1420
+ pdf_problematic = (
1421
+ is_pdf_problematic
1422
+ if is_pdf_problematic is not None
1423
+ else pdf_file.is_problematic
1424
+ )
1425
+
1050
1426
  if pdf_problematic:
1051
1427
  # Quarantine the file
1052
- logger.warning(f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}")
1428
+ logger.warning(
1429
+ f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}"
1430
+ )
1053
1431
  quarantine_dir = path_utils.PDF_DIR / "quarantine"
1054
1432
  os.makedirs(quarantine_dir, exist_ok=True)
1055
-
1433
+
1056
1434
  quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
1057
1435
  try:
1058
1436
  shutil.move(file_path, quarantine_path)
1059
- pdf_file.quarantine_reason = quarantine_reason or "File processing failed"
1060
- pdf_file.save(update_fields=['quarantine_reason'])
1437
+ pdf_file.save(update_fields=["quarantine_reason"])
1061
1438
  logger.info(f"Moved problematic PDF to quarantine: {quarantine_path}")
1062
1439
  return True
1063
1440
  except Exception as e:
1064
1441
  logger.error(f"Failed to quarantine PDF {pdf_file.pdf_hash}: {e}")
1065
- return True # Still consider as quarantined to prevent further processing
1442
+ return (
1443
+ True # Still consider as quarantined to prevent further processing
1444
+ )
1066
1445
  else:
1067
1446
  # Archive the file normally
1068
1447
  logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
1069
1448
  archive_dir = path_utils.PDF_DIR / "processed"
1070
1449
  os.makedirs(archive_dir, exist_ok=True)
1071
-
1450
+
1072
1451
  archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
1073
1452
  try:
1074
1453
  shutil.move(file_path, archive_path)
@@ -1077,3 +1456,25 @@ class PdfImportService:
1077
1456
  except Exception as e:
1078
1457
  logger.error(f"Failed to archive PDF {pdf_file.pdf_hash}: {e}")
1079
1458
  return False
1459
+
1460
+ def _is_placeholder_value(self, field_name: str, value) -> bool:
1461
+ """Return True if a SensitiveMeta field still has a dummy/default value."""
1462
+ if value is None:
1463
+ return True
1464
+
1465
+ # String placeholders
1466
+ if isinstance(value, str):
1467
+ if value in {self.DEFAULT_PATIENT_FIRST_NAME, self.DEFAULT_PATIENT_LAST_NAME}:
1468
+ return True
1469
+
1470
+ # Date placeholders
1471
+ if isinstance(value, date):
1472
+ # Default DOB
1473
+ if field_name == "patient_dob" and value == self.DEFAULT_PATIENT_DOB:
1474
+ return True
1475
+ # "Today" exam date created as fallback – allow anonymizer to override
1476
+ if field_name == "examination_date" and value == date.today():
1477
+ return True
1478
+
1479
+ return False
1480
+