endoreg-db 0.8.4.4__py3-none-any.whl → 0.8.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of endoreg-db might be problematic. Click here for more details.
- endoreg_db/authz/auth.py +74 -0
- endoreg_db/authz/backends.py +168 -0
- endoreg_db/authz/management/commands/list_routes.py +18 -0
- endoreg_db/authz/middleware.py +83 -0
- endoreg_db/authz/permissions.py +127 -0
- endoreg_db/authz/policy.py +218 -0
- endoreg_db/authz/views_auth.py +66 -0
- endoreg_db/config/env.py +13 -8
- endoreg_db/data/__init__.py +8 -31
- endoreg_db/data/_examples/disease.yaml +55 -0
- endoreg_db/data/_examples/disease_classification.yaml +13 -0
- endoreg_db/data/_examples/disease_classification_choice.yaml +62 -0
- endoreg_db/data/_examples/event.yaml +64 -0
- endoreg_db/data/_examples/examination.yaml +72 -0
- endoreg_db/data/_examples/finding/anatomy_colon.yaml +128 -0
- endoreg_db/data/_examples/finding/colonoscopy.yaml +40 -0
- endoreg_db/data/_examples/finding/colonoscopy_bowel_prep.yaml +56 -0
- endoreg_db/data/_examples/finding/complication.yaml +16 -0
- endoreg_db/data/_examples/finding/data.yaml +105 -0
- endoreg_db/data/_examples/finding/examination_setting.yaml +16 -0
- endoreg_db/data/_examples/finding/medication_related.yaml +18 -0
- endoreg_db/data/_examples/finding/outcome.yaml +12 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_bowel_preparation.yaml +68 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_jnet.yaml +22 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_kudo.yaml +25 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_circularity.yaml +20 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_planarity.yaml +24 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_size.yaml +68 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_lesion_surface.yaml +20 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_location.yaml +80 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_lst.yaml +21 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_nice.yaml +20 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_paris.yaml +26 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_sano.yaml +22 -0
- endoreg_db/data/_examples/finding_classification/colonoscopy_summary.yaml +53 -0
- endoreg_db/data/_examples/finding_classification/complication_generic.yaml +25 -0
- endoreg_db/data/_examples/finding_classification/examination_setting_generic.yaml +40 -0
- endoreg_db/data/_examples/finding_classification/histology_colo.yaml +51 -0
- endoreg_db/data/_examples/finding_classification/intervention_required.yaml +26 -0
- endoreg_db/data/_examples/finding_classification/medication_related.yaml +23 -0
- endoreg_db/data/_examples/finding_classification/visualized.yaml +33 -0
- endoreg_db/data/_examples/finding_classification_choice/bowel_preparation.yaml +78 -0
- endoreg_db/data/_examples/finding_classification_choice/colon_lesion_circularity_default.yaml +32 -0
- endoreg_db/data/_examples/finding_classification_choice/colon_lesion_jnet.yaml +15 -0
- endoreg_db/data/_examples/finding_classification_choice/colon_lesion_kudo.yaml +23 -0
- endoreg_db/data/_examples/finding_classification_choice/colon_lesion_lst.yaml +15 -0
- endoreg_db/data/_examples/finding_classification_choice/colon_lesion_nice.yaml +17 -0
- endoreg_db/data/_examples/finding_classification_choice/colon_lesion_paris.yaml +57 -0
- endoreg_db/data/_examples/finding_classification_choice/colon_lesion_planarity_default.yaml +49 -0
- endoreg_db/data/_examples/finding_classification_choice/colon_lesion_sano.yaml +14 -0
- endoreg_db/data/_examples/finding_classification_choice/colon_lesion_surface_intact_default.yaml +36 -0
- endoreg_db/data/_examples/finding_classification_choice/colonoscopy_location.yaml +229 -0
- endoreg_db/data/_examples/finding_classification_choice/colonoscopy_not_complete_reason.yaml +19 -0
- endoreg_db/data/_examples/finding_classification_choice/colonoscopy_size.yaml +82 -0
- endoreg_db/data/_examples/finding_classification_choice/colonoscopy_summary_worst_finding.yaml +15 -0
- endoreg_db/data/_examples/finding_classification_choice/complication_generic_types.yaml +15 -0
- endoreg_db/data/_examples/finding_classification_choice/examination_setting_generic_types.yaml +15 -0
- endoreg_db/data/_examples/finding_classification_choice/histology.yaml +24 -0
- endoreg_db/data/_examples/finding_classification_choice/histology_polyp.yaml +20 -0
- endoreg_db/data/_examples/finding_classification_choice/outcome.yaml +19 -0
- endoreg_db/data/_examples/finding_classification_choice/yes_no_na.yaml +11 -0
- endoreg_db/data/_examples/finding_classification_type/colonoscopy_basic.yaml +48 -0
- endoreg_db/data/_examples/finding_intervention/endoscopy.yaml +43 -0
- endoreg_db/data/_examples/finding_intervention/endoscopy_colonoscopy.yaml +168 -0
- endoreg_db/data/_examples/finding_intervention/endoscopy_egd.yaml +128 -0
- endoreg_db/data/_examples/finding_intervention/endoscopy_ercp.yaml +32 -0
- endoreg_db/data/_examples/finding_intervention/endoscopy_eus_lower.yaml +9 -0
- endoreg_db/data/_examples/finding_intervention/endoscopy_eus_upper.yaml +36 -0
- endoreg_db/data/_examples/finding_intervention_type/endoscopy.yaml +15 -0
- endoreg_db/data/_examples/finding_type/data.yaml +43 -0
- endoreg_db/data/_examples/requirement/age.yaml +26 -0
- endoreg_db/data/_examples/requirement/colonoscopy_baseline_austria.yaml +45 -0
- endoreg_db/data/_examples/requirement/disease_cardiovascular.yaml +79 -0
- endoreg_db/data/_examples/requirement/disease_classification_choice_cardiovascular.yaml +41 -0
- endoreg_db/data/_examples/requirement/disease_hepatology.yaml +12 -0
- endoreg_db/data/_examples/requirement/disease_misc.yaml +12 -0
- endoreg_db/data/_examples/requirement/disease_renal.yaml +96 -0
- endoreg_db/data/_examples/requirement/endoscopy_bleeding_risk.yaml +59 -0
- endoreg_db/data/_examples/requirement/event_cardiology.yaml +251 -0
- endoreg_db/data/_examples/requirement/event_requirements.yaml +145 -0
- endoreg_db/data/_examples/requirement/finding_colon_polyp.yaml +50 -0
- endoreg_db/data/_examples/requirement/gender.yaml +25 -0
- endoreg_db/data/_examples/requirement/lab_value.yaml +441 -0
- endoreg_db/data/_examples/requirement/medication.yaml +93 -0
- endoreg_db/data/_examples/requirement_operator/age.yaml +13 -0
- endoreg_db/data/_examples/requirement_operator/lab_operators.yaml +129 -0
- endoreg_db/data/_examples/requirement_operator/model_operators.yaml +96 -0
- endoreg_db/data/_examples/requirement_set/01_endoscopy_generic.yaml +48 -0
- endoreg_db/data/_examples/requirement_set/colonoscopy_austria_screening.yaml +57 -0
- endoreg_db/data/_examples/yaml_examples.xlsx +0 -0
- endoreg_db/data/ai_model_meta/default_multilabel_classification.yaml +4 -3
- endoreg_db/data/event_classification/data.yaml +4 -0
- endoreg_db/data/event_classification_choice/data.yaml +9 -0
- endoreg_db/data/finding_classification/colonoscopy_bowel_preparation.yaml +43 -70
- endoreg_db/data/finding_classification/colonoscopy_lesion_size.yaml +22 -52
- endoreg_db/data/finding_classification/colonoscopy_location.yaml +31 -62
- endoreg_db/data/finding_classification/histology_colo.yaml +28 -36
- endoreg_db/data/requirement/colon_polyp_intervention.yaml +49 -0
- endoreg_db/data/requirement/coloreg_colon_polyp.yaml +49 -0
- endoreg_db/data/requirement_set/01_endoscopy_generic.yaml +31 -12
- endoreg_db/data/requirement_set/01_laboratory.yaml +13 -0
- endoreg_db/data/requirement_set/02_endoscopy_bleeding_risk.yaml +46 -0
- endoreg_db/data/requirement_set/90_coloreg.yaml +178 -0
- endoreg_db/data/requirement_set/_old_ +109 -0
- endoreg_db/data/requirement_set_type/data.yaml +21 -0
- endoreg_db/data/setup_config.yaml +4 -4
- endoreg_db/data/tag/requirement_set_tags.yaml +21 -0
- endoreg_db/exceptions.py +5 -2
- endoreg_db/helpers/data_loader.py +1 -1
- endoreg_db/management/commands/create_model_meta_from_huggingface.py +21 -10
- endoreg_db/management/commands/create_multilabel_model_meta.py +299 -129
- endoreg_db/management/commands/import_video.py +9 -10
- endoreg_db/management/commands/import_video_with_classification.py +1 -1
- endoreg_db/management/commands/init_default_ai_model.py +1 -1
- endoreg_db/management/commands/list_routes.py +18 -0
- endoreg_db/management/commands/load_ai_model_data.py +2 -1
- endoreg_db/management/commands/load_center_data.py +12 -12
- endoreg_db/management/commands/load_requirement_data.py +60 -31
- endoreg_db/management/commands/load_requirement_set_tags.py +95 -0
- endoreg_db/management/commands/setup_endoreg_db.py +14 -10
- endoreg_db/management/commands/storage_management.py +271 -203
- endoreg_db/migrations/0001_initial.py +1799 -1300
- endoreg_db/migrations/0002_requirementset_depends_on.py +18 -0
- endoreg_db/migrations/_old/0001_initial.py +1857 -0
- endoreg_db/migrations/_old/0004_employee_city_employee_post_code_employee_street_and_more.py +68 -0
- endoreg_db/migrations/_old/0004_remove_casetemplate_rules_and_more.py +77 -0
- endoreg_db/migrations/_old/0005_merge_20251111_1003.py +14 -0
- endoreg_db/migrations/_old/0006_sensitivemeta_anonymized_text_and_more.py +68 -0
- endoreg_db/migrations/_old/0007_remove_rule_attribute_dtype_remove_rule_rule_type_and_more.py +89 -0
- endoreg_db/migrations/_old/0008_remove_event_event_classification_and_more.py +27 -0
- endoreg_db/migrations/_old/0009_alter_modelmeta_options_and_more.py +21 -0
- endoreg_db/models/__init__.py +78 -123
- endoreg_db/models/administration/__init__.py +21 -42
- endoreg_db/models/administration/ai/active_model.py +2 -2
- endoreg_db/models/administration/ai/ai_model.py +7 -6
- endoreg_db/models/administration/case/__init__.py +1 -15
- endoreg_db/models/administration/case/case.py +3 -3
- endoreg_db/models/administration/case/case_template/__init__.py +2 -14
- endoreg_db/models/administration/case/case_template/case_template.py +2 -124
- endoreg_db/models/administration/case/case_template/case_template_rule.py +2 -268
- endoreg_db/models/administration/case/case_template/case_template_rule_value.py +2 -85
- endoreg_db/models/administration/case/case_template/case_template_type.py +2 -25
- endoreg_db/models/administration/center/center.py +33 -19
- endoreg_db/models/administration/center/center_product.py +12 -9
- endoreg_db/models/administration/center/center_resource.py +25 -19
- endoreg_db/models/administration/center/center_shift.py +21 -17
- endoreg_db/models/administration/center/center_waste.py +16 -8
- endoreg_db/models/administration/person/__init__.py +2 -0
- endoreg_db/models/administration/person/employee/employee.py +10 -5
- endoreg_db/models/administration/person/employee/employee_qualification.py +9 -4
- endoreg_db/models/administration/person/employee/employee_type.py +12 -6
- endoreg_db/models/administration/person/examiner/examiner.py +13 -11
- endoreg_db/models/administration/person/patient/__init__.py +2 -0
- endoreg_db/models/administration/person/patient/patient.py +103 -100
- endoreg_db/models/administration/person/patient/patient_external_id.py +37 -0
- endoreg_db/models/administration/person/person.py +4 -0
- endoreg_db/models/administration/person/profession/__init__.py +8 -4
- endoreg_db/models/administration/person/user/portal_user_information.py +11 -7
- endoreg_db/models/administration/product/product.py +20 -15
- endoreg_db/models/administration/product/product_material.py +17 -18
- endoreg_db/models/administration/product/product_weight.py +12 -8
- endoreg_db/models/administration/product/reference_product.py +23 -55
- endoreg_db/models/administration/qualification/qualification.py +7 -3
- endoreg_db/models/administration/qualification/qualification_type.py +7 -3
- endoreg_db/models/administration/shift/scheduled_days.py +8 -5
- endoreg_db/models/administration/shift/shift.py +16 -12
- endoreg_db/models/administration/shift/shift_type.py +23 -31
- endoreg_db/models/label/__init__.py +7 -8
- endoreg_db/models/label/annotation/image_classification.py +10 -9
- endoreg_db/models/label/annotation/video_segmentation_annotation.py +8 -5
- endoreg_db/models/label/label.py +15 -15
- endoreg_db/models/label/label_set.py +19 -6
- endoreg_db/models/label/label_type.py +1 -1
- endoreg_db/models/label/label_video_segment/_create_from_video.py +5 -8
- endoreg_db/models/label/label_video_segment/label_video_segment.py +76 -102
- endoreg_db/models/label/video_segmentation_label.py +4 -0
- endoreg_db/models/label/video_segmentation_labelset.py +4 -3
- endoreg_db/models/media/frame/frame.py +22 -22
- endoreg_db/models/media/pdf/raw_pdf.py +249 -177
- endoreg_db/models/media/pdf/report_file.py +25 -29
- endoreg_db/models/media/pdf/report_reader/report_reader_config.py +30 -46
- endoreg_db/models/media/pdf/report_reader/report_reader_flag.py +23 -7
- endoreg_db/models/media/video/__init__.py +1 -0
- endoreg_db/models/media/video/create_from_file.py +48 -56
- endoreg_db/models/media/video/pipe_1.py +30 -33
- endoreg_db/models/media/video/pipe_2.py +8 -9
- endoreg_db/models/media/video/video_file.py +359 -204
- endoreg_db/models/media/video/video_file_ai.py +288 -74
- endoreg_db/models/media/video/video_file_anonymize.py +38 -38
- endoreg_db/models/media/video/video_file_frames/__init__.py +3 -1
- endoreg_db/models/media/video/video_file_frames/_bulk_create_frames.py +6 -8
- endoreg_db/models/media/video/video_file_frames/_create_frame_object.py +7 -9
- endoreg_db/models/media/video/video_file_frames/_delete_frames.py +9 -8
- endoreg_db/models/media/video/video_file_frames/_extract_frames.py +38 -45
- endoreg_db/models/media/video/video_file_frames/_get_frame.py +6 -8
- endoreg_db/models/media/video/video_file_frames/_get_frame_number.py +4 -18
- endoreg_db/models/media/video/video_file_frames/_get_frame_path.py +4 -3
- endoreg_db/models/media/video/video_file_frames/_get_frame_paths.py +7 -6
- endoreg_db/models/media/video/video_file_frames/_get_frame_range.py +6 -8
- endoreg_db/models/media/video/video_file_frames/_get_frames.py +6 -8
- endoreg_db/models/media/video/video_file_frames/_initialize_frames.py +15 -25
- endoreg_db/models/media/video/video_file_frames/_manage_frame_range.py +26 -23
- endoreg_db/models/media/video/video_file_frames/_mark_frames_extracted_status.py +23 -14
- endoreg_db/models/media/video/video_file_io.py +109 -62
- endoreg_db/models/media/video/video_file_meta/get_crop_template.py +3 -3
- endoreg_db/models/media/video/video_file_meta/get_endo_roi.py +5 -3
- endoreg_db/models/media/video/video_file_meta/get_fps.py +37 -34
- endoreg_db/models/media/video/video_file_meta/initialize_video_specs.py +19 -25
- endoreg_db/models/media/video/video_file_meta/text_meta.py +41 -38
- endoreg_db/models/media/video/video_file_meta/video_meta.py +14 -7
- endoreg_db/models/media/video/video_file_segments.py +24 -17
- endoreg_db/models/media/video/video_metadata.py +19 -35
- endoreg_db/models/media/video/video_processing.py +96 -95
- endoreg_db/models/medical/contraindication/__init__.py +13 -3
- endoreg_db/models/medical/disease.py +22 -16
- endoreg_db/models/medical/event.py +31 -18
- endoreg_db/models/medical/examination/__init__.py +13 -6
- endoreg_db/models/medical/examination/examination.py +17 -18
- endoreg_db/models/medical/examination/examination_indication.py +26 -25
- endoreg_db/models/medical/examination/examination_time.py +16 -6
- endoreg_db/models/medical/examination/examination_time_type.py +9 -6
- endoreg_db/models/medical/examination/examination_type.py +3 -4
- endoreg_db/models/medical/finding/finding.py +38 -39
- endoreg_db/models/medical/finding/finding_classification.py +37 -48
- endoreg_db/models/medical/finding/finding_intervention.py +27 -22
- endoreg_db/models/medical/finding/finding_type.py +13 -12
- endoreg_db/models/medical/hardware/endoscope.py +20 -26
- endoreg_db/models/medical/hardware/endoscopy_processor.py +2 -2
- endoreg_db/models/medical/laboratory/lab_value.py +62 -91
- endoreg_db/models/medical/medication/medication.py +22 -10
- endoreg_db/models/medical/medication/medication_indication.py +29 -3
- endoreg_db/models/medical/medication/medication_indication_type.py +25 -14
- endoreg_db/models/medical/medication/medication_intake_time.py +31 -19
- endoreg_db/models/medical/medication/medication_schedule.py +27 -16
- endoreg_db/models/medical/organ/__init__.py +15 -12
- endoreg_db/models/medical/patient/medication_examples.py +1 -5
- endoreg_db/models/medical/patient/patient_disease.py +20 -23
- endoreg_db/models/medical/patient/patient_event.py +19 -22
- endoreg_db/models/medical/patient/patient_examination.py +48 -54
- endoreg_db/models/medical/patient/patient_examination_indication.py +16 -14
- endoreg_db/models/medical/patient/patient_finding.py +122 -139
- endoreg_db/models/medical/patient/patient_finding_classification.py +44 -49
- endoreg_db/models/medical/patient/patient_finding_intervention.py +8 -19
- endoreg_db/models/medical/patient/patient_lab_sample.py +28 -23
- endoreg_db/models/medical/patient/patient_lab_value.py +82 -89
- endoreg_db/models/medical/patient/patient_medication.py +27 -38
- endoreg_db/models/medical/patient/patient_medication_schedule.py +28 -36
- endoreg_db/models/medical/risk/risk.py +7 -6
- endoreg_db/models/medical/risk/risk_type.py +8 -5
- endoreg_db/models/metadata/model_meta.py +60 -29
- endoreg_db/models/metadata/model_meta_logic.py +139 -18
- endoreg_db/models/metadata/pdf_meta.py +19 -24
- endoreg_db/models/metadata/sensitive_meta.py +102 -85
- endoreg_db/models/metadata/sensitive_meta_logic.py +383 -43
- endoreg_db/models/metadata/video_meta.py +51 -31
- endoreg_db/models/metadata/video_prediction_logic.py +16 -23
- endoreg_db/models/metadata/video_prediction_meta.py +29 -33
- endoreg_db/models/other/distribution/date_value_distribution.py +89 -29
- endoreg_db/models/other/distribution/multiple_categorical_value_distribution.py +21 -5
- endoreg_db/models/other/distribution/numeric_value_distribution.py +114 -53
- endoreg_db/models/other/distribution/single_categorical_value_distribution.py +4 -3
- endoreg_db/models/other/emission/emission_factor.py +18 -8
- endoreg_db/models/other/gender.py +10 -5
- endoreg_db/models/other/information_source.py +25 -25
- endoreg_db/models/other/material.py +9 -5
- endoreg_db/models/other/resource.py +6 -4
- endoreg_db/models/other/tag.py +10 -5
- endoreg_db/models/other/transport_route.py +13 -8
- endoreg_db/models/other/unit.py +10 -6
- endoreg_db/models/other/waste.py +6 -5
- endoreg_db/models/requirement/requirement.py +580 -272
- endoreg_db/models/requirement/requirement_error.py +85 -0
- endoreg_db/models/requirement/requirement_evaluation/evaluate_with_dependencies.py +268 -0
- endoreg_db/models/requirement/requirement_evaluation/operator_evaluation_models.py +3 -6
- endoreg_db/models/requirement/requirement_evaluation/requirement_type_parser.py +90 -64
- endoreg_db/models/requirement/requirement_operator.py +36 -33
- endoreg_db/models/requirement/requirement_set.py +74 -57
- endoreg_db/models/state/__init__.py +4 -4
- endoreg_db/models/state/abstract.py +2 -2
- endoreg_db/models/state/anonymization.py +12 -0
- endoreg_db/models/state/audit_ledger.py +46 -47
- endoreg_db/models/state/label_video_segment.py +9 -0
- endoreg_db/models/state/raw_pdf.py +40 -46
- endoreg_db/models/state/sensitive_meta.py +6 -2
- endoreg_db/models/state/video.py +58 -53
- endoreg_db/models/upload_job.py +32 -55
- endoreg_db/models/utils.py +1 -2
- endoreg_db/root_urls.py +21 -2
- endoreg_db/serializers/__init__.py +26 -57
- endoreg_db/serializers/anonymization.py +18 -10
- endoreg_db/serializers/meta/report_meta.py +1 -1
- endoreg_db/serializers/meta/sensitive_meta_detail.py +63 -118
- endoreg_db/serializers/misc/__init__.py +1 -1
- endoreg_db/serializers/misc/file_overview.py +33 -91
- endoreg_db/serializers/misc/{vop_patient_data.py → sensitive_patient_data.py} +1 -1
- endoreg_db/serializers/requirements/requirement_sets.py +92 -22
- endoreg_db/serializers/video/segmentation.py +2 -1
- endoreg_db/serializers/video/video_processing_history.py +20 -5
- endoreg_db/serializers/video_examination.py +198 -0
- endoreg_db/services/anonymization.py +75 -73
- endoreg_db/services/lookup_service.py +256 -73
- endoreg_db/services/lookup_store.py +174 -30
- endoreg_db/services/pdf_import.py +711 -310
- endoreg_db/services/storage_aware_video_processor.py +140 -114
- endoreg_db/services/video_import.py +266 -117
- endoreg_db/urls/__init__.py +27 -27
- endoreg_db/urls/label_video_segments.py +2 -0
- endoreg_db/urls/media.py +108 -66
- endoreg_db/urls/root_urls.py +29 -0
- endoreg_db/utils/__init__.py +15 -5
- endoreg_db/utils/ai/multilabel_classification_net.py +116 -20
- endoreg_db/utils/case_generator/__init__.py +3 -0
- endoreg_db/utils/dataloader.py +88 -16
- endoreg_db/utils/defaults/set_default_center.py +32 -0
- endoreg_db/utils/names.py +22 -16
- endoreg_db/utils/permissions.py +2 -1
- endoreg_db/utils/pipelines/process_video_dir.py +1 -1
- endoreg_db/utils/requirement_operator_logic/model_evaluators.py +414 -127
- endoreg_db/utils/setup_config.py +8 -5
- endoreg_db/utils/storage.py +115 -0
- endoreg_db/utils/validate_endo_roi.py +8 -2
- endoreg_db/utils/video/ffmpeg_wrapper.py +184 -188
- endoreg_db/views/__init__.py +5 -12
- endoreg_db/views/anonymization/media_management.py +198 -163
- endoreg_db/views/anonymization/overview.py +4 -1
- endoreg_db/views/anonymization/validate.py +174 -40
- endoreg_db/views/media/__init__.py +2 -0
- endoreg_db/views/media/pdf_media.py +131 -150
- endoreg_db/views/media/sensitive_metadata.py +46 -6
- endoreg_db/views/media/video_media.py +89 -82
- endoreg_db/views/media/video_segments.py +187 -260
- endoreg_db/views/meta/sensitive_meta_detail.py +0 -63
- endoreg_db/views/patient/patient.py +5 -4
- endoreg_db/views/pdf/__init__.py +5 -8
- endoreg_db/views/pdf/pdf_stream.py +186 -0
- endoreg_db/views/pdf/pdf_stream_views.py +0 -127
- endoreg_db/views/pdf/reimport.py +86 -91
- endoreg_db/views/requirement/evaluate.py +188 -187
- endoreg_db/views/requirement/lookup.py +186 -288
- endoreg_db/views/requirement/requirement_utils.py +89 -0
- endoreg_db/views/video/__init__.py +0 -4
- endoreg_db/views/video/correction.py +2 -2
- endoreg_db/views/video/video_examination_viewset.py +202 -289
- {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.8.0.dist-info}/METADATA +7 -3
- {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.8.0.dist-info}/RECORD +350 -255
- endoreg_db/models/administration/permissions/__init__.py +0 -44
- endoreg_db/models/media/video/refactor_plan.md +0 -0
- endoreg_db/models/media/video/video_file_frames.py +0 -0
- endoreg_db/models/metadata/frame_ocr_result.py +0 -0
- endoreg_db/models/rule/__init__.py +0 -13
- endoreg_db/models/rule/rule.py +0 -27
- endoreg_db/models/rule/rule_applicator.py +0 -224
- endoreg_db/models/rule/rule_attribute_dtype.py +0 -17
- endoreg_db/models/rule/rule_type.py +0 -20
- endoreg_db/models/rule/ruleset.py +0 -17
- endoreg_db/serializers/video/video_metadata.py +0 -105
- endoreg_db/urls/report.py +0 -48
- endoreg_db/urls/video.py +0 -61
- endoreg_db/utils/case_generator/case_generator.py +0 -159
- endoreg_db/utils/case_generator/utils.py +0 -30
- endoreg_db/views/pdf/pdf_media.py +0 -239
- endoreg_db/views/report/__init__.py +0 -9
- endoreg_db/views/report/report_list.py +0 -112
- endoreg_db/views/report/report_with_secure_url.py +0 -28
- endoreg_db/views/report/start_examination.py +0 -7
- endoreg_db/views/video/video_media.py +0 -158
- endoreg_db/views.py +0 -0
- /endoreg_db/data/{requirement_set → _examples/requirement_set}/endoscopy_bleeding_risk.yaml +0 -0
- /endoreg_db/migrations/{0002_add_video_correction_models.py → _old/0002_add_video_correction_models.py} +0 -0
- /endoreg_db/migrations/{0003_add_center_display_name.py → _old/0003_add_center_display_name.py} +0 -0
- {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.8.0.dist-info}/WHEEL +0 -0
- {endoreg_db-0.8.4.4.dist-info → endoreg_db-0.8.8.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -2,24 +2,31 @@
|
|
|
2
2
|
PDF import service module.
|
|
3
3
|
|
|
4
4
|
Provides high-level functions for importing and anonymizing PDF files,
|
|
5
|
-
combining RawPdfFile creation with text extraction and anonymization.
|
|
5
|
+
combining RawPdfFile creation with text extraction and anonymization using lx anonymizer.
|
|
6
|
+
|
|
7
|
+
All Fields should be overwritten from anonymizer defaults except for the center which is given.
|
|
6
8
|
"""
|
|
7
|
-
|
|
9
|
+
|
|
8
10
|
import errno
|
|
11
|
+
import hashlib
|
|
9
12
|
import logging
|
|
13
|
+
import os
|
|
10
14
|
import shutil
|
|
11
15
|
import sys
|
|
12
|
-
import
|
|
13
|
-
import
|
|
16
|
+
import time
|
|
17
|
+
from contextlib import contextmanager
|
|
18
|
+
from datetime import date, datetime
|
|
14
19
|
from pathlib import Path
|
|
15
20
|
from typing import TYPE_CHECKING, Union
|
|
16
|
-
|
|
21
|
+
import subprocess
|
|
17
22
|
from django.db import transaction
|
|
23
|
+
from django.core.exceptions import ObjectDoesNotExist
|
|
24
|
+
import lx_anonymizer
|
|
25
|
+
|
|
26
|
+
from endoreg_db.models import SensitiveMeta
|
|
18
27
|
from endoreg_db.models.media.pdf.raw_pdf import RawPdfFile
|
|
19
28
|
from endoreg_db.models.state.raw_pdf import RawPdfState
|
|
20
|
-
from endoreg_db.models import SensitiveMeta
|
|
21
29
|
from endoreg_db.utils import paths as path_utils
|
|
22
|
-
import time
|
|
23
30
|
|
|
24
31
|
logger = logging.getLogger(__name__)
|
|
25
32
|
|
|
@@ -34,24 +41,76 @@ class PdfImportService:
|
|
|
34
41
|
"""
|
|
35
42
|
Service class for importing and processing PDF files with text extraction and anonymization.
|
|
36
43
|
Uses a central PDF instance pattern for cleaner state management.
|
|
44
|
+
|
|
45
|
+
Supports two processing modes:
|
|
46
|
+
- 'blackening': Simple PDF masking with black rectangles over sensitive areas
|
|
47
|
+
- 'cropping': Advanced mode that crops sensitive regions to separate images
|
|
37
48
|
"""
|
|
38
|
-
|
|
39
|
-
def __init__(
|
|
49
|
+
|
|
50
|
+
def __init__(
|
|
51
|
+
self, allow_meta_overwrite: bool = True, processing_mode: str = "blackening"
|
|
52
|
+
):
|
|
40
53
|
"""
|
|
41
54
|
Initialize the PDF import service.
|
|
42
|
-
|
|
55
|
+
|
|
43
56
|
Args:
|
|
44
57
|
allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
|
|
58
|
+
processing_mode: Processing mode - 'blackening' for simple masking, 'cropping' for advanced cropping
|
|
45
59
|
"""
|
|
46
60
|
self.processed_files = set()
|
|
47
61
|
self._report_reader_available = None
|
|
48
62
|
self._report_reader_class = None
|
|
49
63
|
self.allow_meta_overwrite = allow_meta_overwrite
|
|
50
|
-
|
|
64
|
+
|
|
65
|
+
# Validate and set processing mode
|
|
66
|
+
valid_modes = ["blackening", "cropping"]
|
|
67
|
+
if processing_mode not in valid_modes:
|
|
68
|
+
raise ValueError(
|
|
69
|
+
f"Invalid processing_mode '{processing_mode}'. Must be one of: {valid_modes}"
|
|
70
|
+
)
|
|
71
|
+
self.processing_mode = processing_mode
|
|
72
|
+
|
|
51
73
|
# Central PDF instance management
|
|
52
74
|
self.current_pdf = None
|
|
75
|
+
self.current_pdf_state = None
|
|
53
76
|
self.processing_context = {}
|
|
77
|
+
self.original_path = None
|
|
54
78
|
|
|
79
|
+
self.DEFAULT_PATIENT_FIRST_NAME = "Patient"
|
|
80
|
+
self.DEFAULT_PATIENT_LAST_NAME = "Unknown"
|
|
81
|
+
self.DEFAULT_PATIENT_DOB = date(1990, 1, 1)
|
|
82
|
+
self.DEFAULT_CENTER_NAME = "university_hospital_wuerzburg"
|
|
83
|
+
|
|
84
|
+
@classmethod
|
|
85
|
+
def with_blackening(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
|
|
86
|
+
"""
|
|
87
|
+
Create a PdfImportService configured for simple PDF blackening mode.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
PdfImportService instance configured for blackening mode
|
|
94
|
+
"""
|
|
95
|
+
return cls(
|
|
96
|
+
allow_meta_overwrite=allow_meta_overwrite, processing_mode="blackening"
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
@classmethod
|
|
100
|
+
def with_cropping(cls, allow_meta_overwrite: bool = False) -> "PdfImportService":
|
|
101
|
+
"""
|
|
102
|
+
Create a PdfImportService configured for advanced cropping mode.
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
allow_meta_overwrite: Whether to allow overwriting existing SensitiveMeta fields
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
PdfImportService instance configured for cropping mode
|
|
109
|
+
"""
|
|
110
|
+
return cls(
|
|
111
|
+
allow_meta_overwrite=allow_meta_overwrite, processing_mode="cropping"
|
|
112
|
+
)
|
|
113
|
+
|
|
55
114
|
@contextmanager
|
|
56
115
|
def _file_lock(self, path: Path):
|
|
57
116
|
"""Create a file lock to prevent duplicate processing.
|
|
@@ -77,15 +136,19 @@ class PdfImportService:
|
|
|
77
136
|
try:
|
|
78
137
|
logger.warning(
|
|
79
138
|
"Stale lock detected for %s (age %.0fs). Reclaiming lock...",
|
|
80
|
-
path,
|
|
139
|
+
path,
|
|
140
|
+
age,
|
|
81
141
|
)
|
|
82
142
|
lock_path.unlink()
|
|
83
143
|
except Exception as e:
|
|
84
|
-
logger.warning(
|
|
144
|
+
logger.warning(
|
|
145
|
+
"Failed to remove stale lock %s: %s", lock_path, e
|
|
146
|
+
)
|
|
85
147
|
# retry acquire
|
|
86
148
|
fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
|
|
87
149
|
else:
|
|
88
150
|
# Another worker is processing this file
|
|
151
|
+
|
|
89
152
|
raise ValueError(f"File already being processed: {path}")
|
|
90
153
|
|
|
91
154
|
os.write(fd, b"lock")
|
|
@@ -100,7 +163,7 @@ class PdfImportService:
|
|
|
100
163
|
lock_path.unlink()
|
|
101
164
|
except OSError:
|
|
102
165
|
pass
|
|
103
|
-
|
|
166
|
+
|
|
104
167
|
def _sha256(self, path: Path, chunk: int = 1024 * 1024) -> str:
|
|
105
168
|
"""Compute SHA256 hash of a file."""
|
|
106
169
|
h = hashlib.sha256()
|
|
@@ -134,7 +197,7 @@ class PdfImportService:
|
|
|
134
197
|
return Path(str(candidate))
|
|
135
198
|
except Exception:
|
|
136
199
|
return None
|
|
137
|
-
|
|
200
|
+
|
|
138
201
|
def _quarantine(self, source: Path) -> Path:
|
|
139
202
|
"""Move file to quarantine directory to prevent re-processing."""
|
|
140
203
|
qdir = path_utils.PDF_DIR / "_processing"
|
|
@@ -149,8 +212,12 @@ class PdfImportService:
|
|
|
149
212
|
shutil.move(str(source), str(target))
|
|
150
213
|
else:
|
|
151
214
|
raise
|
|
215
|
+
lock_path = Path(str(source) + ".lock")
|
|
216
|
+
if lock_path.exists():
|
|
217
|
+
lock_path.unlink()
|
|
218
|
+
|
|
152
219
|
return target
|
|
153
|
-
|
|
220
|
+
|
|
154
221
|
def _ensure_state(self, pdf_file: "RawPdfFile"):
|
|
155
222
|
"""Ensure PDF file has a state object."""
|
|
156
223
|
if getattr(pdf_file, "state", None):
|
|
@@ -158,147 +225,167 @@ class PdfImportService:
|
|
|
158
225
|
if hasattr(pdf_file, "get_or_create_state"):
|
|
159
226
|
state = pdf_file.get_or_create_state()
|
|
160
227
|
pdf_file.state = state
|
|
228
|
+
self.current_pdf_state = state
|
|
229
|
+
assert isinstance(self.current_pdf_state, RawPdfState)
|
|
161
230
|
return state
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
state, _ = pdf_file.get_or_create_state(raw_pdf_file=pdf_file)
|
|
165
|
-
pdf_file.state = state
|
|
166
|
-
return state
|
|
167
|
-
except Exception:
|
|
168
|
-
return None
|
|
169
|
-
|
|
231
|
+
|
|
232
|
+
|
|
170
233
|
def _ensure_report_reading_available(self):
|
|
171
234
|
"""
|
|
172
235
|
Ensure report reading modules are available by adding lx-anonymizer to path.
|
|
173
|
-
|
|
236
|
+
|
|
174
237
|
Returns:
|
|
175
238
|
Tuple of (availability_flag, ReportReader_class)
|
|
176
239
|
"""
|
|
177
240
|
if self._report_reader_available is not None:
|
|
178
241
|
return self._report_reader_available, self._report_reader_class
|
|
179
|
-
|
|
242
|
+
|
|
180
243
|
try:
|
|
181
244
|
# Try direct import first
|
|
182
245
|
from lx_anonymizer import ReportReader
|
|
183
|
-
|
|
246
|
+
|
|
184
247
|
logger.info("Successfully imported lx_anonymizer ReportReader module")
|
|
185
248
|
self._report_reader_available = True
|
|
186
249
|
self._report_reader_class = ReportReader
|
|
187
250
|
return True, ReportReader
|
|
188
|
-
|
|
251
|
+
|
|
189
252
|
except ImportError:
|
|
190
253
|
# Optional: honor LX_ANONYMIZER_PATH=/abs/path/to/src
|
|
191
254
|
import importlib
|
|
255
|
+
|
|
192
256
|
extra = os.getenv("LX_ANONYMIZER_PATH")
|
|
193
257
|
if extra and extra not in sys.path and Path(extra).exists():
|
|
194
258
|
sys.path.insert(0, extra)
|
|
195
259
|
try:
|
|
196
260
|
mod = importlib.import_module("lx_anonymizer")
|
|
197
261
|
ReportReader = getattr(mod, "ReportReader")
|
|
198
|
-
logger.info(
|
|
262
|
+
logger.info(
|
|
263
|
+
"Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH"
|
|
264
|
+
)
|
|
199
265
|
self._report_reader_available = True
|
|
200
266
|
self._report_reader_class = ReportReader
|
|
201
267
|
return True, ReportReader
|
|
202
268
|
except Exception as e:
|
|
203
|
-
logger.warning(
|
|
269
|
+
logger.warning(
|
|
270
|
+
"Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e
|
|
271
|
+
)
|
|
204
272
|
finally:
|
|
205
273
|
# Keep path for future imports if it worked; otherwise remove.
|
|
206
274
|
if "ReportReader" not in locals() and extra in sys.path:
|
|
207
275
|
sys.path.remove(extra)
|
|
208
|
-
|
|
276
|
+
|
|
209
277
|
self._report_reader_available = False
|
|
210
278
|
self._report_reader_class = None
|
|
211
279
|
return False, None
|
|
212
280
|
|
|
213
|
-
|
|
214
|
-
def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile" = None) -> None:
|
|
281
|
+
def _ensure_default_patient_data(self, pdf_instance: "RawPdfFile") -> None:
|
|
215
282
|
"""
|
|
216
283
|
Ensure PDF has minimum required patient data in SensitiveMeta.
|
|
217
284
|
Creates default values if data is missing after text processing.
|
|
218
285
|
Uses the central PDF instance if no specific instance provided.
|
|
219
|
-
|
|
286
|
+
|
|
220
287
|
Args:
|
|
221
288
|
pdf_instance: Optional specific PDF instance, defaults to self.current_pdf
|
|
222
289
|
"""
|
|
223
290
|
pdf_file = pdf_instance or self.current_pdf
|
|
224
291
|
if not pdf_file:
|
|
225
|
-
logger.warning(
|
|
292
|
+
logger.warning(
|
|
293
|
+
"No PDF instance available for ensuring default patient data"
|
|
294
|
+
)
|
|
226
295
|
return
|
|
227
|
-
|
|
296
|
+
|
|
228
297
|
if not pdf_file.sensitive_meta:
|
|
229
|
-
logger.info(
|
|
230
|
-
|
|
298
|
+
logger.info(
|
|
299
|
+
f"No SensitiveMeta found for PDF {pdf_file.pdf_hash}, creating default"
|
|
300
|
+
)
|
|
301
|
+
|
|
231
302
|
# Create default SensitiveMeta with placeholder data
|
|
232
303
|
default_data = {
|
|
233
|
-
"patient_first_name":
|
|
234
|
-
"patient_last_name":
|
|
235
|
-
"patient_dob":
|
|
236
|
-
"examination_date": date.today(),
|
|
237
|
-
"center_name":
|
|
304
|
+
"patient_first_name": self.DEFAULT_PATIENT_FIRST_NAME,
|
|
305
|
+
"patient_last_name": self.DEFAULT_PATIENT_LAST_NAME,
|
|
306
|
+
"patient_dob": self.DEFAULT_PATIENT_DOB,
|
|
307
|
+
"examination_date": date.today(), # today is intentionally *not* a constant
|
|
308
|
+
"center_name": (
|
|
309
|
+
pdf_file.center.name
|
|
310
|
+
if pdf_file.center
|
|
311
|
+
else self.DEFAULT_CENTER_NAME
|
|
312
|
+
),
|
|
238
313
|
}
|
|
239
|
-
|
|
314
|
+
|
|
315
|
+
|
|
240
316
|
try:
|
|
241
317
|
sensitive_meta = SensitiveMeta.create_from_dict(default_data)
|
|
242
318
|
pdf_file.sensitive_meta = sensitive_meta
|
|
243
|
-
pdf_file.save(update_fields=[
|
|
244
|
-
logger.info(
|
|
319
|
+
pdf_file.save(update_fields=["sensitive_meta"])
|
|
320
|
+
logger.info(
|
|
321
|
+
f"Created default SensitiveMeta for PDF {pdf_file.pdf_hash}"
|
|
322
|
+
)
|
|
245
323
|
except Exception as e:
|
|
246
|
-
logger.error(
|
|
324
|
+
logger.error(
|
|
325
|
+
f"Failed to create default SensitiveMeta for PDF {pdf_file.pdf_hash}: {e}"
|
|
326
|
+
)
|
|
247
327
|
|
|
248
328
|
def import_and_anonymize(
|
|
249
|
-
self,
|
|
250
|
-
file_path: Union[Path, str],
|
|
251
|
-
center_name: str,
|
|
329
|
+
self,
|
|
330
|
+
file_path: Union[Path, str],
|
|
331
|
+
center_name: str,
|
|
252
332
|
delete_source: bool = False,
|
|
253
333
|
retry: bool = False,
|
|
254
|
-
) -> "RawPdfFile":
|
|
334
|
+
) -> "RawPdfFile | None":
|
|
255
335
|
"""
|
|
256
336
|
Import a PDF file and anonymize it using ReportReader.
|
|
257
337
|
Uses centralized PDF instance management pattern.
|
|
258
|
-
|
|
338
|
+
|
|
339
|
+
The processing mode is determined by the service initialization:
|
|
340
|
+
- 'blackening': Creates an anonymized PDF with black rectangles over sensitive regions
|
|
341
|
+
- 'cropping': Advanced mode that crops sensitive regions to separate images
|
|
342
|
+
|
|
259
343
|
Args:
|
|
260
344
|
file_path: Path to the PDF file to import
|
|
261
345
|
center_name: Name of the center to associate with PDF
|
|
262
346
|
delete_source: Whether to delete the source file after import
|
|
263
347
|
retry: Whether this is a retry attempt
|
|
264
|
-
|
|
348
|
+
|
|
265
349
|
Returns:
|
|
266
350
|
RawPdfFile instance after import and processing
|
|
267
|
-
|
|
351
|
+
|
|
268
352
|
Raises:
|
|
269
353
|
Exception: On any failure during import or processing
|
|
270
354
|
"""
|
|
271
355
|
try:
|
|
272
356
|
# Initialize processing context
|
|
273
|
-
self._initialize_processing_context(
|
|
274
|
-
|
|
357
|
+
self._initialize_processing_context(
|
|
358
|
+
file_path, center_name, delete_source, retry
|
|
359
|
+
)
|
|
360
|
+
|
|
275
361
|
# Step 1: Validate and prepare file
|
|
276
362
|
self._validate_and_prepare_file()
|
|
277
|
-
|
|
363
|
+
|
|
278
364
|
# Step 2: Create or retrieve PDF instance
|
|
279
365
|
self._create_or_retrieve_pdf_instance()
|
|
280
|
-
|
|
366
|
+
|
|
281
367
|
# Early return check - if no PDF instance was created, return None
|
|
282
368
|
if not self.current_pdf:
|
|
283
|
-
logger.warning(
|
|
284
|
-
|
|
285
|
-
|
|
369
|
+
logger.warning(
|
|
370
|
+
f"No PDF instance created for {file_path}, returning None"
|
|
371
|
+
)
|
|
372
|
+
raise ObjectDoesNotExist
|
|
286
373
|
# Step 3: Setup processing environment
|
|
287
374
|
self._setup_processing_environment()
|
|
288
|
-
|
|
375
|
+
|
|
289
376
|
# Step 4: Process text and metadata
|
|
290
377
|
self._process_text_and_metadata()
|
|
291
|
-
|
|
378
|
+
|
|
292
379
|
# Step 5: Finalize processing
|
|
293
380
|
self._finalize_processing()
|
|
294
|
-
|
|
381
|
+
|
|
295
382
|
return self.current_pdf
|
|
296
|
-
|
|
383
|
+
|
|
297
384
|
except ValueError as e:
|
|
298
385
|
# Handle "File already being processed" case specifically
|
|
299
386
|
if "already being processed" in str(e):
|
|
300
387
|
logger.info(f"Skipping file {file_path}: {e}")
|
|
301
|
-
return
|
|
388
|
+
return
|
|
302
389
|
else:
|
|
303
390
|
logger.error(f"PDF import failed for {file_path}: {e}")
|
|
304
391
|
self._cleanup_on_error()
|
|
@@ -312,50 +399,58 @@ class PdfImportService:
|
|
|
312
399
|
# Always cleanup context
|
|
313
400
|
self._cleanup_processing_context()
|
|
314
401
|
|
|
315
|
-
def _initialize_processing_context(
|
|
316
|
-
|
|
402
|
+
def _initialize_processing_context(
|
|
403
|
+
self,
|
|
404
|
+
file_path: Union[Path, str],
|
|
405
|
+
center_name: str,
|
|
406
|
+
delete_source: bool,
|
|
407
|
+
retry: bool,
|
|
408
|
+
):
|
|
317
409
|
"""Initialize the processing context for the current PDF."""
|
|
318
410
|
self.processing_context = {
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
411
|
+
"file_path": Path(file_path),
|
|
412
|
+
"original_file_path": Path(file_path),
|
|
413
|
+
"center_name": center_name,
|
|
414
|
+
"delete_source": delete_source,
|
|
415
|
+
"retry": retry,
|
|
416
|
+
"file_hash": None,
|
|
417
|
+
"processing_started": False,
|
|
418
|
+
"text_extracted": False,
|
|
419
|
+
"metadata_processed": False,
|
|
420
|
+
"anonymization_completed": False,
|
|
329
421
|
}
|
|
330
|
-
|
|
422
|
+
self.original_path = Path(file_path)
|
|
423
|
+
|
|
331
424
|
# Check if already processed (only during current session to prevent race conditions)
|
|
332
425
|
if str(file_path) in self.processed_files:
|
|
333
|
-
logger.info(
|
|
426
|
+
logger.info(
|
|
427
|
+
f"File {file_path} already being processed in current session, skipping"
|
|
428
|
+
)
|
|
334
429
|
raise ValueError("File already being processed")
|
|
335
|
-
|
|
430
|
+
|
|
336
431
|
logger.info(f"Starting import and processing for: {file_path}")
|
|
337
432
|
|
|
338
433
|
def _validate_and_prepare_file(self):
|
|
339
434
|
"""Validate file existence and calculate hash."""
|
|
340
|
-
file_path = self.processing_context[
|
|
341
|
-
|
|
435
|
+
file_path = self.processing_context["file_path"]
|
|
436
|
+
|
|
342
437
|
if not file_path.exists():
|
|
343
438
|
raise FileNotFoundError(f"PDF file not found: {file_path}")
|
|
344
|
-
|
|
439
|
+
|
|
345
440
|
try:
|
|
346
|
-
self.processing_context[
|
|
441
|
+
self.processing_context["file_hash"] = self._sha256(file_path)
|
|
347
442
|
except Exception as e:
|
|
348
443
|
logger.warning(f"Could not calculate file hash: {e}")
|
|
349
|
-
self.processing_context[
|
|
444
|
+
self.processing_context["file_hash"] = None
|
|
350
445
|
|
|
351
446
|
def _create_or_retrieve_pdf_instance(self):
|
|
352
447
|
"""Create new or retrieve existing PDF instance."""
|
|
353
|
-
file_path = self.processing_context[
|
|
354
|
-
center_name = self.processing_context[
|
|
355
|
-
delete_source = self.processing_context[
|
|
356
|
-
retry = self.processing_context[
|
|
357
|
-
file_hash = self.processing_context[
|
|
358
|
-
|
|
448
|
+
file_path = self.processing_context["file_path"]
|
|
449
|
+
center_name = self.processing_context["center_name"]
|
|
450
|
+
delete_source = self.processing_context["delete_source"]
|
|
451
|
+
retry = self.processing_context["retry"]
|
|
452
|
+
file_hash = self.processing_context["file_hash"]
|
|
453
|
+
|
|
359
454
|
if not retry:
|
|
360
455
|
# Check for existing PDF and handle duplicates
|
|
361
456
|
with self._file_lock(file_path):
|
|
@@ -366,18 +461,20 @@ class PdfImportService:
|
|
|
366
461
|
if existing:
|
|
367
462
|
logger.info(f"Found existing RawPdfFile {existing.pdf_hash}")
|
|
368
463
|
if existing.text:
|
|
369
|
-
logger.info(
|
|
464
|
+
logger.info(
|
|
465
|
+
f"Existing PDF {existing.pdf_hash} already processed - returning"
|
|
466
|
+
)
|
|
370
467
|
self.current_pdf = existing
|
|
371
468
|
return
|
|
372
469
|
else:
|
|
373
470
|
# Retry processing
|
|
374
471
|
logger.info(f"Reprocessing existing PDF {existing.pdf_hash}")
|
|
375
472
|
return self._retry_existing_pdf(existing)
|
|
376
|
-
|
|
473
|
+
|
|
377
474
|
# Create new PDF instance
|
|
378
475
|
logger.info("Creating new RawPdfFile instance...")
|
|
379
476
|
from django.db import IntegrityError
|
|
380
|
-
|
|
477
|
+
|
|
381
478
|
try:
|
|
382
479
|
if not retry:
|
|
383
480
|
self.current_pdf = RawPdfFile.create_from_file_initialized(
|
|
@@ -388,18 +485,22 @@ class PdfImportService:
|
|
|
388
485
|
else:
|
|
389
486
|
# Retrieve existing for retry
|
|
390
487
|
self.current_pdf = RawPdfFile.objects.get(pdf_hash=file_hash)
|
|
391
|
-
logger.info(
|
|
392
|
-
|
|
488
|
+
logger.info(
|
|
489
|
+
f"Retrying import for existing RawPdfFile {self.current_pdf.pdf_hash}"
|
|
490
|
+
)
|
|
491
|
+
|
|
393
492
|
# Check if retry is actually needed
|
|
394
493
|
if self.current_pdf.text:
|
|
395
|
-
logger.info(
|
|
494
|
+
logger.info(
|
|
495
|
+
f"Existing PDF {self.current_pdf.pdf_hash} already processed during retry - returning"
|
|
496
|
+
)
|
|
396
497
|
return
|
|
397
|
-
|
|
498
|
+
|
|
398
499
|
if not self.current_pdf:
|
|
399
500
|
raise RuntimeError("Failed to create RawPdfFile instance")
|
|
400
|
-
|
|
501
|
+
|
|
401
502
|
logger.info(f"PDF instance ready: {self.current_pdf.pdf_hash}")
|
|
402
|
-
|
|
503
|
+
|
|
403
504
|
except IntegrityError:
|
|
404
505
|
# Race condition - another worker created it
|
|
405
506
|
if file_hash:
|
|
@@ -410,111 +511,198 @@ class PdfImportService:
|
|
|
410
511
|
|
|
411
512
|
def _setup_processing_environment(self):
|
|
412
513
|
"""Setup processing environment and state."""
|
|
413
|
-
original_path = self.processing_context.get(
|
|
414
|
-
|
|
514
|
+
original_path = self.processing_context.get("file_path")
|
|
515
|
+
if not original_path or not self.current_pdf:
|
|
516
|
+
try:
|
|
517
|
+
self.current_pdf = RawPdfFile.objects.get(pdf_hash=self.processing_context["file_hash"])
|
|
518
|
+
self.original_path = Path(str(self.current_pdf.file.path))
|
|
519
|
+
|
|
520
|
+
except RawPdfFile.DoesNotExist:
|
|
521
|
+
raise RuntimeError("Processing environment setup failed")
|
|
415
522
|
# Create sensitive file copy
|
|
523
|
+
if original_path is None or not isinstance(original_path, (str, Path)):
|
|
524
|
+
logger.error(f"No original path: {original_path!r}")
|
|
525
|
+
return
|
|
416
526
|
self.create_sensitive_file(self.current_pdf, original_path)
|
|
417
|
-
|
|
527
|
+
|
|
418
528
|
# Update file path to point to sensitive copy
|
|
419
|
-
self.processing_context[
|
|
420
|
-
self.processing_context[
|
|
529
|
+
self.processing_context["file_path"] = self.current_pdf.file.path
|
|
530
|
+
self.processing_context["sensitive_copy_created"] = True
|
|
421
531
|
try:
|
|
422
|
-
self.processing_context[
|
|
532
|
+
self.processing_context["sensitive_file_path"] = Path(
|
|
533
|
+
self.current_pdf.file.path
|
|
534
|
+
)
|
|
423
535
|
except Exception:
|
|
424
|
-
self.processing_context[
|
|
425
|
-
|
|
536
|
+
self.processing_context["sensitive_file_path"] = None
|
|
537
|
+
|
|
426
538
|
# Ensure state exists
|
|
427
539
|
state = self.current_pdf.get_or_create_state()
|
|
428
540
|
state.mark_processing_started()
|
|
429
|
-
self.processing_context[
|
|
430
|
-
|
|
541
|
+
self.processing_context["processing_started"] = True
|
|
542
|
+
|
|
431
543
|
# Mark as processed to prevent duplicates
|
|
432
|
-
self.processed_files.add(str(self.processing_context[
|
|
433
|
-
|
|
544
|
+
self.processed_files.add(str(self.processing_context["file_path"]))
|
|
545
|
+
|
|
434
546
|
# Ensure default patient data
|
|
435
547
|
logger.info("Ensuring default patient data...")
|
|
436
548
|
self._ensure_default_patient_data(self.current_pdf)
|
|
437
549
|
|
|
438
550
|
def _process_text_and_metadata(self):
|
|
439
551
|
"""Process text extraction and metadata using ReportReader."""
|
|
440
|
-
report_reading_available,
|
|
441
|
-
|
|
552
|
+
report_reading_available, ReportReaderCls = self._ensure_report_reading_available()
|
|
553
|
+
try:
|
|
554
|
+
assert ReportReaderCls is not None and report_reading_available
|
|
555
|
+
assert self.current_pdf is not None
|
|
556
|
+
except AssertionError as e:
|
|
557
|
+
logger.error(f"PDF Import failed on Error:{e} Ensure the pdf was passed correctly and report reading is available in function _process_text_and_metadata() ")
|
|
442
558
|
if not report_reading_available:
|
|
443
559
|
logger.warning("Report reading not available (lx_anonymizer not found)")
|
|
444
560
|
self._mark_processing_incomplete("no_report_reader")
|
|
445
|
-
return
|
|
446
|
-
|
|
561
|
+
return
|
|
562
|
+
assert self.current_pdf is not None
|
|
447
563
|
if not self.current_pdf.file:
|
|
448
564
|
logger.warning("No file available for text processing")
|
|
449
565
|
self._mark_processing_incomplete("no_file")
|
|
450
566
|
return
|
|
451
|
-
|
|
567
|
+
|
|
452
568
|
try:
|
|
453
|
-
logger.info(
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
anonymized_dir = path_utils.PDF_DIR / 'anonymized'
|
|
458
|
-
crops_dir.mkdir(parents=True, exist_ok=True)
|
|
459
|
-
anonymized_dir.mkdir(parents=True, exist_ok=True)
|
|
569
|
+
logger.info(
|
|
570
|
+
f"Starting text extraction and metadata processing with ReportReader (mode: {self.processing_mode})..."
|
|
571
|
+
)
|
|
572
|
+
ReportReaderCls = lx_anonymizer.ReportReader
|
|
460
573
|
|
|
461
574
|
# Initialize ReportReader
|
|
462
|
-
report_reader =
|
|
575
|
+
report_reader = ReportReaderCls(
|
|
463
576
|
report_root_path=str(path_utils.STORAGE_DIR),
|
|
464
577
|
locale="de_DE",
|
|
465
|
-
text_date_format="%d.%m.%Y"
|
|
578
|
+
text_date_format="%d.%m.%Y",
|
|
466
579
|
)
|
|
467
580
|
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
# Store results in context
|
|
477
|
-
self.processing_context.update({
|
|
478
|
-
'original_text': original_text,
|
|
479
|
-
'anonymized_text': anonymized_text,
|
|
480
|
-
'extracted_metadata': extracted_metadata,
|
|
481
|
-
'cropped_regions': cropped_regions,
|
|
482
|
-
'anonymized_pdf_path': anonymized_pdf_path
|
|
483
|
-
})
|
|
484
|
-
|
|
485
|
-
if original_text:
|
|
486
|
-
self._apply_text_results()
|
|
487
|
-
self.processing_context['text_extracted'] = True
|
|
488
|
-
|
|
489
|
-
if extracted_metadata:
|
|
490
|
-
self._apply_metadata_results()
|
|
491
|
-
self.processing_context['metadata_processed'] = True
|
|
492
|
-
|
|
493
|
-
if anonymized_pdf_path:
|
|
494
|
-
self._apply_anonymized_pdf()
|
|
495
|
-
self.processing_context['anonymization_completed'] = True
|
|
496
|
-
|
|
581
|
+
if self.processing_mode == "cropping":
|
|
582
|
+
# Use advanced cropping method (existing implementation)
|
|
583
|
+
self._process_with_cropping(report_reader)
|
|
584
|
+
else: # blackening mode
|
|
585
|
+
# Use enhanced process_report with PDF masking
|
|
586
|
+
self._process_with_blackening(report_reader)
|
|
587
|
+
|
|
497
588
|
except Exception as e:
|
|
498
589
|
logger.warning(f"Text processing failed: {e}")
|
|
499
590
|
self._mark_processing_incomplete("text_processing_failed")
|
|
500
591
|
|
|
592
|
+
def _process_with_blackening(self, report_reader):
|
|
593
|
+
"""Process PDF using simple blackening/masking mode."""
|
|
594
|
+
logger.info("Using simple PDF blackening mode...")
|
|
595
|
+
|
|
596
|
+
# Setup anonymized directory
|
|
597
|
+
anonymized_dir = path_utils.PDF_DIR / "anonymized"
|
|
598
|
+
anonymized_dir.mkdir(parents=True, exist_ok=True)
|
|
599
|
+
assert self.current_pdf is not None
|
|
600
|
+
# Generate output path for anonymized PDF
|
|
601
|
+
pdf_hash = self.current_pdf.pdf_hash
|
|
602
|
+
anonymized_output_path = anonymized_dir / f"{pdf_hash}_anonymized.pdf"
|
|
603
|
+
|
|
604
|
+
# Process with enhanced process_report method (returns 4-tuple now)
|
|
605
|
+
original_text, anonymized_text, extracted_metadata, anonymized_pdf_path = (
|
|
606
|
+
report_reader.process_report(
|
|
607
|
+
pdf_path=self.processing_context["file_path"],
|
|
608
|
+
create_anonymized_pdf=True,
|
|
609
|
+
anonymized_pdf_output_path=str(anonymized_output_path),
|
|
610
|
+
)
|
|
611
|
+
)
|
|
612
|
+
|
|
613
|
+
# Store results in context
|
|
614
|
+
self.processing_context.update(
|
|
615
|
+
{
|
|
616
|
+
"original_text": original_text,
|
|
617
|
+
"anonymized_text": anonymized_text,
|
|
618
|
+
"extracted_metadata": extracted_metadata,
|
|
619
|
+
"cropped_regions": None, # Not available in blackening mode
|
|
620
|
+
"anonymized_pdf_path": anonymized_pdf_path,
|
|
621
|
+
}
|
|
622
|
+
)
|
|
623
|
+
|
|
624
|
+
# Apply results
|
|
625
|
+
if original_text:
|
|
626
|
+
self._apply_text_results()
|
|
627
|
+
self.processing_context["text_extracted"] = True
|
|
628
|
+
|
|
629
|
+
if extracted_metadata:
|
|
630
|
+
self._apply_metadata_results()
|
|
631
|
+
self.processing_context["metadata_processed"] = True
|
|
632
|
+
|
|
633
|
+
if anonymized_pdf_path:
|
|
634
|
+
self._apply_anonymized_pdf()
|
|
635
|
+
self.processing_context["anonymization_completed"] = True
|
|
636
|
+
|
|
637
|
+
logger.info("PDF blackening processing completed")
|
|
638
|
+
|
|
639
|
+
def _process_with_cropping(self, report_reader):
|
|
640
|
+
"""Process PDF using advanced cropping mode (existing implementation)."""
|
|
641
|
+
logger.info("Using advanced cropping mode...")
|
|
642
|
+
|
|
643
|
+
# Setup output directories
|
|
644
|
+
crops_dir = path_utils.PDF_DIR / "cropped_regions"
|
|
645
|
+
anonymized_dir = path_utils.PDF_DIR / "anonymized"
|
|
646
|
+
crops_dir.mkdir(parents=True, exist_ok=True)
|
|
647
|
+
anonymized_dir.mkdir(parents=True, exist_ok=True)
|
|
648
|
+
|
|
649
|
+
# Process with cropping (returns 5-tuple)
|
|
650
|
+
(
|
|
651
|
+
original_text,
|
|
652
|
+
anonymized_text,
|
|
653
|
+
extracted_metadata,
|
|
654
|
+
cropped_regions,
|
|
655
|
+
anonymized_pdf_path,
|
|
656
|
+
) = report_reader.process_report_with_cropping(
|
|
657
|
+
pdf_path=self.processing_context["file_path"],
|
|
658
|
+
crop_sensitive_regions=True,
|
|
659
|
+
crop_output_dir=str(crops_dir),
|
|
660
|
+
anonymization_output_dir=str(anonymized_dir),
|
|
661
|
+
)
|
|
662
|
+
|
|
663
|
+
# Store results in context
|
|
664
|
+
self.processing_context.update(
|
|
665
|
+
{
|
|
666
|
+
"original_text": original_text,
|
|
667
|
+
"anonymized_text": anonymized_text,
|
|
668
|
+
"extracted_metadata": extracted_metadata,
|
|
669
|
+
"cropped_regions": cropped_regions,
|
|
670
|
+
"anonymized_pdf_path": anonymized_pdf_path,
|
|
671
|
+
}
|
|
672
|
+
)
|
|
673
|
+
|
|
674
|
+
# Apply results
|
|
675
|
+
if original_text:
|
|
676
|
+
self._apply_text_results()
|
|
677
|
+
self.processing_context["text_extracted"] = True
|
|
678
|
+
|
|
679
|
+
if extracted_metadata:
|
|
680
|
+
self._apply_metadata_results()
|
|
681
|
+
self.processing_context["metadata_processed"] = True
|
|
682
|
+
|
|
683
|
+
if anonymized_pdf_path:
|
|
684
|
+
self._apply_anonymized_pdf()
|
|
685
|
+
self.processing_context["anonymization_completed"] = True
|
|
686
|
+
|
|
687
|
+
logger.info("PDF cropping processing completed")
|
|
688
|
+
|
|
501
689
|
def _apply_text_results(self):
|
|
502
690
|
"""Apply text extraction results to the PDF instance."""
|
|
503
691
|
if not self.current_pdf:
|
|
504
692
|
logger.warning("Cannot apply text results - no PDF instance available")
|
|
505
693
|
return
|
|
506
|
-
|
|
507
|
-
original_text = self.processing_context.get(
|
|
508
|
-
anonymized_text = self.processing_context.get(
|
|
509
|
-
|
|
694
|
+
|
|
695
|
+
original_text = self.processing_context.get("original_text")
|
|
696
|
+
anonymized_text = self.processing_context.get("anonymized_text")
|
|
697
|
+
|
|
510
698
|
if not original_text:
|
|
511
699
|
logger.warning("No original text available to apply")
|
|
512
700
|
return
|
|
513
|
-
|
|
701
|
+
|
|
514
702
|
# Store extracted text
|
|
515
703
|
self.current_pdf.text = original_text
|
|
516
704
|
logger.info(f"Extracted {len(original_text)} characters of text from PDF")
|
|
517
|
-
|
|
705
|
+
|
|
518
706
|
# Handle anonymized text
|
|
519
707
|
if anonymized_text and anonymized_text != original_text:
|
|
520
708
|
self.current_pdf.anonymized = True
|
|
@@ -525,56 +713,57 @@ class PdfImportService:
|
|
|
525
713
|
if not self.current_pdf:
|
|
526
714
|
logger.warning("Cannot apply metadata results - no PDF instance available")
|
|
527
715
|
return
|
|
528
|
-
|
|
529
|
-
extracted_metadata = self.processing_context.get(
|
|
530
|
-
|
|
716
|
+
|
|
717
|
+
extracted_metadata = self.processing_context.get("extracted_metadata")
|
|
718
|
+
|
|
531
719
|
if not self.current_pdf.sensitive_meta or not extracted_metadata:
|
|
532
720
|
logger.debug("No sensitive meta or extracted metadata available")
|
|
533
721
|
return
|
|
534
|
-
|
|
722
|
+
|
|
535
723
|
sm = self.current_pdf.sensitive_meta
|
|
536
|
-
|
|
724
|
+
|
|
537
725
|
# Map ReportReader metadata to SensitiveMeta fields
|
|
538
726
|
metadata_mapping = {
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
727
|
+
"patient_first_name": "patient_first_name",
|
|
728
|
+
"patient_last_name": "patient_last_name",
|
|
729
|
+
"patient_dob": "patient_dob",
|
|
730
|
+
"examination_date": "examination_date",
|
|
731
|
+
"examiner_first_name": "examiner_first_name",
|
|
732
|
+
"examiner_last_name": "examiner_last_name",
|
|
733
|
+
"endoscope_type": "endoscope_type",
|
|
734
|
+
"casenumber": "casenumber",
|
|
735
|
+
"center_name": "center_name",
|
|
547
736
|
}
|
|
548
|
-
|
|
737
|
+
|
|
549
738
|
# Update fields with extracted information
|
|
550
739
|
updated_fields = []
|
|
551
740
|
for meta_key, sm_field in metadata_mapping.items():
|
|
552
741
|
if extracted_metadata.get(meta_key) and hasattr(sm, sm_field):
|
|
553
742
|
old_value = getattr(sm, sm_field)
|
|
554
743
|
raw_value = extracted_metadata[meta_key]
|
|
555
|
-
|
|
744
|
+
|
|
556
745
|
# Skip if we just got the field name as a string (indicates no actual data)
|
|
557
746
|
if isinstance(raw_value, str) and raw_value == meta_key:
|
|
558
747
|
continue
|
|
559
|
-
|
|
748
|
+
|
|
560
749
|
# Handle date fields specially
|
|
561
|
-
if sm_field in [
|
|
750
|
+
if sm_field in ["patient_dob", "examination_date"]:
|
|
562
751
|
new_value = self._parse_date_field(raw_value, meta_key, sm_field)
|
|
563
752
|
if new_value is None:
|
|
564
753
|
continue
|
|
565
754
|
else:
|
|
566
755
|
new_value = raw_value
|
|
567
|
-
|
|
756
|
+
|
|
568
757
|
# Configurable overwrite policy
|
|
569
758
|
should_overwrite = (
|
|
570
759
|
self.allow_meta_overwrite
|
|
571
|
-
or
|
|
572
|
-
or old_value in ['Patient', 'Unknown']
|
|
760
|
+
or self._is_placeholder_value(sm_field, old_value)
|
|
573
761
|
)
|
|
762
|
+
|
|
574
763
|
if new_value and should_overwrite:
|
|
575
764
|
setattr(sm, sm_field, new_value)
|
|
576
765
|
updated_fields.append(sm_field)
|
|
577
|
-
|
|
766
|
+
|
|
578
767
|
if updated_fields:
|
|
579
768
|
sm.save()
|
|
580
769
|
logger.info(f"Updated SensitiveMeta fields: {updated_fields}")
|
|
@@ -587,26 +776,29 @@ class PdfImportService:
|
|
|
587
776
|
if raw_value == meta_key:
|
|
588
777
|
logger.warning(
|
|
589
778
|
"Skipping date field %s - got field name '%s' instead of actual date",
|
|
590
|
-
sm_field,
|
|
779
|
+
sm_field,
|
|
780
|
+
raw_value,
|
|
591
781
|
)
|
|
592
782
|
return None
|
|
593
|
-
|
|
783
|
+
|
|
594
784
|
# Try common date formats
|
|
595
|
-
date_formats = [
|
|
785
|
+
date_formats = ["%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y"]
|
|
596
786
|
for fmt in date_formats:
|
|
597
787
|
try:
|
|
598
788
|
return datetime.strptime(raw_value, fmt).date()
|
|
599
789
|
except ValueError:
|
|
600
790
|
continue
|
|
601
|
-
|
|
602
|
-
logger.warning(
|
|
791
|
+
|
|
792
|
+
logger.warning(
|
|
793
|
+
"Could not parse date '%s' for field %s", raw_value, sm_field
|
|
794
|
+
)
|
|
603
795
|
return None
|
|
604
|
-
|
|
605
|
-
elif hasattr(raw_value,
|
|
796
|
+
|
|
797
|
+
elif hasattr(raw_value, "date"):
|
|
606
798
|
return raw_value.date()
|
|
607
799
|
else:
|
|
608
800
|
return raw_value
|
|
609
|
-
|
|
801
|
+
|
|
610
802
|
except (ValueError, AttributeError) as e:
|
|
611
803
|
logger.warning("Date parsing failed for %s: %s", sm_field, e)
|
|
612
804
|
return None
|
|
@@ -626,14 +818,17 @@ class PdfImportService:
|
|
|
626
818
|
logger.warning("Cannot apply anonymized PDF - no PDF instance available")
|
|
627
819
|
return
|
|
628
820
|
|
|
629
|
-
anonymized_pdf_path = self.processing_context.get(
|
|
821
|
+
anonymized_pdf_path = self.processing_context.get("anonymized_pdf_path")
|
|
630
822
|
if not anonymized_pdf_path:
|
|
631
823
|
logger.debug("No anonymized_pdf_path present in processing context")
|
|
632
824
|
return
|
|
633
825
|
|
|
634
826
|
anonymized_path = Path(anonymized_pdf_path)
|
|
635
827
|
if not anonymized_path.exists():
|
|
636
|
-
logger.warning(
|
|
828
|
+
logger.warning(
|
|
829
|
+
"Anonymized PDF path returned but file does not exist: %s",
|
|
830
|
+
anonymized_path,
|
|
831
|
+
)
|
|
637
832
|
return
|
|
638
833
|
|
|
639
834
|
logger.info("Anonymized PDF created by ReportReader at: %s", anonymized_path)
|
|
@@ -647,7 +842,7 @@ class PdfImportService:
|
|
|
647
842
|
relative_name = str(anonymized_path)
|
|
648
843
|
|
|
649
844
|
# Only update if something actually changed
|
|
650
|
-
if getattr(self.current_pdf.anonymized_file,
|
|
845
|
+
if getattr(self.current_pdf.anonymized_file, "name", None) != relative_name:
|
|
651
846
|
self.current_pdf.anonymized_file.name = relative_name
|
|
652
847
|
|
|
653
848
|
# Ensure model/state reflect anonymization even if text didn't differ
|
|
@@ -656,46 +851,59 @@ class PdfImportService:
|
|
|
656
851
|
|
|
657
852
|
# Persist cropped regions info somewhere useful (optional & non-breaking)
|
|
658
853
|
# If your model has a field for this, persist there; otherwise we just log.
|
|
659
|
-
cropped_regions = self.processing_context.get(
|
|
854
|
+
cropped_regions = self.processing_context.get("cropped_regions")
|
|
660
855
|
if cropped_regions:
|
|
661
|
-
logger.debug(
|
|
856
|
+
logger.debug(
|
|
857
|
+
"Cropped regions recorded (%d regions).", len(cropped_regions)
|
|
858
|
+
)
|
|
662
859
|
|
|
663
860
|
# Save model changes
|
|
664
|
-
update_fields = [
|
|
665
|
-
if
|
|
666
|
-
update_fields.append(
|
|
861
|
+
update_fields = ["anonymized_file"]
|
|
862
|
+
if "anonymized" in self.current_pdf.__dict__:
|
|
863
|
+
update_fields.append("anonymized")
|
|
667
864
|
self.current_pdf.save(update_fields=update_fields)
|
|
668
865
|
|
|
669
866
|
# Mark state as anonymized immediately; this keeps downstream flows working
|
|
670
867
|
state = self._ensure_state(self.current_pdf)
|
|
671
|
-
|
|
672
|
-
|
|
868
|
+
|
|
869
|
+
if state and not state.processing_started:
|
|
870
|
+
state.mark_processing_started()
|
|
673
871
|
|
|
674
|
-
logger.info(
|
|
872
|
+
logger.info(
|
|
873
|
+
"Updated anonymized_file reference to: %s",
|
|
874
|
+
self.current_pdf.anonymized_file.name,
|
|
875
|
+
)
|
|
675
876
|
|
|
676
877
|
except Exception as e:
|
|
677
878
|
logger.warning("Could not set anonymized file reference: %s", e)
|
|
678
879
|
|
|
679
|
-
|
|
680
880
|
def _finalize_processing(self):
|
|
681
881
|
"""Finalize processing and update state."""
|
|
682
882
|
if not self.current_pdf:
|
|
683
883
|
logger.warning("Cannot finalize processing - no PDF instance available")
|
|
684
884
|
return
|
|
685
|
-
|
|
885
|
+
|
|
686
886
|
try:
|
|
687
887
|
# Update state based on processing results
|
|
688
888
|
state = self._ensure_state(self.current_pdf)
|
|
689
|
-
|
|
690
|
-
if self.processing_context.get(
|
|
889
|
+
|
|
890
|
+
if self.processing_context.get("text_extracted") and state:
|
|
691
891
|
state.mark_anonymized()
|
|
692
|
-
|
|
892
|
+
|
|
893
|
+
# Mark as ready for validation after successful anonymization
|
|
894
|
+
if self.processing_context.get("anonymization_completed") and state:
|
|
895
|
+
state.mark_sensitive_meta_processed()
|
|
896
|
+
logger.info(
|
|
897
|
+
f"PDF {self.current_pdf.pdf_hash} processing completed - "
|
|
898
|
+
f"ready for validation (status: {state.anonymization_status})"
|
|
899
|
+
)
|
|
900
|
+
|
|
693
901
|
# Save all changes
|
|
694
902
|
with transaction.atomic():
|
|
695
903
|
self.current_pdf.save()
|
|
696
904
|
if state:
|
|
697
905
|
state.save()
|
|
698
|
-
|
|
906
|
+
|
|
699
907
|
logger.info("PDF processing completed successfully")
|
|
700
908
|
except Exception as e:
|
|
701
909
|
logger.warning(f"Failed to finalize processing: {e}")
|
|
@@ -703,9 +911,11 @@ class PdfImportService:
|
|
|
703
911
|
def _mark_processing_incomplete(self, reason: str):
|
|
704
912
|
"""Mark processing as incomplete with reason."""
|
|
705
913
|
if not self.current_pdf:
|
|
706
|
-
logger.warning(
|
|
914
|
+
logger.warning(
|
|
915
|
+
f"Cannot mark processing incomplete - no PDF instance available. Reason: {reason}"
|
|
916
|
+
)
|
|
707
917
|
return
|
|
708
|
-
|
|
918
|
+
|
|
709
919
|
try:
|
|
710
920
|
state = self._ensure_state(self.current_pdf)
|
|
711
921
|
if state:
|
|
@@ -714,7 +924,7 @@ class PdfImportService:
|
|
|
714
924
|
state.sensitive_meta_processed = False
|
|
715
925
|
state.save()
|
|
716
926
|
logger.info(f"Set PDF state: processed=False due to {reason}")
|
|
717
|
-
|
|
927
|
+
|
|
718
928
|
# Save changes
|
|
719
929
|
with transaction.atomic():
|
|
720
930
|
self.current_pdf.save()
|
|
@@ -722,41 +932,114 @@ class PdfImportService:
|
|
|
722
932
|
logger.warning(f"Failed to mark processing incomplete: {e}")
|
|
723
933
|
|
|
724
934
|
def _retry_existing_pdf(self, existing_pdf):
    """
    Re-run import and anonymization for a PDF that already has a record.

    The source is taken from ``existing_pdf.get_raw_file_path()`` rather
    than from the file field, which may point to a deleted sensitive file.

    When the raw file is missing, or when the re-import raises, the error
    is logged, ``self.current_pdf`` is set to ``existing_pdf`` and
    ``existing_pdf`` is returned. Otherwise the result of
    ``self.import_and_anonymize(...)`` is returned.
    """
    try:
        source = existing_pdf.get_raw_file_path()

        if not (source and source.exists()):
            logger.error(
                f"Cannot retry PDF {existing_pdf.pdf_hash}: Raw file not found. "
                f"Please re-upload the original PDF file."
            )
            self.current_pdf = existing_pdf
            return existing_pdf

        logger.info(f"Found raw file for retry at: {source}")

        # Forget that this path was handled already so the retry is allowed.
        file_path_str = str(source)
        if file_path_str in self.processed_files:
            self.processed_files.remove(file_path_str)
            logger.debug(f"Removed {file_path_str} from processed files for retry")

        if existing_pdf.center:
            center = existing_pdf.center.name
        else:
            center = "unknown_center"

        # The source is never deleted on a retry.
        return self.import_and_anonymize(
            file_path=source,
            center_name=center,
            delete_source=False,
            retry=True,
        )
    except Exception as e:
        logger.error(
            f"Failed to re-import existing PDF {existing_pdf.pdf_hash}: {e}"
        )
        self.current_pdf = existing_pdf
        return existing_pdf
|
|
743
975
|
|
|
744
976
|
def _cleanup_on_error(self):
|
|
745
977
|
"""Cleanup processing context on error."""
|
|
978
|
+
original_path = self.original_path
|
|
746
979
|
try:
|
|
747
|
-
if self.current_pdf and hasattr(self.current_pdf,
|
|
980
|
+
if self.current_pdf and hasattr(self.current_pdf, "state"):
|
|
748
981
|
state = self._ensure_state(self.current_pdf)
|
|
749
|
-
|
|
982
|
+
raw_file_path = self.current_pdf.get_raw_file_path()
|
|
983
|
+
if raw_file_path is not None and original_path is not None:
|
|
984
|
+
# Ensure reprocessing for next attempt by restoring original file
|
|
985
|
+
shutil.copy2(str(raw_file_path), str(original_path))
|
|
986
|
+
|
|
987
|
+
# Ensure no two files can remain
|
|
988
|
+
if raw_file_path == original_path and raw_file_path is not None and original_path is not None:
|
|
989
|
+
os.remove(str(raw_file_path))
|
|
990
|
+
|
|
991
|
+
|
|
992
|
+
# Remove Lock file also
|
|
993
|
+
lock_path = Path(str(path_utils.PDF_DIR) + ".lock")
|
|
994
|
+
try:
|
|
995
|
+
if lock_path.exists():
|
|
996
|
+
lock_path.unlink()
|
|
997
|
+
logger.info("Removed lock file during quarantine: %s", lock_path)
|
|
998
|
+
except Exception as e:
|
|
999
|
+
logger.warning("Could not remove lock file during quarantine: %s", e)
|
|
1000
|
+
|
|
1001
|
+
|
|
1002
|
+
if state and self.processing_context.get("processing_started"):
|
|
750
1003
|
state.text_meta_extracted = False
|
|
751
1004
|
state.pdf_meta_extracted = False
|
|
752
1005
|
state.sensitive_meta_processed = False
|
|
1006
|
+
state.anonymized = False
|
|
753
1007
|
state.save()
|
|
754
1008
|
logger.debug("Updated PDF state to indicate processing failure")
|
|
1009
|
+
else:
|
|
1010
|
+
# 🔧 Early failure: no current_pdf (or no state).
|
|
1011
|
+
# In this case we want to make sure we don't leave stray files
|
|
1012
|
+
# under PDF_DIR or PDF_DIR/sensitive.
|
|
1013
|
+
|
|
1014
|
+
pdf_dir = self._get_pdf_dir()
|
|
1015
|
+
if pdf_dir and pdf_dir.exists():
|
|
1016
|
+
for candidate_dir in (pdf_dir, pdf_dir / "sensitive"):
|
|
1017
|
+
if candidate_dir.exists():
|
|
1018
|
+
for candidate in candidate_dir.glob("*.pdf"):
|
|
1019
|
+
# Don't delete the original ingress file
|
|
1020
|
+
if (
|
|
1021
|
+
original_path is not None
|
|
1022
|
+
and candidate.resolve() == Path(original_path).resolve()
|
|
1023
|
+
):
|
|
1024
|
+
continue
|
|
1025
|
+
try:
|
|
1026
|
+
candidate.unlink()
|
|
1027
|
+
logger.debug(
|
|
1028
|
+
"Removed stray PDF during early error cleanup: %s",
|
|
1029
|
+
candidate,
|
|
1030
|
+
)
|
|
1031
|
+
except Exception as e:
|
|
1032
|
+
logger.warning(
|
|
1033
|
+
"Failed to remove stray PDF %s: %s",
|
|
1034
|
+
candidate,
|
|
1035
|
+
e,
|
|
1036
|
+
)
|
|
1037
|
+
|
|
755
1038
|
except Exception as e:
|
|
756
1039
|
logger.warning(f"Error during cleanup: {e}")
|
|
757
1040
|
finally:
|
|
758
1041
|
# Remove any sensitive copy created during this processing run
|
|
759
|
-
sensitive_created = self.processing_context.get(
|
|
1042
|
+
sensitive_created = self.processing_context.get("sensitive_copy_created")
|
|
760
1043
|
if sensitive_created:
|
|
761
1044
|
pdf_obj = self.current_pdf
|
|
762
1045
|
try:
|
|
@@ -765,30 +1048,48 @@ class PdfImportService:
|
|
|
765
1048
|
if file_field and getattr(file_field, "name", None):
|
|
766
1049
|
storage_name = file_field.name
|
|
767
1050
|
file_field.delete(save=False)
|
|
768
|
-
logger.debug(
|
|
1051
|
+
logger.debug(
|
|
1052
|
+
"Deleted sensitive copy %s during error cleanup",
|
|
1053
|
+
storage_name,
|
|
1054
|
+
)
|
|
769
1055
|
except Exception as cleanup_exc:
|
|
770
|
-
logger.warning(
|
|
1056
|
+
logger.warning(
|
|
1057
|
+
"Failed to remove sensitive copy during error cleanup: %s",
|
|
1058
|
+
cleanup_exc,
|
|
1059
|
+
)
|
|
1060
|
+
pdf_dir = self._get_pdf_dir()
|
|
1061
|
+
if original_path and pdf_dir:
|
|
1062
|
+
# Try to remove any extra file that was created during import
|
|
1063
|
+
# Simplest heuristic: same basename as original, but in pdf dir or pdf/sensitive dir
|
|
1064
|
+
for candidate_dir in (pdf_dir, pdf_dir / "sensitive"):
|
|
1065
|
+
candidate = candidate_dir / original_path.name
|
|
1066
|
+
if candidate.exists() and candidate != original_path:
|
|
1067
|
+
try:
|
|
1068
|
+
candidate.unlink()
|
|
1069
|
+
logger.debug(
|
|
1070
|
+
"Removed stray PDF copy during early error cleanup: %s",
|
|
1071
|
+
candidate,
|
|
1072
|
+
)
|
|
1073
|
+
except Exception as e:
|
|
1074
|
+
logger.warning(
|
|
1075
|
+
"Failed to remove stray PDF copy %s: %s",
|
|
1076
|
+
candidate,
|
|
1077
|
+
e,
|
|
1078
|
+
)
|
|
771
1079
|
|
|
772
1080
|
# Always clean up processed files set to prevent blocks
|
|
773
|
-
file_path = self.processing_context.get(
|
|
1081
|
+
file_path = self.processing_context.get("file_path")
|
|
774
1082
|
if file_path and str(file_path) in self.processed_files:
|
|
775
1083
|
self.processed_files.remove(str(file_path))
|
|
776
|
-
logger.debug(
|
|
1084
|
+
logger.debug(
|
|
1085
|
+
f"Removed {file_path} from processed files during error cleanup"
|
|
1086
|
+
)
|
|
777
1087
|
|
|
778
1088
|
try:
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
isinstance(original_path, Path)
|
|
784
|
-
and original_path.exists()
|
|
785
|
-
and not self.processing_context.get('sensitive_copy_created')
|
|
786
|
-
):
|
|
787
|
-
try:
|
|
788
|
-
original_path.unlink()
|
|
789
|
-
logger.info("Removed original file %s during error cleanup", original_path)
|
|
790
|
-
except Exception as remove_exc:
|
|
791
|
-
logger.warning("Could not remove original file %s during error cleanup: %s", original_path, remove_exc)
|
|
1089
|
+
raw_dir = (
|
|
1090
|
+
original_path.parent if isinstance(original_path, Path) else None
|
|
1091
|
+
)
|
|
1092
|
+
|
|
792
1093
|
pdf_dir = self._get_pdf_dir()
|
|
793
1094
|
if not pdf_dir and raw_dir:
|
|
794
1095
|
base_dir = raw_dir.parent
|
|
@@ -805,7 +1106,12 @@ class PdfImportService:
|
|
|
805
1106
|
|
|
806
1107
|
# Remove empty PDF subdirectories that might have been created during setup
|
|
807
1108
|
if pdf_dir and pdf_dir.exists():
|
|
808
|
-
for subdir_name in (
|
|
1109
|
+
for subdir_name in (
|
|
1110
|
+
"sensitive",
|
|
1111
|
+
"cropped_regions",
|
|
1112
|
+
"anonymized",
|
|
1113
|
+
"_processing",
|
|
1114
|
+
):
|
|
809
1115
|
subdir_path = pdf_dir / subdir_name
|
|
810
1116
|
if subdir_path.exists() and subdir_path.is_dir():
|
|
811
1117
|
try:
|
|
@@ -813,22 +1119,49 @@ class PdfImportService:
|
|
|
813
1119
|
except StopIteration:
|
|
814
1120
|
try:
|
|
815
1121
|
subdir_path.rmdir()
|
|
816
|
-
logger.debug(
|
|
1122
|
+
logger.debug(
|
|
1123
|
+
"Removed empty directory %s during error cleanup",
|
|
1124
|
+
subdir_path,
|
|
1125
|
+
)
|
|
817
1126
|
except OSError as rm_err:
|
|
818
|
-
logger.debug(
|
|
1127
|
+
logger.debug(
|
|
1128
|
+
"Could not remove directory %s: %s",
|
|
1129
|
+
subdir_path,
|
|
1130
|
+
rm_err,
|
|
1131
|
+
)
|
|
819
1132
|
except Exception as iter_err:
|
|
820
|
-
logger.debug(
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
1133
|
+
logger.debug(
|
|
1134
|
+
"Could not inspect directory %s: %s",
|
|
1135
|
+
subdir_path,
|
|
1136
|
+
iter_err,
|
|
1137
|
+
)
|
|
1138
|
+
|
|
1139
|
+
raw_count = (
|
|
1140
|
+
len(list(raw_dir.glob("*")))
|
|
1141
|
+
if raw_dir and raw_dir.exists()
|
|
1142
|
+
else None
|
|
1143
|
+
)
|
|
1144
|
+
pdf_count = (
|
|
1145
|
+
len(list(pdf_dir.glob("*")))
|
|
1146
|
+
if pdf_dir and pdf_dir.exists()
|
|
1147
|
+
else None
|
|
1148
|
+
)
|
|
824
1149
|
|
|
825
|
-
sensitive_path = self.processing_context.get(
|
|
1150
|
+
sensitive_path = self.processing_context.get("sensitive_file_path")
|
|
826
1151
|
if sensitive_path:
|
|
827
1152
|
sensitive_parent = Path(sensitive_path).parent
|
|
828
|
-
sensitive_count =
|
|
1153
|
+
sensitive_count = (
|
|
1154
|
+
len(list(sensitive_parent.glob("*")))
|
|
1155
|
+
if sensitive_parent.exists()
|
|
1156
|
+
else None
|
|
1157
|
+
)
|
|
829
1158
|
else:
|
|
830
1159
|
sensitive_dir = pdf_dir / "sensitive" if pdf_dir else None
|
|
831
|
-
sensitive_count =
|
|
1160
|
+
sensitive_count = (
|
|
1161
|
+
len(list(sensitive_dir.glob("*")))
|
|
1162
|
+
if sensitive_dir and sensitive_dir.exists()
|
|
1163
|
+
else None
|
|
1164
|
+
)
|
|
832
1165
|
|
|
833
1166
|
logger.info(
|
|
834
1167
|
"PDF import error cleanup counts - raw: %s, pdf: %s, sensitive: %s",
|
|
@@ -843,17 +1176,17 @@ class PdfImportService:
|
|
|
843
1176
|
"""Cleanup processing context."""
|
|
844
1177
|
try:
|
|
845
1178
|
# Clean up temporary directories
|
|
846
|
-
if self.processing_context.get(
|
|
847
|
-
crops_dir = path_utils.PDF_DIR /
|
|
1179
|
+
if self.processing_context.get("text_extracted"):
|
|
1180
|
+
crops_dir = path_utils.PDF_DIR / "cropped_regions"
|
|
848
1181
|
if crops_dir.exists() and not any(crops_dir.iterdir()):
|
|
849
1182
|
crops_dir.rmdir()
|
|
850
|
-
|
|
1183
|
+
|
|
851
1184
|
# Always remove from processed files set after processing attempt
|
|
852
|
-
file_path = self.processing_context.get(
|
|
1185
|
+
file_path = self.processing_context.get("file_path")
|
|
853
1186
|
if file_path and str(file_path) in self.processed_files:
|
|
854
1187
|
self.processed_files.remove(str(file_path))
|
|
855
1188
|
logger.debug(f"Removed {file_path} from processed files set")
|
|
856
|
-
|
|
1189
|
+
|
|
857
1190
|
except Exception as e:
|
|
858
1191
|
logger.warning(f"Error during context cleanup: {e}")
|
|
859
1192
|
finally:
|
|
@@ -862,44 +1195,43 @@ class PdfImportService:
|
|
|
862
1195
|
self.processing_context = {}
|
|
863
1196
|
|
|
864
1197
|
def import_simple(
|
|
865
|
-
self,
|
|
866
|
-
file_path: Union[Path, str],
|
|
867
|
-
center_name: str,
|
|
868
|
-
delete_source: bool = False
|
|
1198
|
+
self, file_path: Union[Path, str], center_name: str, delete_source: bool = False
|
|
869
1199
|
) -> "RawPdfFile":
|
|
870
1200
|
"""
|
|
871
1201
|
Simple PDF import without text processing or anonymization.
|
|
872
1202
|
Uses centralized PDF instance management pattern.
|
|
873
|
-
|
|
1203
|
+
|
|
874
1204
|
Args:
|
|
875
1205
|
file_path: Path to the PDF file to import
|
|
876
1206
|
center_name: Name of the center to associate with PDF
|
|
877
1207
|
delete_source: Whether to delete the source file after import
|
|
878
|
-
|
|
1208
|
+
|
|
879
1209
|
Returns:
|
|
880
1210
|
RawPdfFile instance after basic import
|
|
881
1211
|
"""
|
|
882
1212
|
try:
|
|
883
1213
|
# Initialize simple processing context
|
|
884
|
-
self._initialize_processing_context(
|
|
885
|
-
|
|
1214
|
+
self._initialize_processing_context(
|
|
1215
|
+
file_path, center_name, delete_source, False
|
|
1216
|
+
)
|
|
1217
|
+
|
|
886
1218
|
# Validate file
|
|
887
1219
|
self._validate_and_prepare_file()
|
|
888
|
-
|
|
1220
|
+
|
|
889
1221
|
# Create PDF instance
|
|
890
1222
|
logger.info("Starting simple import - creating RawPdfFile instance...")
|
|
891
1223
|
self.current_pdf = RawPdfFile.create_from_file_initialized(
|
|
892
|
-
file_path=self.processing_context[
|
|
1224
|
+
file_path=self.processing_context["file_path"],
|
|
893
1225
|
center_name=center_name,
|
|
894
1226
|
delete_source=delete_source,
|
|
895
1227
|
)
|
|
896
|
-
|
|
1228
|
+
|
|
897
1229
|
if not self.current_pdf:
|
|
898
1230
|
raise RuntimeError("Failed to create RawPdfFile instance")
|
|
899
|
-
|
|
1231
|
+
|
|
900
1232
|
# Mark as processed
|
|
901
|
-
self.processed_files.add(str(self.processing_context[
|
|
902
|
-
|
|
1233
|
+
self.processed_files.add(str(self.processing_context["file_path"]))
|
|
1234
|
+
|
|
903
1235
|
# Set basic state for simple import
|
|
904
1236
|
state = self._ensure_state(self.current_pdf)
|
|
905
1237
|
if state:
|
|
@@ -908,56 +1240,68 @@ class PdfImportService:
|
|
|
908
1240
|
state.sensitive_meta_processed = False
|
|
909
1241
|
state.save()
|
|
910
1242
|
logger.info("Set PDF state: processed=False for simple import")
|
|
911
|
-
|
|
1243
|
+
|
|
912
1244
|
# Save changes
|
|
913
1245
|
with transaction.atomic():
|
|
914
1246
|
self.current_pdf.save()
|
|
915
|
-
|
|
916
|
-
logger.info(
|
|
1247
|
+
|
|
1248
|
+
logger.info(
|
|
1249
|
+
"Simple import completed for RawPdfFile hash: %s",
|
|
1250
|
+
self.current_pdf.pdf_hash,
|
|
1251
|
+
)
|
|
917
1252
|
return self.current_pdf
|
|
918
|
-
|
|
1253
|
+
|
|
919
1254
|
except Exception as e:
|
|
920
1255
|
logger.error(f"Simple PDF import failed for {file_path}: {e}")
|
|
921
1256
|
self._cleanup_on_error()
|
|
922
1257
|
raise
|
|
923
1258
|
finally:
|
|
924
1259
|
self._cleanup_processing_context()
|
|
925
|
-
|
|
926
|
-
def check_storage_capacity(
|
|
1260
|
+
|
|
1261
|
+
def check_storage_capacity(
    self, file_path: Union[Path, str], storage_root, min_required_space
) -> bool:
    """
    Check if there is sufficient storage capacity for the PDF file.

    The free space reported for ``storage_root`` must cover the size of the
    file and, when given, the caller's ``min_required_space`` threshold
    (whichever is larger).

    Args:
        file_path: Path to the PDF file to check.
        storage_root: Path handed to ``shutil.disk_usage`` to measure free space.
        min_required_space: Minimum number of free bytes required. Values
            that are not numbers (e.g. ``None``) or that do not exceed the
            file size are ignored, so only the file size is enforced.

    Returns:
        True when enough space is available.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        InsufficientStorageError: If there is not enough space.
    """
    import shutil

    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found for storage check: {file_path}")

    # Get the size of the file
    file_size = file_path.stat().st_size

    # Get available space in the storage directory
    free = shutil.disk_usage(storage_root).free

    # Previously min_required_space was overwritten and never consulted;
    # honour it as a lower bound on the required free space.
    required_space = file_size
    if (
        isinstance(min_required_space, (int, float))
        and not isinstance(min_required_space, bool)
        and min_required_space > required_space
    ):
        required_space = min_required_space

    # Check if there is enough space
    if required_space > free:
        # Project exception is only needed on the failure path.
        from endoreg_db.exceptions import InsufficientStorageError

        raise InsufficientStorageError(
            f"Not enough space to store PDF file: {file_path}"
        )
    logger.info(
        f"Storage check passed for {file_path}: {file_size} bytes, {free} bytes available"
    )

    return True
|
|
959
|
-
|
|
960
|
-
def create_sensitive_file(
|
|
1301
|
+
|
|
1302
|
+
def create_sensitive_file(
|
|
1303
|
+
self, pdf_instance: "RawPdfFile", file_path: Union[Path, str]
|
|
1304
|
+
) -> None:
|
|
961
1305
|
"""
|
|
962
1306
|
Create a copy of the PDF file in the sensitive directory and update the file reference.
|
|
963
1307
|
Delete the source path to avoid duplicates.
|
|
@@ -966,7 +1310,9 @@ class PdfImportService:
|
|
|
966
1310
|
Ensures the FileField points to the file under STORAGE_DIR/pdfs/sensitive and never back to raw_pdfs.
|
|
967
1311
|
"""
|
|
968
1312
|
pdf_file = pdf_instance or self.current_pdf
|
|
969
|
-
source_path =
|
|
1313
|
+
source_path = (
|
|
1314
|
+
Path(file_path) if file_path else self.processing_context.get("file_path")
|
|
1315
|
+
)
|
|
970
1316
|
|
|
971
1317
|
if not pdf_file:
|
|
972
1318
|
raise ValueError("No PDF instance available for creating sensitive file")
|
|
@@ -989,25 +1335,37 @@ class PdfImportService:
|
|
|
989
1335
|
try:
|
|
990
1336
|
target.unlink()
|
|
991
1337
|
except Exception as e:
|
|
992
|
-
logger.warning(
|
|
1338
|
+
logger.warning(
|
|
1339
|
+
"Could not remove existing sensitive target %s: %s",
|
|
1340
|
+
target,
|
|
1341
|
+
e,
|
|
1342
|
+
)
|
|
993
1343
|
shutil.move(str(source_path), str(target))
|
|
994
1344
|
logger.info(f"Moved PDF to sensitive directory: {target}")
|
|
995
1345
|
|
|
996
1346
|
# Update FileField to reference the file under STORAGE_DIR
|
|
997
1347
|
# We avoid re-saving file content (the file is already at target); set .name relative to STORAGE_DIR
|
|
998
1348
|
try:
|
|
999
|
-
relative_name = str(
|
|
1349
|
+
relative_name = str(
|
|
1350
|
+
target.relative_to(path_utils.STORAGE_DIR)
|
|
1351
|
+
) # Point Django FileField to sensitive storage
|
|
1000
1352
|
except ValueError:
|
|
1001
1353
|
# Fallback: if target is not under STORAGE_DIR, store absolute path (not ideal)
|
|
1002
1354
|
relative_name = str(target)
|
|
1003
1355
|
|
|
1004
1356
|
# Only update when changed
|
|
1005
|
-
if getattr(pdf_file.file,
|
|
1357
|
+
if getattr(pdf_file.file, "name", None) != relative_name:
|
|
1006
1358
|
pdf_file.file.name = relative_name
|
|
1007
|
-
pdf_file.save(update_fields=[
|
|
1008
|
-
logger.info(
|
|
1359
|
+
pdf_file.save(update_fields=["file"])
|
|
1360
|
+
logger.info(
|
|
1361
|
+
"Updated PDF FileField reference to sensitive path: %s",
|
|
1362
|
+
pdf_file.file.path,
|
|
1363
|
+
)
|
|
1009
1364
|
else:
|
|
1010
|
-
logger.debug(
|
|
1365
|
+
logger.debug(
|
|
1366
|
+
"PDF FileField already points to sensitive path: %s",
|
|
1367
|
+
pdf_file.file.path,
|
|
1368
|
+
)
|
|
1011
1369
|
|
|
1012
1370
|
# Best-effort: if original source still exists (e.g., copy), remove it to avoid re-triggers
|
|
1013
1371
|
try:
|
|
@@ -1018,57 +1376,78 @@ class PdfImportService:
|
|
|
1018
1376
|
logger.warning(f"Could not delete original PDF file {source_path}: {e}")
|
|
1019
1377
|
|
|
1020
1378
|
except Exception as e:
|
|
1021
|
-
logger.warning(
|
|
1379
|
+
logger.warning(
|
|
1380
|
+
f"Could not create sensitive file copy for {pdf_file.pdf_hash}: {e}",
|
|
1381
|
+
exc_info=True,
|
|
1382
|
+
)
|
|
1022
1383
|
|
|
1023
|
-
def archive_or_quarantine_file(
|
|
1024
|
-
|
|
1384
|
+
def archive_or_quarantine_file(
|
|
1385
|
+
self,
|
|
1386
|
+
pdf_instance: "RawPdfFile",
|
|
1387
|
+
source_file_path: Union[Path, str],
|
|
1388
|
+
quarantine_reason: str,
|
|
1389
|
+
is_pdf_problematic: bool,
|
|
1390
|
+
) -> bool:
|
|
1025
1391
|
"""
|
|
1026
1392
|
Archive or quarantine file based on the state of the PDF processing.
|
|
1027
1393
|
Uses the central PDF instance and processing context if parameters not provided.
|
|
1028
|
-
|
|
1394
|
+
|
|
1029
1395
|
Args:
|
|
1030
1396
|
pdf_instance: Optional PDF instance, defaults to self.current_pdf
|
|
1031
1397
|
source_file_path: Optional source file path, defaults to processing_context['file_path']
|
|
1032
1398
|
quarantine_reason: Optional quarantine reason, defaults to processing_context['error_reason']
|
|
1033
1399
|
is_pdf_problematic: Optional override for problematic state
|
|
1034
|
-
|
|
1400
|
+
|
|
1035
1401
|
Returns:
|
|
1036
1402
|
bool: True if file was quarantined, False if archived successfully
|
|
1037
1403
|
"""
|
|
1038
1404
|
pdf_file = pdf_instance or self.current_pdf
|
|
1039
|
-
file_path =
|
|
1040
|
-
|
|
1041
|
-
|
|
1405
|
+
file_path = (
|
|
1406
|
+
Path(source_file_path)
|
|
1407
|
+
if source_file_path
|
|
1408
|
+
else self.processing_context.get("file_path")
|
|
1409
|
+
)
|
|
1410
|
+
quarantine_reason = str(quarantine_reason or self.processing_context.get(
|
|
1411
|
+
"error_reason"
|
|
1412
|
+
))
|
|
1413
|
+
|
|
1042
1414
|
if not pdf_file:
|
|
1043
1415
|
raise ValueError("No PDF instance available for archiving/quarantine")
|
|
1044
1416
|
if not file_path:
|
|
1045
1417
|
raise ValueError("No file path available for archiving/quarantine")
|
|
1046
|
-
|
|
1418
|
+
|
|
1047
1419
|
# Determine if the PDF is problematic
|
|
1048
|
-
pdf_problematic =
|
|
1049
|
-
|
|
1420
|
+
pdf_problematic = (
|
|
1421
|
+
is_pdf_problematic
|
|
1422
|
+
if is_pdf_problematic is not None
|
|
1423
|
+
else pdf_file.is_problematic
|
|
1424
|
+
)
|
|
1425
|
+
|
|
1050
1426
|
if pdf_problematic:
|
|
1051
1427
|
# Quarantine the file
|
|
1052
|
-
logger.warning(
|
|
1428
|
+
logger.warning(
|
|
1429
|
+
f"Quarantining problematic PDF: {pdf_file.pdf_hash}, reason: {quarantine_reason}"
|
|
1430
|
+
)
|
|
1053
1431
|
quarantine_dir = path_utils.PDF_DIR / "quarantine"
|
|
1054
1432
|
os.makedirs(quarantine_dir, exist_ok=True)
|
|
1055
|
-
|
|
1433
|
+
|
|
1056
1434
|
quarantine_path = quarantine_dir / f"{pdf_file.pdf_hash}.pdf"
|
|
1057
1435
|
try:
|
|
1058
1436
|
shutil.move(file_path, quarantine_path)
|
|
1059
|
-
pdf_file.
|
|
1060
|
-
pdf_file.save(update_fields=['quarantine_reason'])
|
|
1437
|
+
pdf_file.save(update_fields=["quarantine_reason"])
|
|
1061
1438
|
logger.info(f"Moved problematic PDF to quarantine: {quarantine_path}")
|
|
1062
1439
|
return True
|
|
1063
1440
|
except Exception as e:
|
|
1064
1441
|
logger.error(f"Failed to quarantine PDF {pdf_file.pdf_hash}: {e}")
|
|
1065
|
-
return
|
|
1442
|
+
return (
|
|
1443
|
+
True # Still consider as quarantined to prevent further processing
|
|
1444
|
+
)
|
|
1066
1445
|
else:
|
|
1067
1446
|
# Archive the file normally
|
|
1068
1447
|
logger.info(f"Archiving successfully processed PDF: {pdf_file.pdf_hash}")
|
|
1069
1448
|
archive_dir = path_utils.PDF_DIR / "processed"
|
|
1070
1449
|
os.makedirs(archive_dir, exist_ok=True)
|
|
1071
|
-
|
|
1450
|
+
|
|
1072
1451
|
archive_path = archive_dir / f"{pdf_file.pdf_hash}.pdf"
|
|
1073
1452
|
try:
|
|
1074
1453
|
shutil.move(file_path, archive_path)
|
|
@@ -1077,3 +1456,25 @@ class PdfImportService:
|
|
|
1077
1456
|
except Exception as e:
|
|
1078
1457
|
logger.error(f"Failed to archive PDF {pdf_file.pdf_hash}: {e}")
|
|
1079
1458
|
return False
|
|
1459
|
+
|
|
1460
|
+
def _is_placeholder_value(self, field_name: str, value) -> bool:
|
|
1461
|
+
"""Return True if a SensitiveMeta field still has a dummy/default value."""
|
|
1462
|
+
if value is None:
|
|
1463
|
+
return True
|
|
1464
|
+
|
|
1465
|
+
# String placeholders
|
|
1466
|
+
if isinstance(value, str):
|
|
1467
|
+
if value in {self.DEFAULT_PATIENT_FIRST_NAME, self.DEFAULT_PATIENT_LAST_NAME}:
|
|
1468
|
+
return True
|
|
1469
|
+
|
|
1470
|
+
# Date placeholders
|
|
1471
|
+
if isinstance(value, date):
|
|
1472
|
+
# Default DOB
|
|
1473
|
+
if field_name == "patient_dob" and value == self.DEFAULT_PATIENT_DOB:
|
|
1474
|
+
return True
|
|
1475
|
+
# "Today" exam date created as fallback – allow anonymizer to override
|
|
1476
|
+
if field_name == "examination_date" and value == date.today():
|
|
1477
|
+
return True
|
|
1478
|
+
|
|
1479
|
+
return False
|
|
1480
|
+
|