PyPI - endoreg-db - Versions diffs - 0.8.8.0__py3-none-any.whl → 0.8.8.9__py3-none-any.whl - Mend

endoreg-db 0.8.8.0py3-none-any.whl → 0.8.8.9py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of endoreg-db might be problematic. Click here for more details.

Files changed (402) hide show

endoreg_db/import_files/import_service.md ADDED Viewed

@@ -0,0 +1,26 @@
+# File Import and Anonymization
+Endoreg-Db imports are guarded by an anonymization step that is supposed to ensure most sensitive data is redacted from the input. Here, fake patients are generated to pseudonymize the sensitive information in the data. This ensures that videos as well as PDFs are not distributed with sensitive data, and if they are distributed by accident, it is harder to tell which data is actually real.
+The Import is handled by two orchestration files:
+Report import service (RIS)
+and
+Video import Service (VIS)
+The orchestration is abstracted out by the base import service (BIS), to ensure newly implemented data imports follow the same structure and to ensure tests run agnostically of the actual media being processed.
+## Import Order of Execution
+The import starts when files are dropped into the corresponding media import folders. The locations need to be passed to the import service logic. To ensure atomic processing without overwhelming the server or processing a file twice under parallelization, a file lock is added to the files that are currently being processed.
+### File Lock
+File Lock is implemented as a context manager. By default, this means that during execution the files are marked by adding an additional .lock file inside the folder. Once the code wrapped in the file lock context manager stops executing, the .lock file is removed only after error processing. This ensures that the full pipeline is executed on each run, even when interrupted.
+https://book.pythontips.com/en/latest/context_managers.html
+### Error Cleanup
+The ErrorCleanup class is called from inside the file lock context manager to avoid leaving half-processed files lying around. The file type is passed to the class instance, which then runs the correct processing logic.

endoreg_db/import_files/processing/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+# processing/__init__.py
+from . import sensitive_meta_adapter
+from . import report_processing
+from . import video_processing
+# Submodules exposed as the public surface of the processing package.
+__all__ = [
+    "sensitive_meta_adapter",
+    "report_processing",
+    "video_processing",
+]

endoreg_db/import_files/processing/report_processing/report_anonymization.py ADDED Viewed

@@ -0,0 +1,94 @@
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Any, Callable, Literal, NoReturn
+from lx_anonymizer import ReportReader
+from lx_anonymizer.sensitive_meta_interface import SensitiveMeta as LxSM
+from endoreg_db.import_files.context import ImportContext
+from endoreg_db.import_files.file_storage.sensitive_meta_storage import sensitive_meta_storage
+from endoreg_db.utils.paths import ANONYM_REPORT_DIR
+logger = logging.getLogger(__name__)
+class ReportAnonymizer:
+    def __init__(self):
+        self._report_reader_class = None
+        self._ensure_report_reading_available()
+        self.storage = False
+    def anonymize_report(self, ctx: ImportContext):
+        # Setup anonymized directory
+        anonymized_dir = ANONYM_REPORT_DIR
+        anonymized_dir.mkdir(parents=True, exist_ok=True)
+        assert ctx.current_report is not None
+        # Generate output path for anonymized report
+        pdf_hash = ctx.current_report.pdf_hash
+        anonymized_output_path = anonymized_dir / f"{pdf_hash}.pdf"
+        self._report_reader_class = ReportReader()
+        assert isinstance(self._report_reader_class, ReportReader)
+        # Process with enhanced process_report method (returns 4-tuple now)
+        ctx.original_text, ctx.anonymized_text, extracted_metadata, ctx.anonymized_path = self._report_reader_class.process_report(
+                pdf_path=ctx.file_path,
+                create_anonymized_pdf=True,
+                anonymized_pdf_output_path=str(anonymized_output_path),
+            )
+        if ctx.anonymized_path:
+            logger.info("DEBUG: after anonymizer, ctx.anonymized_path=%s (exists=%s)",
+                        ctx.anonymized_path, isinstance(ctx.anonymized_path, str))
+        sm = LxSM()
+        sm.safe_update(extracted_metadata)
+        self.storage = sensitive_meta_storage(sm, ctx.current_report)
+        return ctx
+    def _ensure_report_reading_available(
+        self
+    )  -> None:
+        """
+        Ensure report reading modules are available by adding lx-anonymizer to path.
+        Returns:
+            Tuple of (availability_flag, ReportReader_class)
+        """
+        try:
+            # Try direct import first
+            from lx_anonymizer import ReportReader
+            logger.info("Successfully imported lx_anonymizer ReportReader module")
+            self._report_reader_available = True
+            self._report_reader_class = ReportReader
+        except ImportError:
+            # Optional: honor LX_ANONYMIZER_PATH=/abs/path/to/src
+            import importlib
+            extra = os.getenv("LX_ANONYMIZER_PATH")
+            if extra and extra not in sys.path and Path(extra).exists():
+                sys.path.insert(0, extra)
+                try:
+                    mod = importlib.import_module("lx_anonymizer")
+                    ReportReader = getattr(mod, "ReportReader")
+                    logger.info(
+                        "Imported lx_anonymizer.ReportReader via LX_ANONYMIZER_PATH"
+                    )
+                    self._report_reader_available = True
+                    self._report_reader_class = ReportReader
+                except Exception as e:
+                    logger.warning(
+                        "Failed importing lx_anonymizer via LX_ANONYMIZER_PATH: %s", e
+                    )
+        self._report_reader_available = False
+        self._report_reader_class = None

endoreg_db/import_files/processing/sensitive_meta_adapter.py ADDED Viewed

@@ -0,0 +1,51 @@
+# endoreg_db/import_files/processing/sensitive_meta_adapter.py
+from typing import Any, Dict
+from lx_anonymizer.sensitive_meta_interface import SensitiveMeta as LxSensitiveMeta
+def normalize_lx_sensitive_meta(meta: LxSensitiveMeta) -> Dict[str, Any]:
+    """
+    Convert lx_anonymizer.SensitiveMeta into a dict suitable for
+    endoreg_db SensitiveMeta.update_from_dict / create_from_dict.
+    - Renames fields where necessary (center -> center_name, patient_gender_name -> patient_gender)
+    - Drops None/blank values (your update logic already handles blanks carefully)
+    - Leaves dates as strings; your logic layer already parses them
+    """
+    raw = meta.to_dict()
+    out: Dict[str, Any] = {}
+    # 1:1 fields (same names in model logic)
+    direct_keys = [
+        "file_path",
+        "patient_first_name",
+        "patient_last_name",
+        "patient_dob",          # string; logic has parsing
+        "casenumber",
+        "examination_date",     # string; logic has parsing
+        "examination_time",     # string "HH:MM" is fine
+        "examiner_first_name",
+        "examiner_last_name",
+        "text",
+        "anonymized_text",
+        "endoscope_type",
+        "endoscope_sn",
+    ]
+    for k in direct_keys:
+        v = raw.get(k)
+        if v not in (None, "", []):
+            out[k] = v
+    # Map patient_gender_name (interface) -> patient_gender (logic)
+    gender_name = raw.get("patient_gender_name")
+    if gender_name not in (None, ""):
+        # Your logic.update_* can handle strings for patient_gender
+        out["patient_gender"] = gender_name
+    # Map center (string) -> center_name (logic)
+    center_name = raw.get("center")
+    if center_name not in (None, ""):
+        out["center_name"] = center_name
+    return out

endoreg_db/import_files/processing/video_processing/video_anonymization.py ADDED Viewed

@@ -0,0 +1,107 @@
+from typing import List, Dict, Any, Tuple, Optional
+import logging
+logger = logging.getLogger(__name__)
+from lx_anonymizer import FrameCleaner
+from lx_anonymizer.sensitive_meta_interface import SensitiveMeta as LxSM
+from endoreg_db.import_files.file_storage.sensitive_meta_storage import sensitive_meta_storage
+from endoreg_db.import_files.context import ImportContext
+from endoreg_db.utils.paths import ANONYM_VIDEO_DIR
+from endoreg_db.models import EndoscopyProcessor, VideoFile
+class VideoAnonymizer:
+    def __init__(self):
+        self._ensure_frame_cleaning_available()
+        self._frame_cleaning_available = None
+        self._frame_cleaning_class = None
+        self.storage = False
+    def anonymize_video(self, ctx: ImportContext):
+        # Setup anonymized directory
+        anonymized_dir = ANONYM_VIDEO_DIR
+        anonymized_dir.mkdir(parents=True, exist_ok=True)
+        assert ctx.current_video is not None
+        # Generate output path for anonymized report
+        video_hash = ctx.current_video.video_hash
+        anonymized_output_path = anonymized_dir / f"{video_hash}.mp4"
+        self._frame_cleaning_class = FrameCleaner()
+        assert isinstance(self._frame_cleaning_class, FrameCleaner)
+        endoscope_roi, endoscope_roi_nested = self._get_processor_roi_info(ctx)
+        # Process with enhanced process_report method (returns 4-tuple now)
+        ctx.anonymized_path, extracted_metadata = self._frame_cleaning_class.clean_video(
+                video_path=ctx.file_path,
+                endoscope_image_roi=endoscope_roi,
+                endoscope_data_roi_nested=endoscope_roi_nested,
+                output_path=anonymized_output_path
+            )
+        sm = LxSM()
+        sm.safe_update(extracted_metadata)
+        self.storage = sensitive_meta_storage(sm, ctx.current_video)
+        return ctx
+    def _ensure_frame_cleaning_available(self):
+        """
+        Ensure frame cleaning modules are available by adding lx-anonymizer to path.
+        Returns:
+            Tuple of (availability_flag, FrameCleaner_class, ReportReader_class)
+        """
+        try:
+            from lx_anonymizer import FrameCleaner
+        except Exception as e:
+            logger.warning(
+                f"Frame cleaning not available: {e} Please install or update lx_anonymizer."
+            )
+            raise
+        assert FrameCleaner is not None
+        self._frame_cleaning_class = FrameCleaner()
+        self._frame_cleaning_available = True
+    def _get_processor_roi_info(
+        self,
+        ctx: ImportContext,
+    ) -> tuple[dict[str, int | None] | None,
+            dict[str, dict[str, int | None] | None] | None]:
+        """Get processor ROI information for masking and data extraction."""
+        endoscope_data_roi_nested = None
+        endoscope_image_roi = None
+        video = ctx.current_video
+        assert isinstance(video, VideoFile)
+        try:
+            processor_name = ctx.processor_name if ctx.processor_name else None
+            if processor_name:
+                pr = EndoscopyProcessor()
+                processor = pr.get_by_name(processor_name)
+                assert isinstance(processor, EndoscopyProcessor), (
+                    "Processor is not of type EndoscopyProcessor"
+                )
+                endoscope_image_roi = processor.get_roi_endoscope_image()
+                endoscope_data_roi_nested = processor.get_sensitive_rois()
+                logger.info(
+                    "Retrieved processor ROI information: endoscope_image_roi=%s",
+                    endoscope_image_roi,
+                )
+            else:
+                logger.warning(
+                    "No processor found for video %s, proceeding without ROI masking",
+                    video.uuid,
+                )
+        except Exception as exc:
+            logger.error("Failed to retrieve processor ROI information: %s", exc)
+        # IMPORTANT: return order must match clean_video signature
+        return endoscope_image_roi, endoscope_data_roi_nested

endoreg_db/import_files/processing/video_processing/video_cleanup_on_error.py ADDED Viewed

@@ -0,0 +1,119 @@
+# endoreg_db/services/video_processing/video_cleanup_on_error.py
+import logging
+import shutil
+from pathlib import Path
+from typing import Any, Dict, MutableSet, Optional
+logger = logging.getLogger(__name__)
+def cleanup_video_on_error(
+    *,
+    current_video: Any,
+    original_file_path: Optional[str | Path],
+    processing_context: Dict[str, Any],
+) -> None:
+    """
+    Cleanup processing context on error for video imports.
+    This is extracted from VideoImportService._cleanup_on_error and kept as
+    close as possible to the original behavior.
+    """
+    try:
+        if not current_video or not hasattr(current_video, "state"):
+            # Nothing we can sensibly do here
+            return
+        # Ensure state exists
+        if current_video.state is None:
+            try:
+                current_video.get_or_create_state()
+            except Exception as e:
+                logger.warning(
+                    "Video state not found for video %s during error cleanup: %s",
+                    getattr(current_video, "uuid", None),
+                    e,
+                )
+                return
+        current_video.state = current_video.get_or_create_state()
+        # Try to restore original raw file
+        try:
+            if original_file_path is not None:
+                original_path = Path(original_file_path)
+                if not original_path.exists():
+                    raise AssertionError("Original file path does not exist")
+                logger.info("Marked video import as failed in state")
+                raw_file_path = getattr(getattr(current_video, "raw_file", None), "path", None)
+                if raw_file_path and original_file_path:
+                    shutil.copy2(str(raw_file_path), str(original_file_path))
+                else:
+                    logger.warning("Cannot restore original raw file: path is None")
+            else:
+                logger.warning("Original file path is None")
+        except AssertionError:
+            logger.warning("Original file path does not exist")
+        # Reset state flags if processing had started
+        try:
+            from endoreg_db.models.state import VideoState  # local import to avoid cycles
+            if not isinstance(current_video.state, VideoState):
+                logger.error("Current video state is not a VideoState instance during cleanup")
+                raise AssertionError
+            if processing_context.get("processing_started"):
+                current_video.state.frames_extracted = False
+                current_video.state.frames_initialized = False
+                current_video.state.video_meta_extracted = False
+                current_video.state.text_meta_extracted = False
+                current_video.state.save()
+        except Exception as e:
+            logger.warning("Error during video error cleanup: %s", e)
+    except Exception as outer_exc:
+        logger.warning("Unexpected error in cleanup_video_on_error: %s", outer_exc)
+def cleanup_video_processing_context(
+    *,
+    processing_context: Dict[str, Any],
+    processed_files: MutableSet[str],
+) -> None:
+    """
+    Cleanup processing context and release file lock for video imports.
+    Extracted from VideoImportService._cleanup_processing_context.
+    """
+    # DEFENSIVE: ensure dict
+    if processing_context is None:
+        processing_context = {}
+    # Release file lock if it was acquired
+    try:
+        lock_context = processing_context.get("_lock_context")
+        if lock_context is not None:
+            try:
+                lock_context.__exit__(None, None, None)
+                logger.info("Released file lock")
+            except Exception as e:
+                logger.warning("Error releasing file lock during context cleanup: %s", e)
+    except Exception as e:
+        logger.warning("Error while handling lock release in context cleanup: %s", e)
+    # Remove file from processed_files set if processing failed
+    try:
+        file_path = processing_context.get("file_path")
+        anonymization_completed = processing_context.get("anonymization_completed")
+        if file_path and not anonymization_completed:
+            file_path_str = str(file_path)
+            if file_path_str in processed_files:
+                processed_files.remove(file_path_str)
+                logger.info(
+                    "Removed %s from processed files (failed processing)",
+                    file_path_str,
+                )
+    except Exception as e:
+        logger.warning("Error while cleaning processed_files set: %s", e)

endoreg_db/import_files/pseudonymization/fake.py ADDED Viewed

@@ -0,0 +1,52 @@
+from datetime import date, timedelta
+from typing import Tuple, Optional
+from faker import Faker
+import random
+def fake_name_with_similar_dob_and_gender(
+    gender: str,
+    dob: date,
+    *,
+    year_tolerance: int = 3,
+    locale: str = "de_DE",
+    seed: Optional[int] = None,
+) -> Tuple[str, str, date]:
+    """
+    Generate a fake name with the same gender and a similar date of birth.
+    Args:
+        gender: "male" or "female"
+        dob: Original date of birth
+        year_tolerance: Maximum age difference in years
+        locale: Faker locale (default: German)
+        seed: Optional reproducible seed
+    Returns:
+        (full_name, fake_dob)
+    """
+    if gender not in {"male", "female"}:
+        raise ValueError("gender must be 'male' or 'female'")
+    fake = Faker(locale)
+    if seed is not None:
+        Faker.seed(seed)
+        random.seed(seed)
+    # --- Generate gender-safe name ---
+    if gender == "male":
+        first_name = fake.first_name_male()
+    else:
+        first_name = fake.first_name_female()
+    last_name = fake.last_name()
+    full_name = f"{first_name} {last_name}"
+    # --- Generate similar DOB ---
+    days_range = year_tolerance * 365
+    offset_days = random.randint(-days_range, days_range)
+    fake_dob = dob + timedelta(days=offset_days)
+    return first_name, last_name, fake_dob

endoreg_db/import_files/pseudonymization/k_anonymity.py ADDED Viewed

@@ -0,0 +1,182 @@
+from endoreg_db.models import Patient, SensitiveMeta, Center, Gender
+import logging
+from datetime import timedelta
+from typing import Tuple
+from django.db.models import QuerySet
+from itertools import combinations
+from typing import Dict, Tuple, List
+logger = logging.getLogger(__name__)
+# Quasi-identifier names; get_k_profile_for_instance evaluates every non-empty subset of them.
+QI_FLAGS = ["first_name", "last_name", "center", "gender", "dob_band"]
+def get_k_profile_for_instance(
+    instance: SensitiveMeta,
+    *,
+    dob_year_tolerance: int = 1,
+    include_self: bool = True,
+) -> Dict[Tuple[str, ...], int]:
+    """
+    For a given SensitiveMeta instance, compute k (equivalence class size)
+    for all non-empty subsets of the quasi-identifiers defined in QI_FLAGS.
+    Returns:
+        {
+          ('first_name',):  12,
+          ('center', 'gender'): 45,
+          ('first_name', 'last_name', 'dob_band'): 3,
+          ...
+        }
+    """
+    result: Dict[Tuple[str, ...], int] = {}
+    for r in range(1, len(QI_FLAGS) + 1):
+        for subset in combinations(QI_FLAGS, r):
+            use_first_name = "first_name" in subset
+            use_last_name = "last_name" in subset
+            use_center = "center" in subset
+            use_gender = "gender" in subset
+            use_dob_band = "dob_band" in subset
+            qs = _build_sensitive_meta_qi_queryset(
+                instance,
+                dob_year_tolerance=dob_year_tolerance,
+                include_self=include_self,
+                use_first_name=use_first_name,
+                use_last_name=use_last_name,
+                use_center=use_center,
+                use_gender=use_gender,
+                use_dob_band=use_dob_band,
+            )
+            k_value = qs.count()
+            result[subset] = k_value
+    return result
+def get_k_anonymity(pk, k=3):
+    """
+    How anonymized is a patient?
+    Get the k value for how many patients can be matched to the current patients attributes.
+    Args:
+        pk (_type_): _description_
+        k (int, optional): _description_. Defaults to 3.
+    """
+    return get_k_anonymity_for_sensitive_meta(pk=pk, k=k, dob_year_tolerance=1)
+def _build_sensitive_meta_qi_queryset(
+    instance: SensitiveMeta,
+    *,
+    dob_year_tolerance: int = 1,
+    include_self: bool = True,
+    use_first_name: bool = True,
+    use_last_name: bool = True,
+    use_center: bool = True,
+    use_gender: bool = True,
+    use_dob_band: bool = True,
+) -> QuerySet[SensitiveMeta]:
+    """
+    Build a queryset of SensitiveMeta records that are indistinguishable from
+    `instance` on the chosen quasi-identifiers:
+        - same center
+        - same patient_gender
+        - patient_dob within ±dob_year_tolerance years (approx via days)
+    Args:
+        instance: The SensitiveMeta instance we evaluate.
+        dob_year_tolerance: Allowed +- years around patient_dob.
+        include_self: Whether to include `instance` itself in the result.
+    Returns:
+        A Django QuerySet for further aggregation.
+    """
+    qs = SensitiveMeta.objects.all()
+    if use_first_name and instance.patient_first_name is not None:
+        qs = qs.filter(patient_first_name=instance.patient_first_name)
+    if use_last_name and instance.patient_first_name is not None:
+        qs = qs.filter(patient_first_name=instance.patient_first_name)
+    # --- Center ---
+    if use_center and instance.center is not None:
+        if instance.center.pk is not None:
+            qs = qs.filter(center=instance.center.pk)
+    # --- Gender ---
+    if use_gender and instance.patient_gender is not None
+        if instance.patient_gender.pk is not None:
+            qs = qs.filter(patient_gender_id=instance.patient_gender)
+    # --- DOB (approximate ±N years using days) ---
+    if use_dob_band and instance.patient_dob is not None:
+        days = dob_year_tolerance * 365
+        ref_date = instance.patient_dob.date()
+        start = ref_date - timedelta(days=days)
+        end = ref_date + timedelta(days=days)
+        qs = qs.filter(patient_dob__date__range=(start, end))
+    # --- Exclude self if requested ---
+    if not include_self and instance.pk is not None:
+        qs = qs.exclude(pk=instance.pk)
+    return qs
+def get_k_anonymity_for_sensitive_meta(
+    pk: int,
+    *,
+    k: int = 3,
+    dob_year_tolerance: int = 1,
+) -> Tuple[int, bool]:
+    """
+    Compute the k-anonymity (equivalence class size) for a SensitiveMeta record.
+    k-anonymity here is defined as the number of SensitiveMeta rows that share
+    the same quasi-identifiers as the given record:
+        - center
+        - patient_gender
+        - patient_dob within ±dob_year_tolerance years (approximate)
+    Args:
+        pk: Primary key of the SensitiveMeta instance to evaluate.
+        k: Desired anonymity threshold (e.g. 3 for 3-anonymity).
+        dob_year_tolerance: Allowed age window in years around patient_dob.
+    Returns:
+        (k_value, is_k_anonymous) where:
+            k_value       = size of the equivalence class
+            is_k_anonymous = True if k_value >= k
+    """
+    try:
+        sm = SensitiveMeta.objects.get(pk=pk)
+    except SensitiveMeta.DoesNotExist:
+        raise ValueError(f"SensitiveMeta with pk={pk} does not exist")
+    qs = _build_sensitive_meta_qi_queryset(
+        sm,
+        dob_year_tolerance=dob_year_tolerance,
+        include_self=True,
+    )
+    k_value = qs.count()
+    is_k_anon = k_value >= k
+    logger.info(
+        "k-anonymity for SensitiveMeta pk=%s -> k=%s (threshold=%s, dob_tol=%s years)",
+        pk,
+        k_value,
+        k,
+        dob_year_tolerance,
+    )
+    return k_value, is_k_anon

endoreg-db 0.8.8.0__py3-none-any.whl → 0.8.8.9__py3-none-any.whl

Potentially problematic release.

endoreg-db 0.8.8.0py3-none-any.whl → 0.8.8.9py3-none-any.whl