endoreg-db 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of endoreg-db might be problematic. Click here for more details.
- endoreg_db/management/commands/delete_all.py +18 -0
- endoreg_db/management/commands/fix_auth_permission.py +20 -0
- endoreg_db/management/commands/load_user_groups.py +8 -47
- endoreg_db/migrations/0001_initial.py +1 -1
- endoreg_db/migrations/0002_rawvideofile.py +26 -0
- endoreg_db/migrations/0003_rawvideofile_frames_required.py +18 -0
- endoreg_db/migrations/0004_rename_hash_rawvideofile_video_hash.py +18 -0
- endoreg_db/migrations/0005_ffmpegmeta_remove_videoimportmeta_center_and_more.py +56 -0
- endoreg_db/migrations/0006_rawvideofile_center_alter_videometa_processor.py +25 -0
- endoreg_db/migrations/0007_rawvideofile_processor.py +19 -0
- endoreg_db/migrations/0008_rename_frames_required_rawvideofile_state_frames_required.py +18 -0
- endoreg_db/migrations/0009_sensitivemeta_rawvideofile_sensitive_meta.py +31 -0
- endoreg_db/migrations/0010_rename_endoscope_serial_number_sensitivemeta_endoscope_sn.py +18 -0
- endoreg_db/migrations/0011_rawvideofile_state_sensitive_data_retrieved.py +18 -0
- endoreg_db/migrations/0012_rawvideofile_prediction_dir_and_more.py +109 -0
- endoreg_db/models/data_file/__init__.py +4 -1
- endoreg_db/models/data_file/base_classes/__init__.py +0 -1
- endoreg_db/models/data_file/base_classes/abstract_video.py +1 -0
- endoreg_db/models/data_file/import_classes/__init__.py +31 -0
- endoreg_db/models/data_file/import_classes/processing_functions.py +269 -0
- endoreg_db/models/data_file/import_classes/raw_video.py +341 -0
- endoreg_db/models/data_file/metadata/__init__.py +133 -0
- endoreg_db/models/data_file/metadata/sensitive_meta.py +13 -0
- endoreg_db/models/data_file/video/__init__.py +1 -1
- endoreg_db/models/data_file/video/import_meta.py +21 -21
- endoreg_db/models/permissions/__init__.py +44 -0
- endoreg_db/utils/__init__.py +1 -0
- endoreg_db/utils/cropping.py +29 -0
- endoreg_db/utils/dataloader.py +69 -0
- endoreg_db/utils/file_operations.py +30 -0
- endoreg_db/utils/hashs.py +16 -0
- endoreg_db/utils/legacy_ocr.py +201 -0
- endoreg_db/utils/ocr.py +197 -0
- endoreg_db/utils/uuid.py +4 -0
- endoreg_db/utils/video_metadata.py +87 -0
- {endoreg_db-0.2.2.dist-info → endoreg_db-0.3.0.dist-info}/METADATA +7 -1
- {endoreg_db-0.2.2.dist-info → endoreg_db-0.3.0.dist-info}/RECORD +39 -11
- {endoreg_db-0.2.2.dist-info → endoreg_db-0.3.0.dist-info}/LICENSE +0 -0
- {endoreg_db-0.2.2.dist-info → endoreg_db-0.3.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
from .raw_video import RawVideoFile
|
|
2
|
+
|
|
3
|
+
# # Starting point
|
|
4
|
+
# Automated tasks generate RawVideoFile objects in our db.
|
|
5
|
+
# Each object has state_{NAME} attributes.
|
|
6
|
+
# We will create functions which query the db for RawVideoFile
|
|
7
|
+
# objects with specific state_{NAME} attributes.
|
|
8
|
+
# Then, we perform the necessary operations on the RawVideoFile and
|
|
9
|
+
# update the state_{NAME} attributes accordingly.
|
|
10
|
+
|
|
11
|
+
# # Step 1 - Frame Extraction
|
|
12
|
+
# function to query for videos scheduled for frame extraction,
|
|
13
|
+
# these have state_frames_required and state_frames_extracted
|
|
14
|
+
def get_videos_scheduled_for_frame_extraction():
    """Return raw videos that still need their frames extracted.

    A video qualifies when extraction is requested but not yet done.
    """
    pending = RawVideoFile.objects.filter(state_frames_required=True)
    return pending.filter(state_frames_extracted=False)
|
|
19
|
+
|
|
20
|
+
def extract_frames_from_video(video:RawVideoFile):
    """Extract all frames for one video and flag the video as extracted.

    Returns the (saved) video instance.
    """
    # Frame extraction writes images into the video's frame_dir.
    video.extract_frames()

    # Persist the new pipeline state.
    video.state_frames_extracted = True
    video.save()
    return video
|
|
29
|
+
|
|
30
|
+
def extract_frames_from_videos():
    """Run frame extraction for every video currently scheduled for it."""
    for scheduled_video in get_videos_scheduled_for_frame_extraction():
        extract_frames_from_video(scheduled_video)
|
|
34
|
+
|
|
35
|
+
# # Step 2 - OCR
|
|
36
|
+
# function to query for videos scheduled for OCR,
|
|
37
|
+
# these have
|
|
38
|
+
# state_ocr_required = True and state_ocr_completed = False and state_frames_extracted = True
|
|
39
|
+
def get_videos_scheduled_for_ocr():
    """Return videos ready for OCR: frames extracted, OCR requested, not done."""
    with_frames = RawVideoFile.objects.filter(state_frames_extracted=True)
    return with_frames.filter(state_ocr_required=True, state_ocr_completed=False)
|
|
45
|
+
|
|
46
|
+
# function to set state_frames_required to True for videos
|
|
47
|
+
# which are scheduled for OCR but have not had frames extracted
|
|
48
|
+
def videos_scheduled_for_ocr_preflight():
    """Request frame extraction for OCR-pending videos that lack frames.

    OCR needs extracted frames, so any video awaiting OCR without frames is
    flagged with state_frames_required = True.
    """
    missing_frames = RawVideoFile.objects.filter(
        state_ocr_required=True,
        state_ocr_completed=False,
        state_frames_extracted=False,
    )
    for candidate in missing_frames:
        candidate.state_frames_required = True
        candidate.save()
|
|
57
|
+
|
|
58
|
+
def perform_ocr_on_video(video:RawVideoFile):
    """Run OCR-based metadata extraction on one video and flag completion.

    Returns the (saved) video instance.
    """
    # OCR populates the video's sensitive metadata from frame text.
    video.update_text_metadata()

    video.state_ocr_completed = True
    video.save()
    return video
|
|
67
|
+
|
|
68
|
+
def perform_ocr_on_videos():
    """Run OCR for every video currently scheduled for it."""
    for scheduled_video in get_videos_scheduled_for_ocr():
        perform_ocr_on_video(scheduled_video)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# # Step 3 - initial Prediction
|
|
75
|
+
# function to query for videos scheduled for initial prediction,
|
|
76
|
+
# these have
|
|
77
|
+
# state_initial_prediction_required = True and state_initial_prediction_completed = False and state_frames_extracted = True
|
|
78
|
+
def videos_scheduled_for_initial_prediction_preflight():
    """Request frame extraction for prediction-pending videos without frames.

    Prediction runs on extracted frames, so mark frame extraction as
    required for any qualifying video that has none.
    """
    missing_frames = RawVideoFile.objects.filter(
        state_initial_prediction_required=True,
        state_initial_prediction_completed=False,
        state_frames_extracted=False,
    )
    for candidate in missing_frames:
        candidate.state_frames_required = True
        candidate.save()
|
|
87
|
+
|
|
88
|
+
def get_videos_scheduled_for_initial_prediction():
    """Return videos ready for the initial prediction run.

    Requires extracted frames plus a pending, uncompleted prediction request.
    """
    with_frames = RawVideoFile.objects.filter(state_frames_extracted=True)
    return with_frames.filter(
        state_initial_prediction_required=True,
        state_initial_prediction_completed=False,
    )
|
|
94
|
+
|
|
95
|
+
from pathlib import Path
|
|
96
|
+
def get_multilabel_model(model_path:Path):
    """Load a MultiLabelClassificationNet checkpoint onto the GPU in eval mode."""
    from agl_predict_endo_frame.model_loader import MultiLabelClassificationNet

    checkpoint = model_path.resolve().as_posix()
    net = MultiLabelClassificationNet.load_from_checkpoint(checkpoint)
    net.cuda()  # NOTE(review): assumes a CUDA device is available
    net.eval()
    return net
|
|
103
|
+
|
|
104
|
+
def get_multilabel_classifier(model, verbose:bool=False):
    """Wrap a loaded model in the frame-level Classifier helper."""
    from agl_predict_endo_frame.predict import Classifier

    return Classifier(model, verbose=verbose)
|
|
108
|
+
|
|
109
|
+
def get_crops(video, paths):
    """Build one identical crop tuple per frame path.

    The crop is derived from the video's endoscope ROI dict with keys
    x, y, width, height; each tuple is (y_min, y_max, x_min, x_max).
    """
    roi = video.get_endo_roi()
    y_min = roi["y"]
    x_min = roi["x"]
    crop = (y_min, y_min + roi["height"], x_min, x_min + roi["width"])
    # The same ROI applies to every frame of the video.
    return [crop] * len(paths)
|
|
121
|
+
|
|
122
|
+
# model = MultiLabelClassificationNet.load_from_checkpoint("model/colo_segmentation_RegNetX800MF_6.ckpt")
|
|
123
|
+
def perform_initial_prediction_on_video(
    video:RawVideoFile, model_path,
    window_size_s, min_seq_len_s
):
    """Run the multilabel frame classifier on one video and persist results.

    Loads the checkpoint at model_path, classifies every extracted frame
    (cropped to the endoscope ROI), post-processes the predictions, writes
    each result set to its JSON file, and advances the video's pipeline
    state so the import step picks it up next.

    Returns the (saved) video instance.
    """
    import json  # local import: module-level deps are outside this block

    model = get_multilabel_model(model_path)
    classifier = get_multilabel_classifier(model, verbose=True)

    paths = video.get_frame_paths()
    string_paths = [p.resolve().as_posix() for p in paths]
    crops = get_crops(video, string_paths)
    fps = video.get_fps()

    predictions = classifier.pipe(string_paths, crops)
    readable_predictions = [classifier.readable(p) for p in predictions]
    result_dict = classifier.post_process_predictions_serializable(
        readable_predictions,
        window_size_s=window_size_s,
        min_seq_len_s=min_seq_len_s,
        fps=fps,
    )

    # Map each result key to the file it should be written to.
    # BUG FIX: the original dumped from `result_targets` (a plain list of
    # key names, not the results) and called json.dump without the file
    # handle — every write would have raised. Dump result_dict[key] instead.
    target_paths = {
        "predictions": video.get_predictions_path(),
        "smooth_predictions": video.get_smooth_predictions_path(),
        "binary_predictions": video.get_binary_predictions_path(),
        "raw_sequences": video.get_raw_sequences_path(),
        "filtered_sequences": video.get_filtered_sequences_path(),
    }
    for key, _path in target_paths.items():
        with open(_path, "w") as f:
            json.dump(result_dict[key], f)

    # Advance pipeline state: prediction done, import now required.
    video.state_initial_prediction_required = False
    video.state_initial_prediction_completed = True
    video.state_initial_prediction_import_required = True
    video.state_initial_prediction_import_completed = False
    video.save()

    return video
|
|
189
|
+
|
|
190
|
+
def perform_initial_prediction_on_videos(
    model_path,
    window_size_s, min_seq_len_s
):
    """Run the initial prediction step for every scheduled video."""
    for scheduled_video in get_videos_scheduled_for_initial_prediction():
        perform_initial_prediction_on_video(
            scheduled_video, model_path, window_size_s, min_seq_len_s
        )
|
|
200
|
+
|
|
201
|
+
def videos_scheduled_for_prediction_import_preflight():
    """Flag prediction-import for videos whose predictions await import.

    BUG FIX: the original set state_initial_prediction_required, which
    would re-trigger the (already completed) prediction step instead of
    the import step this preflight is for.
    """
    videos = RawVideoFile.objects.filter(
        state_initial_prediction_completed=True,
        state_initial_prediction_import_completed=False
    )
    for video in videos:
        video.state_initial_prediction_import_required = True
        video.save()
|
|
209
|
+
|
|
210
|
+
def get_videos_scheduled_for_prediction_import():
    """Return videos whose finished predictions still need importing.

    BUG FIX: the original filtered on state_prediction_import_required /
    state_prediction_import_completed, which are not fields on
    RawVideoFile (the model defines state_initial_prediction_import_*);
    Django would raise FieldError on evaluation.
    """
    return RawVideoFile.objects.filter(
        state_initial_prediction_import_required=True,
        state_initial_prediction_import_completed=False,
        state_initial_prediction_completed=True
    )
|
|
216
|
+
|
|
217
|
+
def import_predictions_for_video(video:RawVideoFile):
    """Import the prediction JSON files of one video into the database.

    Returns the (saved) video instance.

    BUG FIX: the original assigned state_prediction_import_* attributes,
    which are not model fields — the assignments succeeded in memory but
    were never persisted; the model's fields are
    state_initial_prediction_import_required / _completed.
    """
    # TODO: actual import of the prediction JSON is not implemented yet.
    pass

    video.state_initial_prediction_import_required = False
    video.state_initial_prediction_import_completed = True
    video.save()

    return video
|
|
227
|
+
|
|
228
|
+
def import_predictions_for_videos():
    """Import predictions for every video currently scheduled for import."""
    for scheduled_video in get_videos_scheduled_for_prediction_import():
        import_predictions_for_video(scheduled_video)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
# # Step 4 - Delete Frames if not needed anymore
|
|
235
|
+
# function to query for videos scheduled for frame deletion,
|
|
236
|
+
# first we need to set state_frames_required = False for videos with:
|
|
237
|
+
# state_ocr_required = False and state_ocr_completed = True and
|
|
238
|
+
# state_initial_prediction_required = False and state_initial_prediction_completed = True
|
|
239
|
+
def delete_frames_preflight():
    """Mark frames as no longer required once OCR and prediction are done.

    A video's frames may be dropped when both downstream consumers (OCR
    and the initial prediction) have completed and are not re-requested.
    """
    finished = RawVideoFile.objects.filter(
        state_ocr_required=False,
        state_ocr_completed=True,
        state_initial_prediction_required=False,
        state_initial_prediction_completed=True,
    )
    for candidate in finished:
        candidate.state_frames_required = False
        candidate.save()
|
|
249
|
+
|
|
250
|
+
# function to query for videos scheduled for frame deletion,
|
|
251
|
+
# frames should be deleted if state_frames_required = False
|
|
252
|
+
def get_videos_scheduled_for_frame_deletion():
    """Return videos whose extracted frames can be removed.

    IMPROVEMENT: also require state_frames_extracted=True — the original
    returned every video with frames-not-required, including videos that
    never had frames extracted, causing pointless delete/save cycles.
    """
    return RawVideoFile.objects.filter(
        state_frames_required=False,
        state_frames_extracted=True
    )
|
|
256
|
+
|
|
257
|
+
def delete_frames_for_video(video:RawVideoFile):
    """Delete the extracted frames of one video and clear its state flag.

    Returns the (saved) video instance.

    BUG FIX: the original only had a "# delete frames" comment and never
    removed anything; it flipped the state flag while leaving the frame
    files on disk. RawVideoFile.delete_frames() removes the frame_dir.
    """
    video.delete_frames()

    # delete_frames() only updates state when a frame dir actually
    # existed; keep the original postcondition that the flag is cleared.
    video.state_frames_extracted = False
    video.save()

    return video
|
|
265
|
+
|
|
266
|
+
def delete_frames():
    """Delete frames for every video currently scheduled for frame deletion."""
    for scheduled_video in get_videos_scheduled_for_frame_deletion():
        delete_frames_for_video(scheduled_video)
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
from django.db import models
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from collections import defaultdict, Counter
|
|
4
|
+
|
|
5
|
+
from endoreg_db.utils.hashs import get_video_hash
|
|
6
|
+
from endoreg_db.utils.file_operations import get_uuid_filename
|
|
7
|
+
from endoreg_db.utils.ocr import extract_text_from_rois
|
|
8
|
+
|
|
9
|
+
import shutil
|
|
10
|
+
import os
|
|
11
|
+
import subprocess
|
|
12
|
+
|
|
13
|
+
from ..metadata import VideoMeta, SensitiveMeta
|
|
14
|
+
|
|
15
|
+
class RawVideoFile(models.Model):
    """A raw (not yet anonymized) endoscopy video plus its pipeline state.

    The state_* boolean fields drive the processing pipeline (frame
    extraction, OCR, prediction, anonymization); worker functions query on
    them and flip them as steps complete.
    """

    uuid = models.UUIDField()
    file = models.FileField(upload_to="raw_data/")
    sensitive_meta = models.OneToOneField(
        "SensitiveMeta", on_delete=models.CASCADE, blank=True, null=True
    )
    center = models.ForeignKey("Center", on_delete=models.CASCADE)
    processor = models.ForeignKey(
        "EndoscopyProcessor", on_delete=models.CASCADE, blank=True, null=True
    )
    video_meta = models.OneToOneField(
        "VideoMeta", on_delete=models.CASCADE, blank=True, null=True
    )
    original_file_name = models.CharField(max_length=255)
    video_hash = models.CharField(max_length=255, unique=True)
    uploaded_at = models.DateTimeField(auto_now_add=True)

    # Frame extraction states
    state_frames_required = models.BooleanField(default=True)
    state_frames_extracted = models.BooleanField(default=False)

    # Prediction states
    state_initial_prediction_required = models.BooleanField(default=True)
    state_initial_prediction_completed = models.BooleanField(default=False)
    state_initial_prediction_import_required = models.BooleanField(default=True)
    state_initial_prediction_import_completed = models.BooleanField(default=False)
    # OCR states
    state_ocr_required = models.BooleanField(default=True)
    state_ocr_completed = models.BooleanField(default=False)
    # Validation states
    state_outside_validated = models.BooleanField(default=False)
    state_ocr_result_validated = models.BooleanField(default=False)

    state_sensitive_data_retrieved = models.BooleanField(default=False)

    # Dataset completeness
    state_histology_required = models.BooleanField(blank=True, null=True)
    state_histology_available = models.BooleanField(default=False)
    state_follow_up_intervention_required = models.BooleanField(blank=True, null=True)
    state_follow_up_intervention_available = models.BooleanField(default=False)
    state_dataset_complete = models.BooleanField(default=False)

    # Finalizing for upload
    state_anonym_video_required = models.BooleanField(default=True)
    state_anonym_video_performed = models.BooleanField(default=False)
    state_original_reports_deleted = models.BooleanField(default=False)
    state_original_video_deleted = models.BooleanField(default=False)
    state_finalized = models.BooleanField(default=False)

    frame_dir = models.CharField(max_length=255)
    prediction_dir = models.CharField(max_length=255)

    @classmethod
    def create_from_file(
        cls,
        file_path: Path,
        video_dir_parent: Path,
        center_name: str,
        processor_name: str,
        frame_dir_parent: Path,
        save: bool = True,  # NOTE(review): currently unused — the instance is always saved
    ):
        """Move a video into the managed directory layout and register it.

        Renames the file to a UUID-based name, verifies the transfer via
        hash comparison, and creates the db record.

        Returns the saved RawVideoFile, or None when the move fails, the
        hash check fails, or a file with the same hash already exists.
        """
        from endoreg_db.models import Center, EndoscopyProcessor

        print(f"Creating RawVideoFile from {file_path}")
        original_file_name = file_path.name

        # Rename to a UUID-based filename and reserve a frame directory.
        new_file_name, uuid = get_uuid_filename(file_path)
        framedir: Path = frame_dir_parent / str(uuid)

        if not framedir.exists():
            framedir.mkdir(parents=True, exist_ok=True)

        if not video_dir_parent.exists():
            video_dir_parent.mkdir(parents=True, exist_ok=True)

        # Hash before the move so the transfer can be verified afterwards.
        video_hash = get_video_hash(file_path)

        center = Center.objects.get(name=center_name)
        assert center is not None, "Center must exist"

        processor = EndoscopyProcessor.objects.get(name=processor_name)
        assert processor is not None, "Processor must exist"

        new_filepath = video_dir_parent / new_file_name

        print(f"Moving {file_path} to {new_filepath}")
        shutil.move(file_path.resolve().as_posix(), new_filepath.resolve().as_posix())
        print(f"Moved to {new_filepath}")

        # Make sure the file was transferred correctly and the hash matches.
        if not new_filepath.exists():
            print(f"File {file_path} was not transferred correctly to {new_filepath}")
            return None

        new_hash = get_video_hash(new_filepath)
        if new_hash != video_hash:
            print(f"Hash of file {file_path} is not correct")
            return None

        # Make sure that no other file with the same hash exists.
        if cls.objects.filter(video_hash=video_hash).exists():
            # log and print warning
            print(f"File with hash {video_hash} already exists")
            return None

        else:
            print(center)
            # Create a new instance of RawVideoFile.
            # NOTE(review): prediction_dir is not set here — presumably
            # assigned later by a migration/worker; confirm before relying on it.
            raw_video_file = cls(
                uuid=uuid,
                file=new_filepath.resolve().as_posix(),
                center=center,
                processor=processor,
                original_file_name=original_file_name,
                video_hash=video_hash,
                frame_dir=framedir.as_posix(),
            )

            # Save the instance to the database.
            raw_video_file.save()

            return raw_video_file

    def __str__(self):
        return self.file.name

    def get_endo_roi(self):
        """Return the endoscope ROI dict (x, y, width, height) from video_meta."""
        endo_roi = self.video_meta.get_endo_roi()
        return endo_roi

    def save(self, *args, **kwargs):
        """Save the model; lazily create and initialize video_meta first."""
        if self.video_meta is None:
            self.video_meta = VideoMeta.objects.create(
                center=self.center, processor=self.processor
            )
            self.video_meta.initialize_ffmpeg_meta(self.file.path)
        super(RawVideoFile, self).save(*args, **kwargs)

    def extract_frames(
        self,
        quality: int = 2,
        frame_dir: Path = None,
        overwrite: bool = False,
        ext="jpg",
    ):
        """
        Extract frames from the video file and save them to the frame_dir.
        For this, ffmpeg must be available in the current environment.

        Sets state_frames_extracted in memory but does NOT save; callers
        are expected to persist the state change.
        """
        frame_dir = Path(self.frame_dir) if frame_dir is None else Path(frame_dir)

        if not frame_dir.exists():
            frame_dir.mkdir(parents=True, exist_ok=True)

        # BUG FIX: the overwrite check previously globbed a hard-coded
        # "*.jpg", so a non-default ext never detected existing frames.
        if not overwrite and len(list(frame_dir.glob(f"*.{ext}"))) > 0:
            print(f"Frames already extracted for {self.file.name}")
            return

        video_path = Path(self.file.path).resolve().as_posix()
        frame_path_string = frame_dir.resolve().as_posix()
        command = [
            "ffmpeg",
            "-i",
            video_path,
            "-q:v",
            str(quality),  # JPEG quality scale (lower is better)
            os.path.join(frame_path_string, f"frame_%07d.{ext}"),
        ]

        # Ensure FFmpeg is available.
        if not shutil.which("ffmpeg"):
            raise EnvironmentError(
                "FFmpeg could not be found. Ensure it is installed and in your PATH."
            )

        # Extract frames from the video file.
        result = subprocess.run(command, capture_output=True, text=True)
        if result.returncode != 0:
            raise Exception(f"Error extracting frames: {result.stderr}")

        self.state_frames_extracted = True

        return f"Frames extracted to {frame_dir} ({frame_path_string}) with quality {quality}"

    def delete_frames(self):
        """
        Delete frames extracted from the video file.
        """
        frame_dir = Path(self.frame_dir)
        if frame_dir.exists():
            shutil.rmtree(frame_dir)
            self.state_frames_extracted = False
            self.save()
            return f"Frames deleted from {frame_dir}"
        else:
            return f"No frames to delete for {self.file.name}"

    def get_frame_path(self, n: int = 0, ext: str = "jpg"):
        """
        Get the path to the n-th frame extracted from the video file.
        Note that the frame numbering starts at 1 in our naming convention,
        so the 0-based index n maps to file number n + 1.
        """
        frame_dir = Path(self.frame_dir)
        return frame_dir / f"frame_{n + 1:07d}.{ext}"

    def get_frame_paths(self):
        """Return all extracted frame paths sorted by frame index.

        Returns None when frames are not marked as extracted, and an empty
        list when the frame directory is empty (the original crashed on
        `zip(*[])` in that case).
        """
        if not self.state_frames_extracted:
            return None
        frame_dir = Path(self.frame_dir)
        paths = list(frame_dir.glob("*"))
        if not paths:
            return []
        # "frame_0000001.jpg" -> 1; sort ascending by that index.
        paths.sort(key=lambda p: int(p.stem.split("_")[1]))
        return paths

    def get_prediction_dir(self):
        """Return the directory prediction JSON files are written to."""
        # BUG FIX: the original read `elf.prediction_dir` (NameError).
        return Path(self.prediction_dir)

    def get_predictions_path(self, suffix=".json"):
        """Path of the raw per-frame predictions file."""
        pred_dir = self.get_prediction_dir()
        return pred_dir.joinpath("predictions").with_suffix(suffix)

    def get_smooth_predictions_path(self, suffix=".json"):
        """Path of the smoothed predictions file."""
        pred_dir = self.get_prediction_dir()
        return pred_dir.joinpath("smooth_predictions").with_suffix(suffix)

    def get_binary_predictions_path(self, suffix=".json"):
        """Path of the binarized predictions file."""
        pred_dir = self.get_prediction_dir()
        return pred_dir.joinpath("binary_predictions").with_suffix(suffix)

    def get_raw_sequences_path(self, suffix=".json"):
        """Path of the raw label-sequence file."""
        pred_dir = self.get_prediction_dir()
        return pred_dir.joinpath("raw_sequences").with_suffix(suffix)

    def get_filtered_sequences_path(self, suffix=".json"):
        """Path of the filtered label-sequence file."""
        pred_dir = self.get_prediction_dir()
        return pred_dir.joinpath("filtered_sequences").with_suffix(suffix)

    def extract_text_information(self, frame_fraction: float = 0.001):
        """
        Extract text information from the video file.
        Makes sure that frames are extracted and then processes the frames.
        Gets all frames from frame_dir and selects a fraction of them to
        process (at least 1); returns a dict mapping each processor ROI to
        the most frequent text value observed, or None when no frames exist.
        """
        if not self.state_frames_extracted:
            print(f"Frames not extracted for {self.file.name}")
            return None

        processor = self.processor

        frame_dir = Path(self.frame_dir)
        frames = list(frame_dir.glob("*"))
        n_frames = len(frames)
        # ROBUSTNESS: an empty frame dir previously produced a zero slice
        # step (ValueError); bail out explicitly instead.
        if n_frames == 0:
            print(f"No frames found for {self.file.name}")
            return None
        n_frames_to_process = max(1, int(frame_fraction * n_frames))

        # Select evenly spaced frames.
        frames = frames[:: n_frames // n_frames_to_process]

        # Extract text from each frame into a defaultdict of lists, then
        # keep the most frequent value per ROI.
        rois_texts = defaultdict(list)

        print(f"Processing {n_frames_to_process} frames from {self.file.name}")
        for frame_path in frames[:n_frames_to_process]:
            extracted_texts = extract_text_from_rois(frame_path, processor)
            for roi, text in extracted_texts.items():
                rois_texts[roi].append(text)

        # Get the most frequent non-empty text value for each ROI.
        for key in rois_texts.keys():
            counter = Counter([text for text in rois_texts[key] if text])
            rois_texts[key] = counter.most_common(1)[0][0] if counter else None

        return rois_texts

    def update_text_metadata(self, ocr_frame_fraction=0.001):
        """Run OCR over a fraction of frames and store the result as
        SensitiveMeta; marks state_sensitive_data_retrieved and saves.

        The resulting dict depends on the ROIs defined for this processor type.
        """
        print(f"Updating metadata for {self.file.name}")
        texts = self.extract_text_information(ocr_frame_fraction)

        self.sensitive_meta = SensitiveMeta.create_from_dict(texts)
        self.state_sensitive_data_retrieved = True
        self.save()

    def update_video_meta(self):
        """Create video_meta from the file if missing, else refresh it."""
        video_meta = self.video_meta
        video_path = Path(self.file.path)

        if video_meta is None:
            video_meta = VideoMeta.create_from_video(video_path)
            self.video_meta = video_meta
            self.save()
        else:
            video_meta.update_meta(video_path)

    def get_fps(self):
        """Return the video's frames-per-second, initializing metadata lazily."""
        if self.video_meta is None:
            self.update_video_meta()

        if self.video_meta.ffmpeg_meta is None:
            self.video_meta.initialize_ffmpeg_meta(self.file.path)

        return self.video_meta.get_fps()
|