PyPI - deepdoctection - Versions diffs - 0.32__py3-none-any.whl → 0.34__py3-none-any.whl - Mend

deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (111)

deepdoctection/__init__.py +8 -25
deepdoctection/analyzer/dd.py +84 -71
deepdoctection/dataflow/common.py +9 -5
deepdoctection/dataflow/custom.py +5 -5
deepdoctection/dataflow/custom_serialize.py +75 -18
deepdoctection/dataflow/parallel_map.py +3 -3
deepdoctection/dataflow/serialize.py +4 -4
deepdoctection/dataflow/stats.py +3 -3
deepdoctection/datapoint/annotation.py +78 -56
deepdoctection/datapoint/box.py +7 -7
deepdoctection/datapoint/convert.py +6 -6
deepdoctection/datapoint/image.py +157 -75
deepdoctection/datapoint/view.py +175 -151
deepdoctection/datasets/adapter.py +30 -24
deepdoctection/datasets/base.py +10 -10
deepdoctection/datasets/dataflow_builder.py +3 -3
deepdoctection/datasets/info.py +23 -25
deepdoctection/datasets/instances/doclaynet.py +48 -49
deepdoctection/datasets/instances/fintabnet.py +44 -45
deepdoctection/datasets/instances/funsd.py +23 -23
deepdoctection/datasets/instances/iiitar13k.py +8 -8
deepdoctection/datasets/instances/layouttest.py +2 -2
deepdoctection/datasets/instances/publaynet.py +3 -3
deepdoctection/datasets/instances/pubtables1m.py +18 -18
deepdoctection/datasets/instances/pubtabnet.py +30 -29
deepdoctection/datasets/instances/rvlcdip.py +28 -29
deepdoctection/datasets/instances/xfund.py +51 -30
deepdoctection/datasets/save.py +6 -6
deepdoctection/eval/accmetric.py +32 -33
deepdoctection/eval/base.py +8 -9
deepdoctection/eval/cocometric.py +13 -12
deepdoctection/eval/eval.py +32 -26
deepdoctection/eval/tedsmetric.py +16 -12
deepdoctection/eval/tp_eval_callback.py +7 -16
deepdoctection/extern/base.py +339 -134
deepdoctection/extern/d2detect.py +69 -89
deepdoctection/extern/deskew.py +11 -10
deepdoctection/extern/doctrocr.py +81 -64
deepdoctection/extern/fastlang.py +23 -16
deepdoctection/extern/hfdetr.py +53 -38
deepdoctection/extern/hflayoutlm.py +216 -155
deepdoctection/extern/hflm.py +35 -30
deepdoctection/extern/model.py +433 -255
deepdoctection/extern/pdftext.py +15 -15
deepdoctection/extern/pt/ptutils.py +4 -2
deepdoctection/extern/tessocr.py +39 -38
deepdoctection/extern/texocr.py +14 -16
deepdoctection/extern/tp/tfutils.py +16 -2
deepdoctection/extern/tp/tpcompat.py +11 -7
deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
deepdoctection/extern/tpdetect.py +40 -45
deepdoctection/mapper/cats.py +36 -40
deepdoctection/mapper/cocostruct.py +16 -12
deepdoctection/mapper/d2struct.py +22 -22
deepdoctection/mapper/hfstruct.py +7 -7
deepdoctection/mapper/laylmstruct.py +22 -24
deepdoctection/mapper/maputils.py +9 -10
deepdoctection/mapper/match.py +33 -2
deepdoctection/mapper/misc.py +6 -7
deepdoctection/mapper/pascalstruct.py +4 -4
deepdoctection/mapper/prodigystruct.py +6 -6
deepdoctection/mapper/pubstruct.py +84 -92
deepdoctection/mapper/tpstruct.py +3 -3
deepdoctection/mapper/xfundstruct.py +33 -33
deepdoctection/pipe/anngen.py +39 -14
deepdoctection/pipe/base.py +68 -99
deepdoctection/pipe/common.py +181 -85
deepdoctection/pipe/concurrency.py +14 -10
deepdoctection/pipe/doctectionpipe.py +24 -21
deepdoctection/pipe/language.py +20 -25
deepdoctection/pipe/layout.py +18 -16
deepdoctection/pipe/lm.py +49 -47
deepdoctection/pipe/order.py +63 -65
deepdoctection/pipe/refine.py +102 -109
deepdoctection/pipe/segment.py +157 -162
deepdoctection/pipe/sub_layout.py +50 -40
deepdoctection/pipe/text.py +37 -36
deepdoctection/pipe/transform.py +19 -16
deepdoctection/train/d2_frcnn_train.py +27 -25
deepdoctection/train/hf_detr_train.py +22 -18
deepdoctection/train/hf_layoutlm_train.py +49 -48
deepdoctection/train/tp_frcnn_train.py +10 -11
deepdoctection/utils/concurrency.py +1 -1
deepdoctection/utils/context.py +13 -6
deepdoctection/utils/develop.py +4 -4
deepdoctection/utils/env_info.py +52 -14
deepdoctection/utils/file_utils.py +6 -11
deepdoctection/utils/fs.py +41 -14
deepdoctection/utils/identifier.py +2 -2
deepdoctection/utils/logger.py +15 -15
deepdoctection/utils/metacfg.py +7 -7
deepdoctection/utils/pdf_utils.py +39 -14
deepdoctection/utils/settings.py +188 -182
deepdoctection/utils/tqdm.py +1 -1
deepdoctection/utils/transform.py +14 -9
deepdoctection/utils/types.py +104 -0
deepdoctection/utils/utils.py +7 -7
deepdoctection/utils/viz.py +70 -69
{deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
deepdoctection-0.34.dist-info/RECORD +146 -0
{deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
deepdoctection/utils/detection_types.py +0 -68
deepdoctection-0.32.dist-info/RECORD +0 -146
{deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
{deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0

deepdoctection/pipe/anngen.py CHANGED Viewed

@@ -19,11 +19,11 @@
 Module for datapoint populating helpers
 """
 from dataclasses import asdict
-from typing import Dict, List, Mapping, Optional, Union
+from typing import Mapping, Optional, Union
 import numpy as np
-from ..datapoint.annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, SummaryAnnotation
+from ..datapoint.annotation import DEFAULT_CATEGORY_ID, CategoryAnnotation, ContainerAnnotation, ImageAnnotation
 from ..datapoint.box import BoundingBox, local_to_global_coords, rescale_coords
 from ..datapoint.image import Image
 from ..extern.base import DetectionResult
@@ -44,7 +44,7 @@ class DatapointManager:
     def __init__(self, service_id: str, model_id: Optional[str] = None) -> None:
         self._datapoint: Optional[Image] = None
-        self._cache_anns: Dict[str, ImageAnnotation] = {}
+        self._cache_anns: dict[str, ImageAnnotation] = {}
         self.datapoint_is_passed: bool = False
         self.category_id_mapping: Optional[Mapping[int, int]] = None
         self.service_id = service_id
@@ -155,7 +155,7 @@ class DatapointManager:
             ann = ImageAnnotation(
                 category_name=detect_result.class_name,
                 bounding_box=box,
-                category_id=str(detect_result.class_id),
+                category_id=detect_result.class_id,
                 score=detect_result.score,
                 service_id=self.service_id,
                 model_id=self.model_id,
@@ -174,7 +174,7 @@ class DatapointManager:
                     raise ValueError("image cannot be None")
                 ann.image.set_embedding(parent_ann.annotation_id, ann.bounding_box)
                 ann.image.set_embedding(self.datapoint.image_id, ann_global_box)
-                parent_ann.dump_relationship(Relationships.child, ann.annotation_id)
+                parent_ann.dump_relationship(Relationships.CHILD, ann.annotation_id)
             self.datapoint.dump(ann)
             self._cache_anns[ann.annotation_id] = ann
@@ -189,7 +189,7 @@ class DatapointManager:
     def set_category_annotation(
         self,
         category_name: ObjectTypes,
-        category_id: Optional[Union[str, int]],
+        category_id: Optional[int],
         sub_cat_key: ObjectTypes,
         annotation_id: str,
         score: Optional[float] = None,
@@ -216,7 +216,7 @@ class DatapointManager:
         ) as annotation_context:
             cat_ann = CategoryAnnotation(
                 category_name=category_name,
-                category_id=str(category_id),
+                category_id=category_id if category_id is not None else DEFAULT_CATEGORY_ID,
                 score=score,
                 service_id=self.service_id,
                 model_id=self.model_id,
@@ -230,10 +230,10 @@ class DatapointManager:
     def set_container_annotation(
         self,
         category_name: ObjectTypes,
-        category_id: Optional[Union[str, int]],
+        category_id: Optional[int],
         sub_cat_key: ObjectTypes,
         annotation_id: str,
-        value: Union[str, List[str]],
+        value: Union[str, list[str]],
         score: Optional[float] = None,
     ) -> Optional[str]:
         """
@@ -260,7 +260,7 @@ class DatapointManager:
         ) as annotation_context:
             cont_ann = ContainerAnnotation(
                 category_name=category_name,
-                category_id=str(category_id),
+                category_id=category_id if category_id is not None else DEFAULT_CATEGORY_ID,
                 value=value,
                 score=score,
                 service_id=self.service_id,
@@ -272,6 +272,33 @@ class DatapointManager:
             return None
         return cont_ann.annotation_id
+    def set_relationship_annotation(
+        self, relationship_name: ObjectTypes, target_annotation_id: str, annotation_id: str
+    ) -> Optional[str]:
+        """
+        Create a relationship annotation and dump it to the target annotation.
+        :param relationship_name: The relationship key
+        :param target_annotation_id: Annotation_id of the parent `ImageAnnotation`
+        :param annotation_id: The annotation_id to dump the relationship to
+        :return: Annotation_id of the parent `ImageAnnotation` for references if the dump has been successful
+        """
+        self.assert_datapoint_passed()
+        with MappingContextManager(
+            dp_name=self.datapoint.file_name,
+            filter_level="annotation",
+            relationship_annotation={
+                "relationship_name": relationship_name.value,
+                "target_annotation_id": target_annotation_id,
+                "annotation_id": annotation_id,
+            },
+        ) as annotation_context:
+            self._cache_anns[target_annotation_id].dump_relationship(relationship_name, annotation_id)
+        if annotation_context.context_error:
+            return None
+        return target_annotation_id
     def set_summary_annotation(
         self,
         summary_key: ObjectTypes,
@@ -299,8 +326,6 @@ class DatapointManager:
         else:
             image = self.datapoint
         assert image is not None, image
-        if image.summary is None:
-            image.summary = SummaryAnnotation()
         ann: Union[CategoryAnnotation, ContainerAnnotation]
         with MappingContextManager(
@@ -316,7 +341,7 @@ class DatapointManager:
             if summary_value is not None:
                 ann = ContainerAnnotation(
                     category_name=summary_name,
-                    category_id=str(summary_number) if summary_number is not None else "",
+                    category_id=summary_number if summary_number else DEFAULT_CATEGORY_ID,
                     value=summary_value,
                     score=summary_score,
                     service_id=self.service_id,
@@ -326,7 +351,7 @@ class DatapointManager:
             else:
                 ann = CategoryAnnotation(
                     category_name=summary_name,
-                    category_id=str(summary_number) if summary_number is not None else "",
+                    category_id=summary_number if summary_number is not None else DEFAULT_CATEGORY_ID,
                     score=summary_score,
                     service_id=self.service_id,
                     model_id=self.model_id,

deepdoctection/pipe/base.py CHANGED Viewed

@@ -19,21 +19,33 @@
 """
 Module for the base class for building pipelines
 """
+from __future__ import annotations
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from copy import deepcopy
-from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional, Set, Union
+from dataclasses import dataclass, field
+from typing import Any, Mapping, Optional, Union
 from uuid import uuid1
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
-from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner, TextRecognizer
 from ..utils.context import timed_operation
-from ..utils.detection_types import JsonDict
 from ..utils.identifier import get_uuid_from_str
+from ..utils.settings import ObjectTypes
 from .anngen import DatapointManager
+@dataclass(frozen=True)
+class MetaAnnotation:
+    """An immutable dataclass that stores information about what `Image` are being
+    modified through a pipeline component."""
+    image_annotations: tuple[ObjectTypes, ...] = field(default=())
+    sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
+    relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
+    summaries: tuple[ObjectTypes, ...] = field(default=())
 class PipelineComponent(ABC):
     """
     Base class for pipeline components. Pipeline components are the parts that make up a pipeline. They contain the
@@ -54,15 +66,14 @@ class PipelineComponent(ABC):
                  planned.
     """
-    def __init__(self, name: str):
+    def __init__(self, name: str, model_id: Optional[str] = None) -> None:
         """
         :param name: The name of the pipeline component. The name will be used to identify a pipeline component in a
                      pipeline. Use something that describe the task of the pipeline.
         """
         self.name = name
         self.service_id = self.get_service_id()
-        self._meta_has_all_types()
-        self.dp_manager = DatapointManager(self.service_id)
+        self.dp_manager = DatapointManager(self.service_id, model_id)
         self.timer_on = False
     @abstractmethod
@@ -108,14 +119,14 @@ class PipelineComponent(ABC):
         return MapData(df, self.pass_datapoint)
     @abstractmethod
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         """
         Clone an instance
         """
         raise NotImplementedError()
     @abstractmethod
-    def get_meta_annotation(self) -> JsonDict:
+    def get_meta_annotation(self) -> MetaAnnotation:
         """
         Get a dict of list of annotation type. The dict must contain
@@ -127,96 +138,53 @@ class PipelineComponent(ABC):
         """
         raise NotImplementedError()
-    def _meta_has_all_types(self) -> None:
-        if not {"image_annotations", "sub_categories", "relationships", "summaries"}.issubset(
-            set(self.get_meta_annotation().keys())
-        ):
-            raise TypeError(
-                f" 'get_meta_annotation' must return dict with all required keys. "
-                f"Got {self.get_meta_annotation().keys()}"
-            )
     def get_service_id(self) -> str:
         """
         Get the generating model
         """
         return get_uuid_from_str(self.name)[:8]
-class PredictorPipelineComponent(PipelineComponent, ABC):
-    """
-    Lightweight abstract pipeline component class with `predictor`. Object detectors that only read in images as
-    numpy array and return `DetectResult`s are currently permitted.
-    """
-    def __init__(
-        self,
-        name: str,
-        predictor: Union[ObjectDetector, PdfMiner, TextRecognizer],
-    ) -> None:
+    def clear_predictor(self) -> None:
         """
-        :param name: Will be passed to base class
-        :param predictor: An Object detector for predicting
+        Clear the predictor of the pipeline component if it has one. Needed for model updates during training.
         """
-        self.predictor = predictor
-        super().__init__(name)
-        self.dp_manager = DatapointManager(self.service_id, self.predictor.model_id)
+        raise NotImplementedError(
+            "Maybe you forgot to implement this method in your pipeline component. This might "
+            "be the case when you run evaluation during training and need to update the "
+            "trained model in your pipeline component."
+        )
-    @abstractmethod
-    def clone(self) -> "PredictorPipelineComponent":
-        raise NotImplementedError()
-class LanguageModelPipelineComponent(PipelineComponent, ABC):
-    """
-    Abstract pipeline component class with two attributes `tokenizer` and `language_model` .
-    """
-    def __init__(
-        self,
-        name: str,
-        tokenizer: Any,
-        mapping_to_lm_input_func: Callable[..., Callable[[Image], Optional[Any]]],
-    ):
+    def has_predictor(self) -> bool:
         """
-        :param name: Will be passed to base class
-        :param tokenizer: Tokenizer, typing allows currently anything. This will be changed in the future
-        :param mapping_to_lm_input_func: Function mapping image to layout language model features
+        Check if the pipeline component has a predictor
         """
+        if hasattr(self, "predictor"):
+            if self.predictor is not None:
+                return True
+        return False
-        self.tokenizer = tokenizer
-        super().__init__(name)
-        self.mapping_to_lm_input_func = mapping_to_lm_input_func
-    @abstractmethod
-    def clone(self) -> "LanguageModelPipelineComponent":
+    def _undo(self, dp: Image) -> Image:
         """
-        Clone an instance
+        Undo the processing of the pipeline component. It will remove `ImageAnnotation`, `CategoryAnnotation` and
+        `ContainerAnnotation` with the service_id of the pipeline component.
         """
-        raise NotImplementedError()
-class ImageTransformPipelineComponent(PipelineComponent, ABC):
-    """
-    Abstract pipeline component class with one model to transform images. This component is meant to be used at the
-    beginning of a pipeline
-    """
+        if self.timer_on:
+            with timed_operation(self.__class__.__name__):
+                self.dp_manager.datapoint = dp
+                dp.remove(service_ids=self.service_id)
+        else:
+            self.dp_manager.datapoint = dp
+            dp.remove(service_ids=self.service_id)
+        return self.dp_manager.datapoint
-    def __init__(self, name: str, transform_predictor: ImageTransformer):
+    def undo(self, df: DataFlow) -> DataFlow:
         """
-        :param name: Will be passed to base class
-        :param transform_predictor: An `ImageTransformer` for image transformation
-        """
-        self.transform_predictor = transform_predictor
-        super().__init__(name)
+        Mapping a datapoint via `_undo` within a dataflow pipeline
-    @abstractmethod
-    def clone(self) -> "ImageTransformPipelineComponent":
-        """
-        Clone an instance
+        :param df: An input dataflow of Images
+        :return: A output dataflow of Images
         """
-        raise NotImplementedError()
+        return MapData(df, self._undo)
 class Pipeline(ABC):
@@ -262,7 +230,7 @@ class Pipeline(ABC):
            df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
     """
-    def __init__(self, pipeline_component_list: List[PipelineComponent]) -> None:
+    def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
         """
         :param pipeline_component_list: A list of pipeline components.
         """
@@ -305,7 +273,7 @@ class Pipeline(ABC):
         """
         raise NotImplementedError()
-    def get_meta_annotation(self) -> JsonDict:
+    def get_meta_annotation(self) -> MetaAnnotation:
         """
         Collects meta annotations from all pipeline components and summarizes the returned results
@@ -313,23 +281,24 @@ class Pipeline(ABC):
                  names and generated sub categories), relationships (dict with category names and generated
                  relationships) as well as summaries (list with sub categories)
         """
-        pipeline_populations: Dict[str, Union[List[str], DefaultDict[str, Set[str]]]] = {
-            "image_annotations": [],
-            "sub_categories": defaultdict(set),
-            "relationships": defaultdict(set),
-            "summaries": [],
-        }
+        image_annotations: list[ObjectTypes] = []
+        sub_categories = defaultdict(set)
+        relationships = defaultdict(set)
+        summaries: list[ObjectTypes] = []
         for component in self.pipe_component_list:
-            meta_anns = deepcopy(component.get_meta_annotation())
-            pipeline_populations["image_annotations"].extend(meta_anns["image_annotations"])  # type: ignore
-            for key, value in meta_anns["sub_categories"].items():
-                pipeline_populations["sub_categories"][key].update(value)
-            for key, value in meta_anns["relationships"].items():
-                pipeline_populations["relationships"][key].update(value)
-            pipeline_populations["summaries"].extend(meta_anns["summaries"])  # type: ignore
-        pipeline_populations["sub_categories"] = dict(pipeline_populations["sub_categories"])  # type: ignore
-        pipeline_populations["relationships"] = dict(pipeline_populations["relationships"])  # type: ignore
-        return pipeline_populations
+            meta_anns = component.get_meta_annotation()
+            image_annotations.extend(meta_anns.image_annotations)
+            for key, value in meta_anns.sub_categories.items():
+                sub_categories[key].update(value)
+            for key, value in meta_anns.relationships.items():
+                relationships[key].update(value)
+            summaries.extend(meta_anns.summaries)
+        return MetaAnnotation(
+            image_annotations=tuple(image_annotations),
+            sub_categories=dict(sub_categories),
+            relationships=dict(relationships),
+            summaries=tuple(summaries),
+        )
     def get_pipeline_info(
         self, service_id: Optional[str] = None, name: Optional[str] = None

deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl