PyPI - deepdoctection - Versions diffs - 0.31__py3-none-any.whl → 0.33__py3-none-any.whl - Mend

deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (131) hide show

deepdoctection/__init__.py +16 -29
deepdoctection/analyzer/dd.py +70 -59
deepdoctection/configs/conf_dd_one.yaml +34 -31
deepdoctection/dataflow/common.py +9 -5
deepdoctection/dataflow/custom.py +5 -5
deepdoctection/dataflow/custom_serialize.py +75 -18
deepdoctection/dataflow/parallel_map.py +3 -3
deepdoctection/dataflow/serialize.py +4 -4
deepdoctection/dataflow/stats.py +3 -3
deepdoctection/datapoint/annotation.py +41 -56
deepdoctection/datapoint/box.py +9 -8
deepdoctection/datapoint/convert.py +6 -6
deepdoctection/datapoint/image.py +56 -44
deepdoctection/datapoint/view.py +245 -150
deepdoctection/datasets/__init__.py +1 -4
deepdoctection/datasets/adapter.py +35 -26
deepdoctection/datasets/base.py +14 -12
deepdoctection/datasets/dataflow_builder.py +3 -3
deepdoctection/datasets/info.py +24 -26
deepdoctection/datasets/instances/doclaynet.py +51 -51
deepdoctection/datasets/instances/fintabnet.py +46 -46
deepdoctection/datasets/instances/funsd.py +25 -24
deepdoctection/datasets/instances/iiitar13k.py +13 -10
deepdoctection/datasets/instances/layouttest.py +4 -3
deepdoctection/datasets/instances/publaynet.py +5 -5
deepdoctection/datasets/instances/pubtables1m.py +24 -21
deepdoctection/datasets/instances/pubtabnet.py +32 -30
deepdoctection/datasets/instances/rvlcdip.py +30 -30
deepdoctection/datasets/instances/xfund.py +26 -26
deepdoctection/datasets/save.py +6 -6
deepdoctection/eval/__init__.py +1 -4
deepdoctection/eval/accmetric.py +32 -33
deepdoctection/eval/base.py +8 -9
deepdoctection/eval/cocometric.py +15 -13
deepdoctection/eval/eval.py +41 -37
deepdoctection/eval/tedsmetric.py +30 -23
deepdoctection/eval/tp_eval_callback.py +16 -19
deepdoctection/extern/__init__.py +2 -7
deepdoctection/extern/base.py +339 -134
deepdoctection/extern/d2detect.py +85 -113
deepdoctection/extern/deskew.py +14 -11
deepdoctection/extern/doctrocr.py +141 -130
deepdoctection/extern/fastlang.py +27 -18
deepdoctection/extern/hfdetr.py +71 -62
deepdoctection/extern/hflayoutlm.py +504 -211
deepdoctection/extern/hflm.py +230 -0
deepdoctection/extern/model.py +488 -302
deepdoctection/extern/pdftext.py +23 -19
deepdoctection/extern/pt/__init__.py +1 -3
deepdoctection/extern/pt/nms.py +6 -2
deepdoctection/extern/pt/ptutils.py +29 -19
deepdoctection/extern/tessocr.py +39 -38
deepdoctection/extern/texocr.py +18 -18
deepdoctection/extern/tp/tfutils.py +57 -9
deepdoctection/extern/tp/tpcompat.py +21 -14
deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
deepdoctection/extern/tpdetect.py +45 -53
deepdoctection/mapper/__init__.py +3 -8
deepdoctection/mapper/cats.py +27 -29
deepdoctection/mapper/cocostruct.py +10 -10
deepdoctection/mapper/d2struct.py +27 -26
deepdoctection/mapper/hfstruct.py +13 -8
deepdoctection/mapper/laylmstruct.py +178 -37
deepdoctection/mapper/maputils.py +12 -11
deepdoctection/mapper/match.py +2 -2
deepdoctection/mapper/misc.py +11 -9
deepdoctection/mapper/pascalstruct.py +4 -4
deepdoctection/mapper/prodigystruct.py +5 -5
deepdoctection/mapper/pubstruct.py +84 -92
deepdoctection/mapper/tpstruct.py +5 -5
deepdoctection/mapper/xfundstruct.py +33 -33
deepdoctection/pipe/__init__.py +1 -1
deepdoctection/pipe/anngen.py +12 -14
deepdoctection/pipe/base.py +52 -106
deepdoctection/pipe/common.py +72 -59
deepdoctection/pipe/concurrency.py +16 -11
deepdoctection/pipe/doctectionpipe.py +24 -21
deepdoctection/pipe/language.py +20 -25
deepdoctection/pipe/layout.py +20 -16
deepdoctection/pipe/lm.py +75 -105
deepdoctection/pipe/order.py +194 -89
deepdoctection/pipe/refine.py +111 -124
deepdoctection/pipe/segment.py +156 -161
deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
deepdoctection/pipe/text.py +37 -36
deepdoctection/pipe/transform.py +19 -16
deepdoctection/train/__init__.py +6 -12
deepdoctection/train/d2_frcnn_train.py +48 -41
deepdoctection/train/hf_detr_train.py +41 -30
deepdoctection/train/hf_layoutlm_train.py +153 -135
deepdoctection/train/tp_frcnn_train.py +32 -31
deepdoctection/utils/concurrency.py +1 -1
deepdoctection/utils/context.py +13 -6
deepdoctection/utils/develop.py +4 -4
deepdoctection/utils/env_info.py +87 -125
deepdoctection/utils/file_utils.py +6 -11
deepdoctection/utils/fs.py +22 -18
deepdoctection/utils/identifier.py +2 -2
deepdoctection/utils/logger.py +16 -15
deepdoctection/utils/metacfg.py +7 -7
deepdoctection/utils/mocks.py +93 -0
deepdoctection/utils/pdf_utils.py +11 -11
deepdoctection/utils/settings.py +185 -181
deepdoctection/utils/tqdm.py +1 -1
deepdoctection/utils/transform.py +14 -9
deepdoctection/utils/types.py +104 -0
deepdoctection/utils/utils.py +7 -7
deepdoctection/utils/viz.py +74 -72
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
deepdoctection-0.33.dist-info/RECORD +146 -0
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
deepdoctection/utils/detection_types.py +0 -68
deepdoctection-0.31.dist-info/RECORD +0 -144
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0

deepdoctection/mapper/xfundstruct.py CHANGED Viewed

@@ -25,7 +25,6 @@ from itertools import chain
 from typing import Mapping, Optional
 from ..datapoint import BoundingBox, CategoryAnnotation, ContainerAnnotation, Image, ImageAnnotation
-from ..utils.detection_types import JsonDict
 from ..utils.fs import load_image_from_file
 from ..utils.settings import (
     BioTag,
@@ -37,17 +36,18 @@ from ..utils.settings import (
     get_type,
     token_class_tag_to_token_class_with_tag,
 )
+from ..utils.types import FunsdDict
 from .maputils import MappingContextManager, curry, maybe_get_fake_score
 @curry
 def xfund_to_image(
-    dp: JsonDict,
+    dp: FunsdDict,
     load_image: bool,
     fake_score: bool,
-    categories_dict_name_as_key: Mapping[str, str],
+    categories_dict_name_as_key: Mapping[ObjectTypes, int],
     token_class_names_mapping: Mapping[str, str],
-    ner_token_to_id_mapping: Mapping[ObjectTypes, Mapping[ObjectTypes, Mapping[ObjectTypes, str]]],
+    ner_token_to_id_mapping: Mapping[ObjectTypes, Mapping[ObjectTypes, Mapping[ObjectTypes, int]]],
 ) -> Optional[Image]:
     """
     Map a datapoint of annotation structure as given as from xfund or funsd dataset in to an Image structure
@@ -75,9 +75,9 @@ def xfund_to_image(
     _, file_name = os.path.split(full_path)
     external_id = dp.get("uid")
-    tag_to_id_mapping = ner_token_to_id_mapping[LayoutType.word][WordType.tag]
-    token_class_to_id_mapping = ner_token_to_id_mapping[LayoutType.word][WordType.token_class]
-    token_tag_to_id_mapping = ner_token_to_id_mapping[LayoutType.word][WordType.token_tag]
+    tag_to_id_mapping = ner_token_to_id_mapping[LayoutType.WORD][WordType.TAG]
+    token_class_to_id_mapping = ner_token_to_id_mapping[LayoutType.WORD][WordType.TOKEN_CLASS]
+    token_tag_to_id_mapping = ner_token_to_id_mapping[LayoutType.WORD][WordType.TOKEN_TAG]
     with MappingContextManager(file_name) as mapping_context:
         image = Image(file_name=file_name, location=full_path, external_id=external_id)
@@ -101,16 +101,16 @@ def xfund_to_image(
             bbox = BoundingBox(absolute_coords=True, ulx=box[0], uly=box[1], lrx=box[2], lry=box[3])
             score = maybe_get_fake_score(fake_score)
             entity_ann = ImageAnnotation(
-                category_name=LayoutType.text,
+                category_name=LayoutType.TEXT,
                 bounding_box=bbox,
-                category_id=categories_dict_name_as_key[LayoutType.text],
+                category_id=categories_dict_name_as_key[LayoutType.TEXT],
                 score=score,
             )
             category_name = token_class_names_mapping[entity["label"]]
             sub_cat_semantic = CategoryAnnotation(
                 category_name=category_name, category_id=token_class_to_id_mapping[get_type(category_name)]
             )
-            entity_ann.dump_sub_category(WordType.token_class, sub_cat_semantic)
+            entity_ann.dump_sub_category(WordType.TOKEN_CLASS, sub_cat_semantic)
             image.dump(entity_ann)
             words = entity.get("words")
@@ -122,61 +122,61 @@ def xfund_to_image(
                 score = maybe_get_fake_score(fake_score)
                 ann = ImageAnnotation(
-                    category_name=LayoutType.word,
+                    category_name=LayoutType.WORD,
                     bounding_box=bbox,
-                    category_id=categories_dict_name_as_key[LayoutType.word],
+                    category_id=categories_dict_name_as_key[LayoutType.WORD],
                     score=score,
                 )
                 image.dump(ann)
-                entity_ann.dump_relationship(Relationships.child, ann.annotation_id)
+                entity_ann.dump_relationship(Relationships.CHILD, ann.annotation_id)
                 sub_cat_semantic = CategoryAnnotation(
                     category_name=category_name, category_id=token_class_to_id_mapping[get_type(category_name)]
                 )
-                ann.dump_sub_category(WordType.token_class, sub_cat_semantic)
-                sub_cat_chars = ContainerAnnotation(category_name=WordType.characters, value=word["text"])
-                ann.dump_sub_category(WordType.characters, sub_cat_chars)
-                if sub_cat_semantic.category_name == TokenClasses.other:
+                ann.dump_sub_category(WordType.TOKEN_CLASS, sub_cat_semantic)
+                sub_cat_chars = ContainerAnnotation(category_name=WordType.CHARACTERS, value=word["text"])
+                ann.dump_sub_category(WordType.CHARACTERS, sub_cat_chars)
+                if sub_cat_semantic.category_name == TokenClasses.OTHER:
                     sub_cat_tag = CategoryAnnotation(
-                        category_name=BioTag.outside, category_id=tag_to_id_mapping[BioTag.outside]
+                        category_name=BioTag.OUTSIDE, category_id=tag_to_id_mapping[BioTag.OUTSIDE]
                     )
-                    ann.dump_sub_category(WordType.tag, sub_cat_tag)
+                    ann.dump_sub_category(WordType.TAG, sub_cat_tag)
                     # populating ner token to be used for training and evaluation
                     sub_cat_ner_tok = CategoryAnnotation(
-                        category_name=BioTag.outside, category_id=token_tag_to_id_mapping[BioTag.outside]
+                        category_name=BioTag.OUTSIDE, category_id=token_tag_to_id_mapping[BioTag.OUTSIDE]
                     )
-                    ann.dump_sub_category(WordType.token_tag, sub_cat_ner_tok)
+                    ann.dump_sub_category(WordType.TOKEN_TAG, sub_cat_ner_tok)
                 elif not idx:
                     sub_cat_tag = CategoryAnnotation(
-                        category_name=BioTag.begin, category_id=tag_to_id_mapping[BioTag.begin]
+                        category_name=BioTag.BEGIN, category_id=tag_to_id_mapping[BioTag.BEGIN]
                     )
-                    ann.dump_sub_category(WordType.tag, sub_cat_tag)
+                    ann.dump_sub_category(WordType.TAG, sub_cat_tag)
                     sub_cat_ner_tok = CategoryAnnotation(
                         category_name=token_class_tag_to_token_class_with_tag(
-                            get_type(sub_cat_semantic.category_name), BioTag.begin
+                            get_type(sub_cat_semantic.category_name), BioTag.BEGIN
                         ),
                         category_id=token_tag_to_id_mapping[
                             token_class_tag_to_token_class_with_tag(
-                                get_type(sub_cat_semantic.category_name), BioTag.begin
+                                get_type(sub_cat_semantic.category_name), BioTag.BEGIN
                             )
                         ],
                     )
-                    ann.dump_sub_category(WordType.token_tag, sub_cat_ner_tok)
+                    ann.dump_sub_category(WordType.TOKEN_TAG, sub_cat_ner_tok)
                 else:
                     sub_cat_tag = CategoryAnnotation(
-                        category_name=BioTag.inside, category_id=tag_to_id_mapping[BioTag.inside]
+                        category_name=BioTag.INSIDE, category_id=tag_to_id_mapping[BioTag.INSIDE]
                     )
-                    ann.dump_sub_category(WordType.tag, sub_cat_tag)
+                    ann.dump_sub_category(WordType.TAG, sub_cat_tag)
                     sub_cat_ner_tok = CategoryAnnotation(
                         category_name=token_class_tag_to_token_class_with_tag(
-                            get_type(sub_cat_semantic.category_name), BioTag.inside
+                            get_type(sub_cat_semantic.category_name), BioTag.INSIDE
                         ),
                         category_id=token_tag_to_id_mapping[
                             token_class_tag_to_token_class_with_tag(
-                                get_type(sub_cat_semantic.category_name), BioTag.inside
+                                get_type(sub_cat_semantic.category_name), BioTag.INSIDE
                             )
                         ],
                     )
-                    ann.dump_sub_category(WordType.token_tag, sub_cat_ner_tok)
+                    ann.dump_sub_category(WordType.TOKEN_TAG, sub_cat_ner_tok)
                 entity_id_to_ann_id[entity["id"]].append(ann.annotation_id)
                 ann_id_to_entity_id[ann.annotation_id] = entity["id"]
@@ -184,7 +184,7 @@ def xfund_to_image(
             entity_id_to_entity_link_id[entity["id"]].extend(entity["linking"])
         # now populating semantic links
-        word_anns = image.get_annotation(category_names=LayoutType.word)
+        word_anns = image.get_annotation(category_names=LayoutType.WORD)
         for word in word_anns:
             entity_id = ann_id_to_entity_id[word.annotation_id]
             all_linked_entities = list(chain(*entity_id_to_entity_link_id[entity_id]))
@@ -193,7 +193,7 @@ def xfund_to_image(
                 ann_ids.extend(entity_id_to_ann_id[linked_entity])
             for ann_id in ann_ids:
                 if ann_id != word.annotation_id:
-                    word.dump_relationship(Relationships.semantic_entity_link, ann_id)
+                    word.dump_relationship(Relationships.SEMANTIC_ENTITY_LINK, ann_id)
     if mapping_context.context_error:
         return None

deepdoctection/pipe/__init__.py CHANGED Viewed

@@ -22,7 +22,6 @@ Contains pipeline components that can be plugged into each other and predictors
 from .anngen import *
 from .base import *
-from .cell import *
 from .common import *
 from .concurrency import *
 from .doctectionpipe import *
@@ -33,5 +32,6 @@ from .order import *
 from .refine import *
 from .registry import *
 from .segment import *
+from .sub_layout import *
 from .text import *
 from .transform import *

deepdoctection/pipe/anngen.py CHANGED Viewed

@@ -19,11 +19,11 @@
 Module for datapoint populating helpers
 """
 from dataclasses import asdict
-from typing import Dict, List, Mapping, Optional, Union
+from typing import Mapping, Optional, Union
 import numpy as np
-from ..datapoint.annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, SummaryAnnotation
+from ..datapoint.annotation import DEFAULT_CATEGORY_ID, CategoryAnnotation, ContainerAnnotation, ImageAnnotation
 from ..datapoint.box import BoundingBox, local_to_global_coords, rescale_coords
 from ..datapoint.image import Image
 from ..extern.base import DetectionResult
@@ -44,7 +44,7 @@ class DatapointManager:
     def __init__(self, service_id: str, model_id: Optional[str] = None) -> None:
         self._datapoint: Optional[Image] = None
-        self._cache_anns: Dict[str, ImageAnnotation] = {}
+        self._cache_anns: dict[str, ImageAnnotation] = {}
         self.datapoint_is_passed: bool = False
         self.category_id_mapping: Optional[Mapping[int, int]] = None
         self.service_id = service_id
@@ -155,7 +155,7 @@ class DatapointManager:
             ann = ImageAnnotation(
                 category_name=detect_result.class_name,
                 bounding_box=box,
-                category_id=str(detect_result.class_id),
+                category_id=detect_result.class_id,
                 score=detect_result.score,
                 service_id=self.service_id,
                 model_id=self.model_id,
@@ -174,7 +174,7 @@ class DatapointManager:
                     raise ValueError("image cannot be None")
                 ann.image.set_embedding(parent_ann.annotation_id, ann.bounding_box)
                 ann.image.set_embedding(self.datapoint.image_id, ann_global_box)
-                parent_ann.dump_relationship(Relationships.child, ann.annotation_id)
+                parent_ann.dump_relationship(Relationships.CHILD, ann.annotation_id)
             self.datapoint.dump(ann)
             self._cache_anns[ann.annotation_id] = ann
@@ -189,7 +189,7 @@ class DatapointManager:
     def set_category_annotation(
         self,
         category_name: ObjectTypes,
-        category_id: Optional[Union[str, int]],
+        category_id: Optional[int],
         sub_cat_key: ObjectTypes,
         annotation_id: str,
         score: Optional[float] = None,
@@ -216,7 +216,7 @@ class DatapointManager:
         ) as annotation_context:
             cat_ann = CategoryAnnotation(
                 category_name=category_name,
-                category_id=str(category_id),
+                category_id=category_id if category_id is not None else DEFAULT_CATEGORY_ID,
                 score=score,
                 service_id=self.service_id,
                 model_id=self.model_id,
@@ -230,10 +230,10 @@ class DatapointManager:
     def set_container_annotation(
         self,
         category_name: ObjectTypes,
-        category_id: Optional[Union[str, int]],
+        category_id: Optional[int],
         sub_cat_key: ObjectTypes,
         annotation_id: str,
-        value: Union[str, List[str]],
+        value: Union[str, list[str]],
         score: Optional[float] = None,
     ) -> Optional[str]:
         """
@@ -260,7 +260,7 @@ class DatapointManager:
         ) as annotation_context:
             cont_ann = ContainerAnnotation(
                 category_name=category_name,
-                category_id=str(category_id),
+                category_id=category_id if category_id is not None else DEFAULT_CATEGORY_ID,
                 value=value,
                 score=score,
                 service_id=self.service_id,
@@ -299,8 +299,6 @@ class DatapointManager:
         else:
             image = self.datapoint
         assert image is not None, image
-        if image.summary is None:
-            image.summary = SummaryAnnotation()
         ann: Union[CategoryAnnotation, ContainerAnnotation]
         with MappingContextManager(
@@ -316,7 +314,7 @@ class DatapointManager:
             if summary_value is not None:
                 ann = ContainerAnnotation(
                     category_name=summary_name,
-                    category_id=str(summary_number) if summary_number is not None else "",
+                    category_id=summary_number if summary_number else DEFAULT_CATEGORY_ID,
                     value=summary_value,
                     score=summary_score,
                     service_id=self.service_id,
@@ -326,7 +324,7 @@ class DatapointManager:
             else:
                 ann = CategoryAnnotation(
                     category_name=summary_name,
-                    category_id=str(summary_number) if summary_number is not None else "",
+                    category_id=summary_number if summary_number is not None else DEFAULT_CATEGORY_ID,
                     score=summary_score,
                     service_id=self.service_id,
                     model_id=self.model_id,

deepdoctection/pipe/base.py CHANGED Viewed

@@ -19,21 +19,33 @@
 """
 Module for the base class for building pipelines
 """
+from __future__ import annotations
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from copy import deepcopy
-from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional, Set, Union
+from dataclasses import dataclass, field
+from typing import Any, Mapping, Optional, Union
 from uuid import uuid1
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
-from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner, TextRecognizer
 from ..utils.context import timed_operation
-from ..utils.detection_types import JsonDict
 from ..utils.identifier import get_uuid_from_str
+from ..utils.settings import ObjectTypes
 from .anngen import DatapointManager
+@dataclass(frozen=True)
+class MetaAnnotation:
+    """An immutable dataclass that stores information about what `Image` are being
+    modified through a pipeline component."""
+    image_annotations: tuple[ObjectTypes, ...] = field(default=())
+    sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
+    relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
+    summaries: tuple[ObjectTypes, ...] = field(default=())
 class PipelineComponent(ABC):
     """
     Base class for pipeline components. Pipeline components are the parts that make up a pipeline. They contain the
@@ -54,15 +66,14 @@ class PipelineComponent(ABC):
                  planned.
     """
-    def __init__(self, name: str):
+    def __init__(self, name: str, model_id: Optional[str] = None) -> None:
         """
         :param name: The name of the pipeline component. The name will be used to identify a pipeline component in a
                      pipeline. Use something that describe the task of the pipeline.
         """
         self.name = name
         self.service_id = self.get_service_id()
-        self._meta_has_all_types()
-        self.dp_manager = DatapointManager(self.service_id)
+        self.dp_manager = DatapointManager(self.service_id, model_id)
         self.timer_on = False
     @abstractmethod
@@ -108,14 +119,14 @@ class PipelineComponent(ABC):
         return MapData(df, self.pass_datapoint)
     @abstractmethod
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         """
         Clone an instance
         """
         raise NotImplementedError()
     @abstractmethod
-    def get_meta_annotation(self) -> JsonDict:
+    def get_meta_annotation(self) -> MetaAnnotation:
         """
         Get a dict of list of annotation type. The dict must contain
@@ -127,96 +138,30 @@ class PipelineComponent(ABC):
         """
         raise NotImplementedError()
-    def _meta_has_all_types(self) -> None:
-        if not {"image_annotations", "sub_categories", "relationships", "summaries"}.issubset(
-            set(self.get_meta_annotation().keys())
-        ):
-            raise TypeError(
-                f" 'get_meta_annotation' must return dict with all required keys. "
-                f"Got {self.get_meta_annotation().keys()}"
-            )
     def get_service_id(self) -> str:
         """
         Get the generating model
         """
         return get_uuid_from_str(self.name)[:8]
-class PredictorPipelineComponent(PipelineComponent, ABC):
-    """
-    Lightweight abstract pipeline component class with `predictor`. Object detectors that only read in images as
-    numpy array and return `DetectResult`s are currently permitted.
-    """
-    def __init__(
-        self,
-        name: str,
-        predictor: Union[ObjectDetector, PdfMiner, TextRecognizer],
-    ) -> None:
+    def clear_predictor(self) -> None:
         """
-        :param name: Will be passed to base class
-        :param predictor: An Object detector for predicting
+        Clear the predictor of the pipeline component if it has one. Needed for model updates during training.
         """
-        self.predictor = predictor
-        super().__init__(name)
-        self.dp_manager = DatapointManager(self.service_id, self.predictor.model_id)
+        raise NotImplementedError(
+            "Maybe you forgot to implement this method in your pipeline component. This might "
+            "be the case when you run evaluation during training and need to update the "
+            "trained model in your pipeline component."
+        )
-    @abstractmethod
-    def clone(self) -> "PredictorPipelineComponent":
-        raise NotImplementedError()
-class LanguageModelPipelineComponent(PipelineComponent, ABC):
-    """
-    Abstract pipeline component class with two attributes `tokenizer` and `language_model` .
-    """
-    def __init__(
-        self,
-        name: str,
-        tokenizer: Any,
-        mapping_to_lm_input_func: Callable[..., Callable[[Image], Optional[Any]]],
-    ):
+    def has_predictor(self) -> bool:
         """
-        :param name: Will be passed to base class
-        :param tokenizer: Tokenizer, typing allows currently anything. This will be changed in the future
-        :param mapping_to_lm_input_func: Function mapping image to layout language model features
+        Check if the pipeline component has a predictor
         """
-        self.tokenizer = tokenizer
-        super().__init__(name)
-        self.mapping_to_lm_input_func = mapping_to_lm_input_func
-    @abstractmethod
-    def clone(self) -> "LanguageModelPipelineComponent":
-        """
-        Clone an instance
-        """
-        raise NotImplementedError()
-class ImageTransformPipelineComponent(PipelineComponent, ABC):
-    """
-    Abstract pipeline component class with one model to transform images. This component is meant to be used at the
-    beginning of a pipeline
-    """
-    def __init__(self, name: str, transform_predictor: ImageTransformer):
-        """
-        :param name: Will be passed to base class
-        :param transform_predictor: An `ImageTransformer` for image transformation
-        """
-        self.transform_predictor = transform_predictor
-        super().__init__(name)
-    @abstractmethod
-    def clone(self) -> "ImageTransformPipelineComponent":
-        """
-        Clone an instance
-        """
-        raise NotImplementedError()
+        if hasattr(self, "predictor"):
+            if self.predictor is not None:
+                return True
+        return False
 class Pipeline(ABC):
@@ -262,7 +207,7 @@ class Pipeline(ABC):
            df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
     """
-    def __init__(self, pipeline_component_list: List[PipelineComponent]) -> None:
+    def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
         """
         :param pipeline_component_list: A list of pipeline components.
         """
@@ -305,7 +250,7 @@ class Pipeline(ABC):
         """
         raise NotImplementedError()
-    def get_meta_annotation(self) -> JsonDict:
+    def get_meta_annotation(self) -> MetaAnnotation:
         """
         Collects meta annotations from all pipeline components and summarizes the returned results
@@ -313,23 +258,24 @@ class Pipeline(ABC):
                  names and generated sub categories), relationships (dict with category names and generated
                  relationships) as well as summaries (list with sub categories)
         """
-        pipeline_populations: Dict[str, Union[List[str], DefaultDict[str, Set[str]]]] = {
-            "image_annotations": [],
-            "sub_categories": defaultdict(set),
-            "relationships": defaultdict(set),
-            "summaries": [],
-        }
+        image_annotations: list[ObjectTypes] = []
+        sub_categories = defaultdict(set)
+        relationships = defaultdict(set)
+        summaries: list[ObjectTypes] = []
         for component in self.pipe_component_list:
-            meta_anns = deepcopy(component.get_meta_annotation())
-            pipeline_populations["image_annotations"].extend(meta_anns["image_annotations"])  # type: ignore
-            for key, value in meta_anns["sub_categories"].items():
-                pipeline_populations["sub_categories"][key].update(value)
-            for key, value in meta_anns["relationships"].items():
-                pipeline_populations["relationships"][key].update(value)
-            pipeline_populations["summaries"].extend(meta_anns["summaries"])  # type: ignore
-        pipeline_populations["sub_categories"] = dict(pipeline_populations["sub_categories"])  # type: ignore
-        pipeline_populations["relationships"] = dict(pipeline_populations["relationships"])  # type: ignore
-        return pipeline_populations
+            meta_anns = component.get_meta_annotation()
+            image_annotations.extend(meta_anns.image_annotations)
+            for key, value in meta_anns.sub_categories.items():
+                sub_categories[key].update(value)
+            for key, value in meta_anns.relationships.items():
+                relationships[key].update(value)
+            summaries.extend(meta_anns.summaries)
+        return MetaAnnotation(
+            image_annotations=tuple(image_annotations),
+            sub_categories=dict(sub_categories),
+            relationships=dict(relationships),
+            summaries=tuple(summaries),
+        )
     def get_pipeline_info(
         self, service_id: Optional[str] = None, name: Optional[str] = None

deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl