deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (111) hide show
  1. deepdoctection/__init__.py +8 -25
  2. deepdoctection/analyzer/dd.py +84 -71
  3. deepdoctection/dataflow/common.py +9 -5
  4. deepdoctection/dataflow/custom.py +5 -5
  5. deepdoctection/dataflow/custom_serialize.py +75 -18
  6. deepdoctection/dataflow/parallel_map.py +3 -3
  7. deepdoctection/dataflow/serialize.py +4 -4
  8. deepdoctection/dataflow/stats.py +3 -3
  9. deepdoctection/datapoint/annotation.py +78 -56
  10. deepdoctection/datapoint/box.py +7 -7
  11. deepdoctection/datapoint/convert.py +6 -6
  12. deepdoctection/datapoint/image.py +157 -75
  13. deepdoctection/datapoint/view.py +175 -151
  14. deepdoctection/datasets/adapter.py +30 -24
  15. deepdoctection/datasets/base.py +10 -10
  16. deepdoctection/datasets/dataflow_builder.py +3 -3
  17. deepdoctection/datasets/info.py +23 -25
  18. deepdoctection/datasets/instances/doclaynet.py +48 -49
  19. deepdoctection/datasets/instances/fintabnet.py +44 -45
  20. deepdoctection/datasets/instances/funsd.py +23 -23
  21. deepdoctection/datasets/instances/iiitar13k.py +8 -8
  22. deepdoctection/datasets/instances/layouttest.py +2 -2
  23. deepdoctection/datasets/instances/publaynet.py +3 -3
  24. deepdoctection/datasets/instances/pubtables1m.py +18 -18
  25. deepdoctection/datasets/instances/pubtabnet.py +30 -29
  26. deepdoctection/datasets/instances/rvlcdip.py +28 -29
  27. deepdoctection/datasets/instances/xfund.py +51 -30
  28. deepdoctection/datasets/save.py +6 -6
  29. deepdoctection/eval/accmetric.py +32 -33
  30. deepdoctection/eval/base.py +8 -9
  31. deepdoctection/eval/cocometric.py +13 -12
  32. deepdoctection/eval/eval.py +32 -26
  33. deepdoctection/eval/tedsmetric.py +16 -12
  34. deepdoctection/eval/tp_eval_callback.py +7 -16
  35. deepdoctection/extern/base.py +339 -134
  36. deepdoctection/extern/d2detect.py +69 -89
  37. deepdoctection/extern/deskew.py +11 -10
  38. deepdoctection/extern/doctrocr.py +81 -64
  39. deepdoctection/extern/fastlang.py +23 -16
  40. deepdoctection/extern/hfdetr.py +53 -38
  41. deepdoctection/extern/hflayoutlm.py +216 -155
  42. deepdoctection/extern/hflm.py +35 -30
  43. deepdoctection/extern/model.py +433 -255
  44. deepdoctection/extern/pdftext.py +15 -15
  45. deepdoctection/extern/pt/ptutils.py +4 -2
  46. deepdoctection/extern/tessocr.py +39 -38
  47. deepdoctection/extern/texocr.py +14 -16
  48. deepdoctection/extern/tp/tfutils.py +16 -2
  49. deepdoctection/extern/tp/tpcompat.py +11 -7
  50. deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
  56. deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
  57. deepdoctection/extern/tpdetect.py +40 -45
  58. deepdoctection/mapper/cats.py +36 -40
  59. deepdoctection/mapper/cocostruct.py +16 -12
  60. deepdoctection/mapper/d2struct.py +22 -22
  61. deepdoctection/mapper/hfstruct.py +7 -7
  62. deepdoctection/mapper/laylmstruct.py +22 -24
  63. deepdoctection/mapper/maputils.py +9 -10
  64. deepdoctection/mapper/match.py +33 -2
  65. deepdoctection/mapper/misc.py +6 -7
  66. deepdoctection/mapper/pascalstruct.py +4 -4
  67. deepdoctection/mapper/prodigystruct.py +6 -6
  68. deepdoctection/mapper/pubstruct.py +84 -92
  69. deepdoctection/mapper/tpstruct.py +3 -3
  70. deepdoctection/mapper/xfundstruct.py +33 -33
  71. deepdoctection/pipe/anngen.py +39 -14
  72. deepdoctection/pipe/base.py +68 -99
  73. deepdoctection/pipe/common.py +181 -85
  74. deepdoctection/pipe/concurrency.py +14 -10
  75. deepdoctection/pipe/doctectionpipe.py +24 -21
  76. deepdoctection/pipe/language.py +20 -25
  77. deepdoctection/pipe/layout.py +18 -16
  78. deepdoctection/pipe/lm.py +49 -47
  79. deepdoctection/pipe/order.py +63 -65
  80. deepdoctection/pipe/refine.py +102 -109
  81. deepdoctection/pipe/segment.py +157 -162
  82. deepdoctection/pipe/sub_layout.py +50 -40
  83. deepdoctection/pipe/text.py +37 -36
  84. deepdoctection/pipe/transform.py +19 -16
  85. deepdoctection/train/d2_frcnn_train.py +27 -25
  86. deepdoctection/train/hf_detr_train.py +22 -18
  87. deepdoctection/train/hf_layoutlm_train.py +49 -48
  88. deepdoctection/train/tp_frcnn_train.py +10 -11
  89. deepdoctection/utils/concurrency.py +1 -1
  90. deepdoctection/utils/context.py +13 -6
  91. deepdoctection/utils/develop.py +4 -4
  92. deepdoctection/utils/env_info.py +52 -14
  93. deepdoctection/utils/file_utils.py +6 -11
  94. deepdoctection/utils/fs.py +41 -14
  95. deepdoctection/utils/identifier.py +2 -2
  96. deepdoctection/utils/logger.py +15 -15
  97. deepdoctection/utils/metacfg.py +7 -7
  98. deepdoctection/utils/pdf_utils.py +39 -14
  99. deepdoctection/utils/settings.py +188 -182
  100. deepdoctection/utils/tqdm.py +1 -1
  101. deepdoctection/utils/transform.py +14 -9
  102. deepdoctection/utils/types.py +104 -0
  103. deepdoctection/utils/utils.py +7 -7
  104. deepdoctection/utils/viz.py +70 -69
  105. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
  106. deepdoctection-0.34.dist-info/RECORD +146 -0
  107. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
  108. deepdoctection/utils/detection_types.py +0 -68
  109. deepdoctection-0.32.dist-info/RECORD +0 -146
  110. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
  111. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
@@ -21,21 +21,18 @@ Module for common pipeline components
21
21
  from __future__ import annotations
22
22
 
23
23
  import os
24
-
25
- from copy import copy, deepcopy
26
- from typing import List, Literal, Mapping, Optional, Sequence, Union
24
+ from copy import deepcopy
25
+ from typing import Literal, Mapping, Optional, Sequence, Union
27
26
 
28
27
  import numpy as np
29
28
 
30
29
  from ..dataflow import DataFlow, MapData
31
30
  from ..datapoint.image import Image
32
31
  from ..datapoint.view import IMAGE_DEFAULTS, Page
33
- from ..mapper.maputils import MappingContextManager
34
- from ..mapper.match import match_anns_by_intersection
32
+ from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
35
33
  from ..mapper.misc import to_image
36
- from ..utils.detection_types import JsonDict
37
34
  from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
38
- from .base import PipelineComponent
35
+ from .base import MetaAnnotation, PipelineComponent
39
36
  from .registry import pipeline_component_registry
40
37
 
41
38
  if os.environ.get("DD_USE_TORCH"):
@@ -57,37 +54,44 @@ class ImageCroppingService(PipelineComponent):
57
54
  :param category_names: A single name or a list of category names to crop
58
55
  """
59
56
 
60
- if isinstance(category_names, str):
61
- category_names = [category_names]
62
- self.category_names = [get_type(category_name) for category_name in category_names]
57
+ self.category_names = (
58
+ (category_names,)
59
+ if isinstance(category_names, str)
60
+ else tuple(get_type(category_name) for category_name in category_names)
61
+ )
63
62
  super().__init__("image_crop")
64
63
 
65
64
  def serve(self, dp: Image) -> None:
66
65
  for ann in dp.get_annotation(category_names=self.category_names):
67
66
  dp.image_ann_to_image(ann.annotation_id, crop_image=True)
68
67
 
69
- def clone(self) -> PipelineComponent:
68
+ def clone(self) -> ImageCroppingService:
70
69
  return self.__class__(self.category_names)
71
70
 
72
- def get_meta_annotation(self) -> JsonDict:
73
- return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
71
+ def get_meta_annotation(self) -> MetaAnnotation:
72
+ return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
74
73
 
74
+ def clear_predictor(self) -> None:
75
+ pass
75
76
 
76
- @pipeline_component_registry.register("MatchingService")
77
- class MatchingService(PipelineComponent):
77
+
78
+ class IntersectionMatcher:
78
79
  """
79
- Objects of two object classes can be assigned to one another by determining their pairwise average. If this is above
80
- a limit, a relation is created between them.
81
- The parent object class (based on its category) and the child object class are defined for the service. A child
82
- relation is created in the parent class if the conditions are met.
80
+ Objects of two object classes can be assigned to one another by determining their pairwise intersection. If this is
81
+ above a limit, a relation is created between them.
82
+ The parent object class (based on its category) and the child object class are defined for the service.
83
83
 
84
84
  Either `iou` (intersection-over-union) or `ioa` (intersection-over-area) can be selected as the matching rule.
85
85
 
86
86
  # the following will assign word annotations to text and title annotation, provided that their ioa-threshold
87
87
  # is above 0.7. words below that threshold will not be assigned.
88
88
 
89
- match = MatchingService(parent_categories=["TEXT","TITLE"],child_categories="WORD",matching_rule="ioa",
90
- threshold=0.7)
89
+ matcher = IntersectionMatcher(matching_rule="ioa", threshold=0.7)
90
+
91
+ match_service = MatchingService(parent_categories=["text","title"],
92
+ child_categories="word",
93
+ matcher=matcher,
94
+ relationship_key=Relationships.CHILD)
91
95
 
92
96
  # Assigning means that text and title annotation will receive a relationship called "CHILD" which is a list
93
97
  of annotation ids of mapped words.
@@ -95,16 +99,12 @@ class MatchingService(PipelineComponent):
95
99
 
96
100
  def __init__(
97
101
  self,
98
- parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
99
- child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
100
102
  matching_rule: Literal["iou", "ioa"],
101
103
  threshold: float,
102
104
  use_weighted_intersections: bool = False,
103
105
  max_parent_only: bool = False,
104
106
  ) -> None:
105
107
  """
106
- :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
107
- :param child_categories: list of categories to be used for a child class.
108
108
  :param matching_rule: "iou" or "ioa"
109
109
  :param threshold: iou/ioa threshold. Value between [0,1]
110
110
  :param use_weighted_intersections: This is currently only implemented for matching_rule 'ioa'. Instead of using
@@ -112,64 +112,150 @@ class MatchingService(PipelineComponent):
112
112
  that intersections with more cells will likely decrease the ioa value. By
113
113
  multiplying the ioa with the number of all intersection for each child this
114
114
  value calibrate the ioa.
115
- :param max_parent_only: Will assign to each child at most one parent with maximum ioa
116
- """
117
- self.parent_categories = (
118
- [get_type(parent_categories)] # type: ignore
119
- if not isinstance(parent_categories, (list, set))
120
- else [get_type(parent_category) for parent_category in parent_categories]
121
- )
122
- self.child_categories = (
123
- [get_type(child_categories)] # type: ignore
124
- if not isinstance(child_categories, (list, set))
125
- else [get_type(child_category) for child_category in child_categories]
126
- )
127
- assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
115
+ :param max_parent_only: Will assign to each child at most one parent with maximum ioa"""
116
+
117
+ if matching_rule not in ("iou", "ioa"):
118
+ raise ValueError("segment rule must be either iou or ioa")
128
119
  self.matching_rule = matching_rule
129
120
  self.threshold = threshold
130
121
  self.use_weighted_intersections = use_weighted_intersections
131
122
  self.max_parent_only = max_parent_only
132
- super().__init__("matching")
133
123
 
134
- def serve(self, dp: Image) -> None:
124
+ def match(
125
+ self,
126
+ dp: Image,
127
+ parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
128
+ child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
129
+ ) -> list[tuple[str, str]]:
135
130
  """
136
- - generates pairwise match-score by intersection
137
- - generates child relationship at parent level
131
+ The matching algorithm
138
132
 
139
133
  :param dp: datapoint image
134
+ :param parent_categories: list of categories to be used as a parent class. Will generate a child-relationship
135
+ :param child_categories: list of categories to be used for a child class.
136
+
137
+ :return: A list of tuples with parent and child annotation ids
140
138
  """
141
139
  child_index, parent_index, child_anns, parent_anns = match_anns_by_intersection(
142
140
  dp,
143
- parent_ann_category_names=self.parent_categories,
144
- child_ann_category_names=self.child_categories,
141
+ parent_ann_category_names=parent_categories,
142
+ child_ann_category_names=child_categories,
145
143
  matching_rule=self.matching_rule,
146
144
  threshold=self.threshold,
147
145
  use_weighted_intersections=self.use_weighted_intersections,
148
146
  max_parent_only=self.max_parent_only,
149
147
  )
150
148
 
151
- with MappingContextManager(dp_name=dp.file_name):
152
- matched_child_anns = np.take(child_anns, child_index) # type: ignore
153
- matched_parent_anns = np.take(parent_anns, parent_index) # type: ignore
154
- for idx, parent in enumerate(matched_parent_anns):
155
- parent.dump_relationship(Relationships.child, matched_child_anns[idx].annotation_id)
149
+ matched_child_anns = np.take(child_anns, child_index) # type: ignore
150
+ matched_parent_anns = np.take(parent_anns, parent_index) # type: ignore
151
+
152
+ all_parent_child_relations = []
153
+ for idx, parent in enumerate(matched_parent_anns):
154
+ all_parent_child_relations.append((parent.annotation_id, matched_child_anns[idx].annotation_id))
155
+
156
+ return all_parent_child_relations
157
+
158
+
159
+ class NeighbourMatcher:
160
+ """
161
+ Objects of two object classes can be assigned to one another by determining their pairwise distance.
162
+
163
+ # the following will assign caption annotations to figure annotations
164
+
165
+ matcher = NeighbourMatcher()
166
+
167
+ match_service = MatchingService(parent_categories=["figure"],
168
+ child_categories="caption",
169
+ matcher=matcher,
170
+ relationship_key=Relationships.LAYOUT_LINK)
171
+
172
+ """
173
+
174
+ def match(
175
+ self,
176
+ dp: Image,
177
+ parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
178
+ child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
179
+ ) -> list[tuple[str, str]]:
180
+ """
181
+ The matching algorithm
182
+
183
+ :param dp: datapoint image
184
+ :param parent_categories: list of categories to be used as a parent class. Will generate a child-relationship
185
+ :param child_categories: list of categories to be used for a child class.
186
+
187
+ :return: A list of tuples with parent and child annotation ids
188
+ """
189
+
190
+ return [
191
+ (pair[0].annotation_id, pair[1].annotation_id)
192
+ for pair in match_anns_by_distance(dp, parent_categories, child_categories)
193
+ ]
194
+
195
+
196
+ @pipeline_component_registry.register("MatchingService")
197
+ class MatchingService(PipelineComponent):
198
+ """
199
+ A service to match annotations of two categories by intersection or distance. The matched annotations will be
200
+ assigned a relationship. The parent category will receive a relationship to the child category.
201
+ """
202
+
203
+ def __init__(
204
+ self,
205
+ parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
206
+ child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
207
+ matcher: Union[IntersectionMatcher, NeighbourMatcher],
208
+ relationship_key: Relationships,
209
+ ) -> None:
210
+ """
211
+ :param parent_categories: list of categories to be used as a parent class. Will generate a child-relationship
212
+ :param child_categories: list of categories to be used for a child class.
213
+
214
+ """
215
+ self.parent_categories = (
216
+ (get_type(parent_categories),)
217
+ if isinstance(parent_categories, str)
218
+ else tuple(get_type(category_name) for category_name in parent_categories)
219
+ )
220
+ self.child_categories = (
221
+ (get_type(child_categories),)
222
+ if isinstance(child_categories, str)
223
+ else (tuple(get_type(category_name) for category_name in child_categories))
224
+ )
225
+ self.matcher = matcher
226
+ self.relationship_key = relationship_key
227
+ super().__init__("matching")
228
+
229
+ def serve(self, dp: Image) -> None:
230
+ """
231
+ - generates pairwise match-score by intersection
232
+ - generates child relationship at parent level
233
+
234
+ :param dp: datapoint image
235
+ """
236
+
237
+ matched_pairs = self.matcher.match(dp, self.parent_categories, self.child_categories)
238
+
239
+ for pair in matched_pairs:
240
+ self.dp_manager.set_relationship_annotation(self.relationship_key, pair[0], pair[1])
156
241
 
157
242
  def clone(self) -> PipelineComponent:
158
- return self.__class__(self.parent_categories, self.child_categories, self.matching_rule, self.threshold)
159
-
160
- def get_meta_annotation(self) -> JsonDict:
161
- return dict(
162
- [
163
- ("image_annotations", []),
164
- ("sub_categories", {}),
165
- ("relationships", {parent: {Relationships.child} for parent in self.parent_categories}),
166
- ("summaries", []),
167
- ]
243
+ return self.__class__(self.parent_categories, self.child_categories, self.matcher, self.relationship_key)
244
+
245
+ def get_meta_annotation(self) -> MetaAnnotation:
246
+ return MetaAnnotation(
247
+ image_annotations=(),
248
+ sub_categories={},
249
+ relationships={parent: {Relationships.CHILD} for parent in self.parent_categories},
250
+ summaries=(),
168
251
  )
169
252
 
253
+ def clear_predictor(self) -> None:
254
+ pass
255
+
170
256
 
171
257
  @pipeline_component_registry.register("PageParsingService")
172
- class PageParsingService:
258
+ class PageParsingService(PipelineComponent):
173
259
  """
174
260
  A "pseudo" pipeline component that can be added to a pipeline to convert `Image`s into `Page` formats. It allows a
175
261
  custom parsing depending on customizing options of other pipeline components.
@@ -188,14 +274,20 @@ class PageParsingService:
188
274
  """
189
275
  self.name = "page_parser"
190
276
  if isinstance(floating_text_block_categories, (str, ObjectTypes)):
191
- floating_text_block_categories = [floating_text_block_categories]
277
+ floating_text_block_categories = (get_type(floating_text_block_categories),)
192
278
  if floating_text_block_categories is None:
193
- floating_text_block_categories = copy(IMAGE_DEFAULTS["floating_text_block_categories"])
279
+ floating_text_block_categories = IMAGE_DEFAULTS["floating_text_block_categories"]
194
280
 
195
281
  self.text_container = get_type(text_container)
196
- self.floating_text_block_categories = [get_type(text_block) for text_block in floating_text_block_categories]
282
+ self.floating_text_block_categories = tuple(
283
+ (get_type(text_block) for text_block in floating_text_block_categories)
284
+ )
197
285
  self.include_residual_text_container = include_residual_text_container
198
286
  self._init_sanity_checks()
287
+ super().__init__(self.name)
288
+
289
+ def serve(self, dp: Image) -> None:
290
+ raise NotImplementedError("PageParsingService is not meant to be used in serve method")
199
291
 
200
292
  def pass_datapoint(self, dp: Image) -> Page:
201
293
  """
@@ -203,29 +295,24 @@ class PageParsingService:
203
295
  :param dp: Image
204
296
  :return: Page
205
297
  """
206
- return Page.from_image(dp, self.text_container, self.floating_text_block_categories)
207
-
208
- def predict_dataflow(self, df: DataFlow) -> DataFlow:
209
- """
210
- Mapping a datapoint via `pass_datapoint` within a dataflow pipeline
211
-
212
- :param df: An input dataflow
213
- :return: A output dataflow
214
- """
215
- return MapData(df, self.pass_datapoint)
298
+ return Page.from_image(
299
+ dp,
300
+ text_container=self.text_container,
301
+ floating_text_block_categories=self.floating_text_block_categories,
302
+ include_residual_text_container=self.include_residual_text_container,
303
+ )
216
304
 
217
305
  def _init_sanity_checks(self) -> None:
218
306
  assert self.text_container in (
219
- LayoutType.word,
220
- LayoutType.line,
221
- ), f"text_container must be either {LayoutType.word} or {LayoutType.line}"
307
+ LayoutType.WORD,
308
+ LayoutType.LINE,
309
+ ), f"text_container must be either {LayoutType.WORD} or {LayoutType.LINE}"
222
310
 
223
- @staticmethod
224
- def get_meta_annotation() -> JsonDict:
311
+ def get_meta_annotation(self) -> MetaAnnotation:
225
312
  """
226
313
  meta annotation. We do not generate any new annotations here
227
314
  """
228
- return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
315
+ return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
229
316
 
230
317
  def clone(self) -> PageParsingService:
231
318
  """clone"""
@@ -235,6 +322,9 @@ class PageParsingService:
235
322
  self.include_residual_text_container,
236
323
  )
237
324
 
325
+ def clear_predictor(self) -> None:
326
+ pass
327
+
238
328
 
239
329
  @pipeline_component_registry.register("AnnotationNmsService")
240
330
  class AnnotationNmsService(PipelineComponent):
@@ -259,8 +349,8 @@ class AnnotationNmsService(PipelineComponent):
259
349
  def __init__(
260
350
  self,
261
351
  nms_pairs: Sequence[Sequence[TypeOrStr]],
262
- thresholds: Union[float, List[float]],
263
- priority: Optional[List[Union[Optional[TypeOrStr]]]] = None,
352
+ thresholds: Union[float, list[float]],
353
+ priority: Optional[list[Union[Optional[TypeOrStr]]]] = None,
264
354
  ):
265
355
  """
266
356
  :param nms_pairs: Groups of categories, either as string or by `ObjectType`.
@@ -297,8 +387,11 @@ class AnnotationNmsService(PipelineComponent):
297
387
  def clone(self) -> PipelineComponent:
298
388
  return self.__class__(deepcopy(self.nms_pairs), self.threshold)
299
389
 
300
- def get_meta_annotation(self) -> JsonDict:
301
- return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
390
+ def get_meta_annotation(self) -> MetaAnnotation:
391
+ return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
392
+
393
+ def clear_predictor(self) -> None:
394
+ pass
302
395
 
303
396
 
304
397
  @pipeline_component_registry.register("ImageParsingService")
@@ -333,8 +426,11 @@ class ImageParsingService:
333
426
  return self.__class__(self.dpi)
334
427
 
335
428
  @staticmethod
336
- def get_meta_annotation() -> JsonDict:
429
+ def get_meta_annotation() -> MetaAnnotation:
337
430
  """
338
431
  meta annotation. We do not generate any new annotations here
339
432
  """
340
- return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
433
+ return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
434
+
435
+ def clear_predictor(self) -> None:
436
+ """clear predictor. Will do nothing"""
@@ -24,16 +24,16 @@ import itertools
24
24
  import queue
25
25
  from concurrent.futures import ThreadPoolExecutor
26
26
  from contextlib import ExitStack
27
- from typing import Callable, List, Optional, Sequence, Union
27
+ from typing import Callable, Optional, Sequence, Union
28
28
 
29
29
  import tqdm
30
30
 
31
31
  from ..dataflow import DataFlow, MapData
32
32
  from ..datapoint.image import Image
33
33
  from ..utils.context import timed_operation
34
- from ..utils.detection_types import JsonDict, QueueType, TqdmType
35
34
  from ..utils.tqdm import get_tqdm
36
- from .base import PipelineComponent
35
+ from ..utils.types import QueueType, TqdmType
36
+ from .base import MetaAnnotation, PipelineComponent
37
37
  from .common import ImageParsingService, PageParsingService
38
38
  from .registry import pipeline_component_registry
39
39
 
@@ -100,7 +100,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
100
100
 
101
101
  def __init__(
102
102
  self,
103
- pipeline_components: Sequence[Union[PipelineComponent, PageParsingService, ImageParsingService]],
103
+ pipeline_components: Sequence[Union[PipelineComponent, ImageParsingService]],
104
104
  pre_proc_func: Optional[Callable[[Image], Image]] = None,
105
105
  post_proc_func: Optional[Callable[[Image], Image]] = None,
106
106
  max_datapoints: Optional[int] = None,
@@ -123,7 +123,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
123
123
  self.timer_on = False
124
124
  super().__init__(f"multi_thread_{self.pipe_components[0].name}")
125
125
 
126
- def put_task(self, df: Union[DataFlow, List[Image]]) -> None:
126
+ def put_task(self, df: Union[DataFlow, list[Image]]) -> None:
127
127
  """
128
128
  Put a dataflow or a list of datapoints to the queue. Note, that the process will not start before `start`
129
129
  is called. If you do not know how many datapoints will be cached, use max_datapoint to ensure no oom.
@@ -133,7 +133,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
133
133
 
134
134
  self._put_datapoints_to_queue(df)
135
135
 
136
- def start(self) -> List[Image]:
136
+ def start(self) -> list[Image]:
137
137
  """
138
138
  Creates a worker for each component and starts processing the data points of the queue. A list of the results
139
139
  is returned once all points in the queue have been processed.
@@ -165,7 +165,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
165
165
  tqdm_bar: Optional[TqdmType] = None,
166
166
  pre_proc_func: Optional[Callable[[Image], Image]] = None,
167
167
  post_proc_func: Optional[Callable[[Image], Image]] = None,
168
- ) -> List[Image]:
168
+ ) -> list[Image]:
169
169
  outputs = []
170
170
 
171
171
  with ExitStack() as stack:
@@ -184,7 +184,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
184
184
  tqdm_bar.update(1)
185
185
  return outputs
186
186
 
187
- def _put_datapoints_to_queue(self, df: Union[DataFlow, List[Image]]) -> None:
187
+ def _put_datapoints_to_queue(self, df: Union[DataFlow, list[Image]]) -> None:
188
188
  if isinstance(df, DataFlow):
189
189
  df.reset_state()
190
190
  for idx, dp in enumerate(df):
@@ -193,7 +193,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
193
193
  break
194
194
  self.input_queue.put(dp)
195
195
 
196
- def pass_datapoints(self, dpts: List[Image]) -> List[Image]:
196
+ def pass_datapoints(self, dpts: list[Image]) -> list[Image]:
197
197
  """
198
198
  Putting the list of datapoints into a thread-save queue and start for each pipeline
199
199
  component a separate thread. It will return a list of datapoints where the order of appearance
@@ -225,5 +225,9 @@ class MultiThreadPipelineComponent(PipelineComponent):
225
225
  def clone(self) -> MultiThreadPipelineComponent:
226
226
  raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")
227
227
 
228
- def get_meta_annotation(self) -> JsonDict:
228
+ def get_meta_annotation(self) -> MetaAnnotation:
229
229
  return self.pipe_components[0].get_meta_annotation()
230
+
231
+ def clear_predictor(self) -> None:
232
+ for pipe in self.pipe_components:
233
+ pipe.clear_predictor()
@@ -26,18 +26,18 @@ from typing import List, Mapping, Optional, Sequence, Tuple, Union
26
26
  from ..dataflow import DataFlow, MapData
27
27
  from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
28
28
  from ..datapoint.image import Image
29
+ from ..datapoint.view import IMAGE_DEFAULTS
29
30
  from ..mapper.maputils import curry
30
31
  from ..mapper.misc import to_image
31
- from ..utils.detection_types import Pathlike
32
32
  from ..utils.fs import maybe_path_or_pdf
33
33
  from ..utils.logger import LoggingRecord, logger
34
- from ..utils.settings import LayoutType
35
- from .base import Pipeline, PipelineComponent, PredictorPipelineComponent
34
+ from ..utils.types import PathLikeOrStr
35
+ from .base import Pipeline, PipelineComponent
36
36
  from .common import PageParsingService
37
37
 
38
38
 
39
39
  def _collect_from_kwargs(
40
- **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]
40
+ **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
41
41
  ) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
42
42
  dataset_dataflow = kwargs.get("dataset_dataflow")
43
43
  path = kwargs.get("path")
@@ -69,7 +69,7 @@ def _collect_from_kwargs(
69
69
 
70
70
  @curry
71
71
  def _proto_process(
72
- dp: Union[str, Mapping[str, str]], path: Optional[str], doc_path: Optional[str]
72
+ dp: Union[str, Mapping[str, str]], path: Optional[PathLikeOrStr], doc_path: Optional[PathLikeOrStr]
73
73
  ) -> Union[str, Mapping[str, str]]:
74
74
  if isinstance(dp, str):
75
75
  file_name = Path(dp).name
@@ -78,10 +78,14 @@ def _proto_process(
78
78
  else:
79
79
  file_name = dp["file_name"]
80
80
  if path is None:
81
- path_tmp = doc_path
81
+ path_tmp = doc_path or ""
82
82
  else:
83
83
  path_tmp = path
84
- logger.info(LoggingRecord(f"Processing {file_name}", {"path": path_tmp, "df": path_tmp, "file_name": file_name}))
84
+ logger.info(
85
+ LoggingRecord(
86
+ f"Processing {file_name}", {"path": os.fspath(path_tmp), "df": os.fspath(path_tmp), "file_name": file_name}
87
+ )
88
+ )
85
89
  return dp
86
90
 
87
91
 
@@ -90,7 +94,7 @@ def _to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int
90
94
  return to_image(dp, dpi)
91
95
 
92
96
 
93
- def _doc_to_dataflow(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
97
+ def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
94
98
  if not os.path.isfile(path):
95
99
  raise FileExistsError(f"{path} not a file")
96
100
 
@@ -127,19 +131,18 @@ class DoctectionPipe(Pipeline):
127
131
 
128
132
  def __init__(
129
133
  self,
130
- pipeline_component_list: List[Union[PipelineComponent]],
134
+ pipeline_component_list: List[PipelineComponent],
131
135
  page_parsing_service: Optional[PageParsingService] = None,
132
136
  ):
133
- if page_parsing_service is None:
134
- self.page_parser = PageParsingService(text_container=LayoutType.word)
135
- else:
136
- self.page_parser = page_parsing_service
137
- assert all(
138
- isinstance(element, (PipelineComponent, PredictorPipelineComponent)) for element in pipeline_component_list
137
+ self.page_parser = (
138
+ PageParsingService(text_container=IMAGE_DEFAULTS["text_container"])
139
+ if page_parsing_service is None
140
+ else page_parsing_service
139
141
  )
142
+
140
143
  super().__init__(pipeline_component_list)
141
144
 
142
- def _entry(self, **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]) -> DataFlow:
145
+ def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
143
146
  path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
144
147
 
145
148
  df: DataFlow
@@ -147,7 +150,7 @@ class DoctectionPipe(Pipeline):
147
150
  if isinstance(path, (str, Path)):
148
151
  if not isinstance(file_type, (str, list)):
149
152
  raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
150
- df = DoctectionPipe.path_to_dataflow(path, file_type, shuffle=shuffle)
153
+ df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
151
154
  elif isinstance(doc_path, (str, Path)):
152
155
  df = DoctectionPipe.doc_to_dataflow(
153
156
  path=doc_path, max_datapoints=int(max_datapoints) if max_datapoints is not None else None
@@ -164,7 +167,7 @@ class DoctectionPipe(Pipeline):
164
167
 
165
168
  @staticmethod
166
169
  def path_to_dataflow(
167
- path: Pathlike,
170
+ path: PathLikeOrStr,
168
171
  file_type: Union[str, Sequence[str]],
169
172
  max_datapoints: Optional[int] = None,
170
173
  shuffle: bool = False,
@@ -179,12 +182,12 @@ class DoctectionPipe(Pipeline):
179
182
  :return: dataflow
180
183
  """
181
184
  if not os.path.isdir(path):
182
- raise NotADirectoryError(f"{path} not a directory")
185
+ raise NotADirectoryError(f"{os.fspath(path)} not a directory")
183
186
  df = SerializerFiles.load(path, file_type, max_datapoints, shuffle)
184
187
  return df
185
188
 
186
189
  @staticmethod
187
- def doc_to_dataflow(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
190
+ def doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
188
191
  """
189
192
  Processing method for documents
190
193
 
@@ -203,7 +206,7 @@ class DoctectionPipe(Pipeline):
203
206
  """
204
207
  return self.page_parser.predict_dataflow(df)
205
208
 
206
- def analyze(self, **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]) -> DataFlow:
209
+ def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
207
210
  """
208
211
  `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
209
212