deepdoctection 0.31-py3-none-any.whl → 0.33-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/common.py

@@ -18,8 +18,11 @@
 """
 Module for common pipeline components
 """
-from copy import copy, deepcopy
-from typing import List, Literal, Mapping, Optional, Sequence, Union
+from __future__ import annotations
+
+import os
+from copy import deepcopy
+from typing import Literal, Mapping, Optional, Sequence, Union
 
 import numpy as np
 
@@ -29,17 +32,14 @@ from ..datapoint.view import IMAGE_DEFAULTS, Page
 from ..mapper.maputils import MappingContextManager
 from ..mapper.match import match_anns_by_intersection
 from ..mapper.misc import to_image
-from ..utils.detection_types import JsonDict
-from ..utils.file_utils import detectron2_available, pytorch_available, tf_available
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
-from .base import PipelineComponent
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
-if tf_available():
-    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
-
-elif pytorch_available() and detectron2_available():
+if os.environ.get("DD_USE_TORCH"):
     from ..mapper.d2struct import pt_nms_image_annotations as nms_image_annotations
+elif os.environ.get("DD_USE_TF"):
+    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
 
 
 @pipeline_component_registry.register("ImageCroppingService")
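Note: the hunk above replaces import-time framework probing (tf_available / pytorch_available) with explicit DD_USE_TORCH / DD_USE_TF environment variables. A minimal usage sketch, assuming the variable only needs to be set to any truthy string (the value "1" is illustrative):

    import os

    # Select the PyTorch code path before deepdoctection modules are imported,
    # since the check runs at import time of deepdoctection.pipe.common.
    os.environ["DD_USE_TORCH"] = "1"

    import deepdoctection as dd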
@@ -55,20 +55,25 @@ class ImageCroppingService(PipelineComponent):
         :param category_names: A single name or a list of category names to crop
         """
 
-        if isinstance(category_names, str):
-            category_names = [category_names]
-        self.category_names = [get_type(category_name) for category_name in category_names]
+        self.category_names = (
+            (category_names,)
+            if isinstance(category_names, str)
+            else tuple(get_type(category_name) for category_name in category_names)
+        )
         super().__init__("image_crop")
 
     def serve(self, dp: Image) -> None:
         for ann in dp.get_annotation(category_names=self.category_names):
             dp.image_ann_to_image(ann.annotation_id, crop_image=True)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> ImageCroppingService:
         return self.__class__(self.category_names)
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
+
+    def clear_predictor(self) -> None:
+        pass
 
 
 @pipeline_component_registry.register("MatchingService")
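Note: `MetaAnnotation` is imported from `.base` but not defined in this diff. Judging from the call sites above, it is a small container with four fields; a hypothetical sketch consistent with the keywords used (the actual definition in deepdoctection/pipe/base.py may differ):

    from dataclasses import dataclass, field

    @dataclass(frozen=True)
    class MetaAnnotation:
        # field names taken from the call sites in this diff; the types are guesses
        image_annotations: tuple = ()
        sub_categories: dict = field(default_factory=dict)
        relationships: dict = field(default_factory=dict)
        summaries: tuple = ()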
@@ -113,16 +118,18 @@ class MatchingService(PipelineComponent):
         :param max_parent_only: Will assign to each child at most one parent with maximum ioa
         """
         self.parent_categories = (
-            [get_type(parent_categories)]  # type: ignore
-            if not isinstance(parent_categories, (list, set))
-            else [get_type(parent_category) for parent_category in parent_categories]
+            (get_type(parent_categories),)
+            if isinstance(parent_categories, str)
+            else tuple(get_type(category_name) for category_name in parent_categories)
         )
         self.child_categories = (
-            [get_type(child_categories)]  # type: ignore
-            if not isinstance(child_categories, (list, set))
-            else [get_type(child_category) for child_category in child_categories]
+            (get_type(child_categories),)
+            if isinstance(child_categories, str)
+            else (tuple(get_type(category_name) for category_name in child_categories))
         )
-        assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
+        if matching_rule not in ("iou", "ioa"):
+            raise ValueError("segment rule must be either iou or ioa")
+
         self.matching_rule = matching_rule
         self.threshold = threshold
         self.use_weighted_intersections = use_weighted_intersections
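Note: the assert becomes a proper ValueError and the category arguments are normalized to tuples. A hedged construction sketch (the category names "table" and "cell" are illustrative and must be resolvable by get_type):

    from deepdoctection.pipe.common import MatchingService

    # matching_rule must be "iou" or "ioa"; any other value now raises ValueError
    service = MatchingService(
        parent_categories="table",
        child_categories=["cell"],
        matching_rule="ioa",
        threshold=0.6,
    )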
@@ -150,24 +157,25 @@ class MatchingService(PipelineComponent):
         matched_child_anns = np.take(child_anns, child_index)  # type: ignore
         matched_parent_anns = np.take(parent_anns, parent_index)  # type: ignore
         for idx, parent in enumerate(matched_parent_anns):
-            parent.dump_relationship(Relationships.child, matched_child_anns[idx].annotation_id)
+            parent.dump_relationship(Relationships.CHILD, matched_child_anns[idx].annotation_id)
 
     def clone(self) -> PipelineComponent:
         return self.__class__(self.parent_categories, self.child_categories, self.matching_rule, self.threshold)
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict(
-            [
-                ("image_annotations", []),
-                ("sub_categories", {}),
-                ("relationships", {parent: {Relationships.child} for parent in self.parent_categories}),
-                ("summaries", []),
-            ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(
+            image_annotations=(),
+            sub_categories={},
+            relationships={parent: {Relationships.CHILD} for parent in self.parent_categories},
+            summaries=(),
         )
 
+    def clear_predictor(self) -> None:
+        pass
+
 
 @pipeline_component_registry.register("PageParsingService")
-class PageParsingService:
+class PageParsingService(PipelineComponent):
     """
     A "pseudo" pipeline component that can be added to a pipeline to convert `Image`s into `Page` formats. It allows a
     custom parsing depending on customizing options of other pipeline components.
@@ -186,14 +194,20 @@ class PageParsingService:
         """
         self.name = "page_parser"
         if isinstance(floating_text_block_categories, (str, ObjectTypes)):
-            floating_text_block_categories = [floating_text_block_categories]
+            floating_text_block_categories = (get_type(floating_text_block_categories),)
         if floating_text_block_categories is None:
-            floating_text_block_categories = copy(IMAGE_DEFAULTS["floating_text_block_categories"])
+            floating_text_block_categories = IMAGE_DEFAULTS["floating_text_block_categories"]
 
         self.text_container = get_type(text_container)
-        self.floating_text_block_categories = [get_type(text_block) for text_block in floating_text_block_categories]
+        self.floating_text_block_categories = tuple(
+            (get_type(text_block) for text_block in floating_text_block_categories)
+        )
         self.include_residual_text_container = include_residual_text_container
         self._init_sanity_checks()
+        super().__init__(self.name)
+
+    def serve(self, dp: Image) -> None:
+        raise NotImplementedError("PageParsingService is not meant to be used in serve method")
 
     def pass_datapoint(self, dp: Image) -> Page:
         """
@@ -203,29 +217,19 @@
         """
         return Page.from_image(dp, self.text_container, self.floating_text_block_categories)
 
-    def predict_dataflow(self, df: DataFlow) -> DataFlow:
-        """
-        Mapping a datapoint via `pass_datapoint` within a dataflow pipeline
-
-        :param df: An input dataflow
-        :return: A output dataflow
-        """
-        return MapData(df, self.pass_datapoint)
-
     def _init_sanity_checks(self) -> None:
         assert self.text_container in (
-            LayoutType.word,
-            LayoutType.line,
-        ), f"text_container must be either {LayoutType.word} or {LayoutType.line}"
+            LayoutType.WORD,
+            LayoutType.LINE,
+        ), f"text_container must be either {LayoutType.WORD} or {LayoutType.LINE}"
 
-    @staticmethod
-    def get_meta_annotation() -> JsonDict:
+    def get_meta_annotation(self) -> MetaAnnotation:
         """
         meta annotation. We do not generate any new annotations here
         """
-        return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
 
-    def clone(self) -> "PageParsingService":
+    def clone(self) -> PageParsingService:
         """clone"""
         return self.__class__(
             deepcopy(self.text_container),
@@ -233,6 +237,9 @@ class PageParsingService:
             self.include_residual_text_container,
         )
 
+    def clear_predictor(self) -> None:
+        pass
+
 
 @pipeline_component_registry.register("AnnotationNmsService")
 class AnnotationNmsService(PipelineComponent):
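Note on the PageParsingService changes above: the class is now a PipelineComponent subclass, but serve deliberately raises NotImplementedError; datapoints still go through pass_datapoint. A sketch of the intended call path (the Image instance dp is assumed to exist):

    from deepdoctection.pipe.common import PageParsingService
    from deepdoctection.utils.settings import LayoutType

    page_parser = PageParsingService(text_container=LayoutType.WORD)
    page = page_parser.pass_datapoint(dp)  # Image -> Page; calling serve() raises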
@@ -257,8 +264,8 @@ class AnnotationNmsService(PipelineComponent):
     def __init__(
         self,
         nms_pairs: Sequence[Sequence[TypeOrStr]],
-        thresholds: Union[float, List[float]],
-        priority: Optional[List[Union[Optional[TypeOrStr]]]] = None,
+        thresholds: Union[float, list[float]],
+        priority: Optional[list[Union[Optional[TypeOrStr]]]] = None,
     ):
         """
         :param nms_pairs: Groups of categories, either as string or by `ObjectType`.
@@ -292,11 +299,14 @@ class AnnotationNmsService(PipelineComponent):
             if ann.annotation_id not in ann_ids_to_keep:
                 self.dp_manager.deactivate_annotation(ann.annotation_id)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(deepcopy(self.nms_pairs), self.threshold)
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
+
+    def clear_predictor(self) -> None:
+        pass
 
 
 @pipeline_component_registry.register("ImageParsingService")
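Note on the AnnotationNmsService hunks above: thresholds and priority now use built-in generics (list[float] instead of List[float]). A hedged construction sketch (the pairs and threshold values are illustrative):

    from deepdoctection.pipe.common import AnnotationNmsService

    # one threshold per nms_pair; per the signature, a single float is also accepted
    nms_service = AnnotationNmsService(
        nms_pairs=[["table", "figure"], ["title", "text"]],
        thresholds=[0.05, 0.3],
    )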
@@ -326,13 +336,16 @@ class ImageParsingService:
         """
         return MapData(df, self.pass_datapoint)
 
-    def clone(self) -> "ImageParsingService":
+    def clone(self) -> ImageParsingService:
         """clone"""
         return self.__class__(self.dpi)
 
     @staticmethod
-    def get_meta_annotation() -> JsonDict:
+    def get_meta_annotation() -> MetaAnnotation:
         """
         meta annotation. We do not generate any new annotations here
         """
-        return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
+
+    def clear_predictor(self) -> None:
+        """clear predictor. Will do nothing"""
deepdoctection/pipe/concurrency.py

@@ -18,21 +18,22 @@
 """
 Module for multithreading tasks
 """
+from __future__ import annotations
 
 import itertools
 import queue
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import ExitStack
-from typing import Callable, List, Optional, Sequence, Union
+from typing import Callable, Optional, Sequence, Union
 
 import tqdm
 
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..utils.context import timed_operation
-from ..utils.detection_types import JsonDict, QueueType, TqdmType
 from ..utils.tqdm import get_tqdm
-from .base import PipelineComponent
+from ..utils.types import QueueType, TqdmType
+from .base import MetaAnnotation, PipelineComponent
 from .common import ImageParsingService, PageParsingService
 from .registry import pipeline_component_registry
 
@@ -99,7 +100,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
     def __init__(
         self,
-        pipeline_components: Sequence[Union[PipelineComponent, PageParsingService, ImageParsingService]],
+        pipeline_components: Sequence[Union[PipelineComponent, ImageParsingService]],
         pre_proc_func: Optional[Callable[[Image], Image]] = None,
         post_proc_func: Optional[Callable[[Image], Image]] = None,
         max_datapoints: Optional[int] = None,
@@ -122,7 +123,7 @@
         self.timer_on = False
         super().__init__(f"multi_thread_{self.pipe_components[0].name}")
 
-    def put_task(self, df: Union[DataFlow, List[Image]]) -> None:
+    def put_task(self, df: Union[DataFlow, list[Image]]) -> None:
         """
         Put a dataflow or a list of datapoints to the queue. Note, that the process will not start before `start`
         is called. If you do not know how many datapoints will be cached, use max_datapoint to ensure no oom.
@@ -132,7 +133,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
         self._put_datapoints_to_queue(df)
 
-    def start(self) -> List[Image]:
+    def start(self) -> list[Image]:
         """
         Creates a worker for each component and starts processing the data points of the queue. A list of the results
         is returned once all points in the queue have been processed.
@@ -164,7 +165,7 @@
         tqdm_bar: Optional[TqdmType] = None,
         pre_proc_func: Optional[Callable[[Image], Image]] = None,
         post_proc_func: Optional[Callable[[Image], Image]] = None,
-    ) -> List[Image]:
+    ) -> list[Image]:
         outputs = []
 
         with ExitStack() as stack:
@@ -183,7 +184,7 @@
                     tqdm_bar.update(1)
         return outputs
 
-    def _put_datapoints_to_queue(self, df: Union[DataFlow, List[Image]]) -> None:
+    def _put_datapoints_to_queue(self, df: Union[DataFlow, list[Image]]) -> None:
         if isinstance(df, DataFlow):
             df.reset_state()
             for idx, dp in enumerate(df):
@@ -192,7 +193,7 @@
                     break
                 self.input_queue.put(dp)
 
-    def pass_datapoints(self, dpts: List[Image]) -> List[Image]:
+    def pass_datapoints(self, dpts: list[Image]) -> list[Image]:
         """
         Putting the list of datapoints into a thread-save queue and start for each pipeline
         component a separate thread. It will return a list of datapoints where the order of appearance
@@ -221,8 +222,12 @@
     def serve(self, dp: Image) -> None:
         raise NotImplementedError("MultiThreadPipelineComponent does not follow the PipelineComponent implementation")
 
-    def clone(self) -> "MultiThreadPipelineComponent":
+    def clone(self) -> MultiThreadPipelineComponent:
         raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")
 
-    def get_meta_annotation(self) -> JsonDict:
+    def get_meta_annotation(self) -> MetaAnnotation:
         return self.pipe_components[0].get_meta_annotation()
+
+    def clear_predictor(self) -> None:
+        for pipe in self.pipe_components:
+            pipe.clear_predictor()
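Note: clear_predictor now fans out to every wrapped component. Based on the put_task/start docstrings above, a minimal usage sketch (component_a and component_b stand in for clones of the same pipeline component; df for a DataFlow):

    from deepdoctection.pipe.concurrency import MultiThreadPipelineComponent

    mt_component = MultiThreadPipelineComponent(
        pipeline_components=[component_a, component_b],
        max_datapoints=1000,  # optional cap so the queue does not grow unbounded
    )
    mt_component.put_task(df)        # a DataFlow or a list[Image]
    images = mt_component.start()    # blocks until the queue has been processed
    mt_component.clear_predictor()   # new: releases the predictor of every component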
deepdoctection/pipe/doctectionpipe.py

@@ -26,18 +26,18 @@ from typing import List, Mapping, Optional, Sequence, Tuple, Union
 from ..dataflow import DataFlow, MapData
 from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
 from ..datapoint.image import Image
+from ..datapoint.view import IMAGE_DEFAULTS
 from ..mapper.maputils import curry
 from ..mapper.misc import to_image
-from ..utils.detection_types import Pathlike
 from ..utils.fs import maybe_path_or_pdf
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import LayoutType
-from .base import Pipeline, PipelineComponent, PredictorPipelineComponent
+from ..utils.types import PathLikeOrStr
+from .base import Pipeline, PipelineComponent
 from .common import PageParsingService
 
 
 def _collect_from_kwargs(
-    **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]
+    **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
 ) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")
@@ -69,7 +69,7 @@ def _collect_from_kwargs(
 
 @curry
 def _proto_process(
-    dp: Union[str, Mapping[str, str]], path: Optional[str], doc_path: Optional[str]
+    dp: Union[str, Mapping[str, str]], path: Optional[PathLikeOrStr], doc_path: Optional[PathLikeOrStr]
 ) -> Union[str, Mapping[str, str]]:
     if isinstance(dp, str):
         file_name = Path(dp).name
@@ -78,10 +78,14 @@ def _proto_process(
     else:
         file_name = dp["file_name"]
     if path is None:
-        path_tmp = doc_path
+        path_tmp = doc_path or ""
     else:
         path_tmp = path
-    logger.info(LoggingRecord(f"Processing {file_name}", {"path": path_tmp, "df": path_tmp, "file_name": file_name}))
+    logger.info(
+        LoggingRecord(
+            f"Processing {file_name}", {"path": os.fspath(path_tmp), "df": os.fspath(path_tmp), "file_name": file_name}
+        )
+    )
     return dp
 
 
@@ -90,7 +94,7 @@ def _to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int
     return to_image(dp, dpi)
 
 
-def _doc_to_dataflow(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
+def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
     if not os.path.isfile(path):
         raise FileExistsError(f"{path} not a file")
 
@@ -127,19 +131,18 @@ class DoctectionPipe(Pipeline):
 
     def __init__(
         self,
-        pipeline_component_list: List[Union[PipelineComponent]],
+        pipeline_component_list: List[PipelineComponent],
         page_parsing_service: Optional[PageParsingService] = None,
     ):
-        if page_parsing_service is None:
-            self.page_parser = PageParsingService(text_container=LayoutType.word)
-        else:
-            self.page_parser = page_parsing_service
-        assert all(
-            isinstance(element, (PipelineComponent, PredictorPipelineComponent)) for element in pipeline_component_list
+        self.page_parser = (
+            PageParsingService(text_container=IMAGE_DEFAULTS["text_container"])
+            if page_parsing_service is None
+            else page_parsing_service
         )
+
         super().__init__(pipeline_component_list)
 
-    def _entry(self, **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]) -> DataFlow:
+    def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
         path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
 
         df: DataFlow
@@ -147,7 +150,7 @@ class DoctectionPipe(Pipeline):
         if isinstance(path, (str, Path)):
             if not isinstance(file_type, (str, list)):
                 raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
-            df = DoctectionPipe.path_to_dataflow(path, file_type, shuffle=shuffle)
+            df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
         elif isinstance(doc_path, (str, Path)):
             df = DoctectionPipe.doc_to_dataflow(
                 path=doc_path, max_datapoints=int(max_datapoints) if max_datapoints is not None else None
@@ -164,7 +167,7 @@ class DoctectionPipe(Pipeline):
 
     @staticmethod
     def path_to_dataflow(
-        path: Pathlike,
+        path: PathLikeOrStr,
         file_type: Union[str, Sequence[str]],
         max_datapoints: Optional[int] = None,
         shuffle: bool = False,
@@ -179,12 +182,12 @@ class DoctectionPipe(Pipeline):
         :return: dataflow
         """
         if not os.path.isdir(path):
-            raise NotADirectoryError(f"{path} not a directory")
+            raise NotADirectoryError(f"{os.fspath(path)} not a directory")
         df = SerializerFiles.load(path, file_type, max_datapoints, shuffle)
         return df
 
     @staticmethod
-    def doc_to_dataflow(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
+    def doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
         Processing method for documents
 
@@ -203,7 +206,7 @@
         """
         return self.page_parser.predict_dataflow(df)
 
-    def analyze(self, **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]) -> DataFlow:
+    def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
         """
         `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
 
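Note: all path-like parameters in DoctectionPipe now use PathLikeOrStr, so os.PathLike objects are accepted alongside strings. A hedged end-to-end sketch based on the kwargs read in _collect_from_kwargs (layout_service stands in for any PipelineComponent):

    from deepdoctection.pipe.doctectionpipe import DoctectionPipe

    pipe = DoctectionPipe(pipeline_component_list=[layout_service])
    df = pipe.analyze(path="/path/to/image_dir", file_type=".png")
    # or: df = pipe.analyze(doc_path="/path/to/document.pdf")
    df.reset_state()
    for page in df:
        ...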
deepdoctection/pipe/language.py

@@ -18,16 +18,14 @@
 """
 Module for language detection pipeline component
 """
-from copy import copy, deepcopy
 from typing import Optional, Sequence
 
 from ..datapoint.image import Image
 from ..datapoint.view import Page
 from ..extern.base import LanguageDetector, ObjectDetector
-from ..utils.detection_types import JsonDict
 from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
-from .base import PipelineComponent
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
 
@@ -74,26 +72,27 @@ class LanguageDetectionService(PipelineComponent):
         self.predictor = language_detector
         self.text_detector = text_detector
         self.text_container = get_type(text_container) if text_container is not None else text_container
-        if floating_text_block_categories:
-            floating_text_block_categories = [get_type(text_block) for text_block in floating_text_block_categories]
-        self.floating_text_block_categories = floating_text_block_categories if floating_text_block_categories else []
-        super().__init__(
-            self._get_name(self.predictor.name)
-        )  # cannot use PredictorPipelineComponent class because of return type of predict meth
+        self.floating_text_block_categories = (
+            tuple(get_type(text_block) for text_block in floating_text_block_categories)
+            if (floating_text_block_categories is not None)
+            else ()
+        )
+
+        super().__init__(self._get_name(self.predictor.name))
 
     def serve(self, dp: Image) -> None:
         if self.text_detector is None:
-            page = Page.from_image(dp, self.text_container, self.floating_text_block_categories)  # type: ignore
+            page = Page.from_image(dp, self.text_container, self.floating_text_block_categories)
             text = page.text_no_line_break
         else:
             if dp.image is None:
                 raise ImageError("image cannot be None")
             detect_result_list = self.text_detector.predict(dp.image)
             # this is a concatenation of all detection result. No reading order
-            text = " ".join([result.text for result in detect_result_list if result.text is not None])
+            text = " ".join((result.text for result in detect_result_list if result.text is not None))
         predict_result = self.predictor.predict(text)
         self.dp_manager.set_summary_annotation(
-            PageType.language, PageType.language, 1, predict_result.text, predict_result.score
+            PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.text, predict_result.score
         )
 
     def clone(self) -> PipelineComponent:
@@ -101,22 +100,18 @@ class LanguageDetectionService(PipelineComponent):
         if not isinstance(predictor, LanguageDetector):
             raise TypeError(f"Predictor must be of type LanguageDetector, but is of type {type(predictor)}")
         return self.__class__(
-            predictor,
-            copy(self.text_container),
-            deepcopy(self.text_detector),
-            deepcopy(self.floating_text_block_categories),
+            language_detector=predictor,
+            text_container=self.text_container,
+            text_detector=self.text_detector.clone() if self.text_detector is not None else None,
+            floating_text_block_categories=self.floating_text_block_categories,
         )
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict(
-            [
-                ("image_annotations", []),
-                ("sub_categories", {}),
-                ("relationships", {}),
-                ("summaries", [PageType.language]),
-            ]
-        )
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=(PageType.LANGUAGE,))
 
     @staticmethod
     def _get_name(predictor_name: str) -> str:
         return f"language_detection_{predictor_name}"
+
+    def clear_predictor(self) -> None:
+        self.predictor.clear_model()
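Note: clone now rebuilds the service with keyword arguments and clones the text detector instead of deep-copying it, and the detected language lands in the page summary under PageType.LANGUAGE. A hedged usage sketch (fasttext_detector stands in for any LanguageDetector; pass_datapoint is assumed to be inherited from PipelineComponent):

    from deepdoctection.pipe.language import LanguageDetectionService
    from deepdoctection.utils.settings import LayoutType

    lang_service = LanguageDetectionService(
        language_detector=fasttext_detector,
        text_container=LayoutType.WORD,
    )
    lang_service.pass_datapoint(dp)   # serve() stores the result under PageType.LANGUAGE
    lang_service.clear_predictor()    # new: delegates to predictor.clear_model()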
deepdoctection/pipe/layout.py

@@ -18,21 +18,22 @@
 """
 Module for layout pipeline component
 """
+from __future__ import annotations
+
 from typing import Optional
 
 import numpy as np
 
 from ..datapoint.image import Image
 from ..extern.base import ObjectDetector, PdfMiner
-from ..utils.detection_types import JsonDict
 from ..utils.error import ImageError
 from ..utils.transform import PadTransform
-from .base import PredictorPipelineComponent
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
 
 @pipeline_component_registry.register("ImageLayoutService")
-class ImageLayoutService(PredictorPipelineComponent):
+class ImageLayoutService(PipelineComponent):
     """
     Pipeline component for determining the layout. Which layout blocks are determined depends on the Detector and thus
     usually on the data set on which the Detector was pre-trained. If the Detector has been trained on Publaynet, these
@@ -63,6 +64,7 @@ class ImageLayoutService(PredictorPipelineComponent):
         :param crop_image: Do not only populate `ImageAnnotation.image` but also crop the detected block according
                            to its bounding box and populate the resulting sub image to
                            `ImageAnnotation.image.image`.
+        :param padder: If not `None`, will apply the padder to the image before prediction and inverse apply the padder
         :param skip_if_layout_extracted: When `True` will check, if there are already `ImageAnnotation` of a category
                                          available that will be predicted by the `layout_detector`. If yes, will skip
                                          the prediction process.
@@ -71,11 +73,12 @@
         self.crop_image = crop_image
         self.padder = padder
         self.skip_if_layout_extracted = skip_if_layout_extracted
-        super().__init__(self._get_name(layout_detector.name), layout_detector)
+        self.predictor = layout_detector
+        super().__init__(self._get_name(layout_detector.name), self.predictor.model_id)
 
     def serve(self, dp: Image) -> None:
         if self.skip_if_layout_extracted:
-            categories = self.predictor.possible_categories()  # type: ignore
+            categories = self.predictor.get_category_names()
             anns = dp.get_annotation(category_names=categories)
             if anns:
                 return
@@ -84,7 +87,7 @@
         np_image = dp.image
         if self.padder:
             np_image = self.padder.apply_image(np_image)
-        detect_result_list = self.predictor.predict(np_image)  # type: ignore
+        detect_result_list = self.predictor.predict(np_image)
         if self.padder and detect_result_list:
             boxes = np.array([detect_result.box for detect_result in detect_result_list])
             boxes_orig = self.padder.inverse_apply_coords(boxes)
@@ -94,22 +97,20 @@
         for detect_result in detect_result_list:
             self.dp_manager.set_image_annotation(detect_result, to_image=self.to_image, crop_image=self.crop_image)
 
-    def get_meta_annotation(self) -> JsonDict:
-        assert isinstance(self.predictor, (ObjectDetector, PdfMiner))
-        return dict(
-            [
-                ("image_annotations", self.predictor.possible_categories()),
-                ("sub_categories", {}),
-                ("relationships", {}),
-                ("summaries", []),
-            ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        if not isinstance(self.predictor, (ObjectDetector, PdfMiner)):
+            raise TypeError(
+                f"self.predictor must be of type ObjectDetector or PdfMiner but is of type " f"{type(self.predictor)}"
+            )
+        return MetaAnnotation(
+            image_annotations=self.predictor.get_category_names(), sub_categories={}, relationships={}, summaries=()
         )
 
     @staticmethod
     def _get_name(predictor_name: str) -> str:
         return f"image_{predictor_name}"
 
-    def clone(self) -> "PredictorPipelineComponent":
+    def clone(self) -> ImageLayoutService:
         predictor = self.predictor.clone()
         padder_clone = None
         if self.padder:
@@ -117,3 +118,6 @@ class ImageLayoutService(PredictorPipelineComponent):
         if not isinstance(predictor, ObjectDetector):
             raise TypeError(f"predictor must be of type ObjectDetector, but is of type {type(predictor)}")
         return self.__class__(predictor, self.to_image, self.crop_image, padder_clone, self.skip_if_layout_extracted)
+
+    def clear_predictor(self) -> None:
+        self.predictor.clear_model()
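Note: per the clone method above, the positional constructor order is (layout_detector, to_image, crop_image, padder, skip_if_layout_extracted). A hedged construction sketch (detector stands in for any ObjectDetector; the PadTransform argument names are an assumption, not shown in this diff):

    from deepdoctection.pipe.layout import ImageLayoutService
    from deepdoctection.utils.transform import PadTransform

    layout_service = ImageLayoutService(
        layout_detector=detector,
        to_image=True,    # populate ImageAnnotation.image
        crop_image=False,
        padder=PadTransform(top=60, right=60, bottom=60, left=60),  # hypothetical args
        skip_if_layout_extracted=True,  # skip prediction if annotations already exist
    )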