deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic.

Files changed (124)
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +919 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +162 -108
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +205 -119
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +26 -17
  104. deepdoctection/utils/env_info.py +86 -37
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -71
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.1.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.1.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
@@ -41,53 +41,64 @@ from .registry import pipeline_component_registry
 @pipeline_component_registry.register("ImageCroppingService")
 class MultiThreadPipelineComponent(PipelineComponent):
     """
-    Running a pipeline component in multiple thread to increase through put. Datapoints will be queued
-    and processed once calling the `start`.
+    This module provides functionality for running pipeline components in multiple threads to increase throughput.
+    Datapoints are queued and processed once calling the `start` method.
 
-    The number of threads is derived from the list of pipeline components. It makes no sense to create the various
-    components.
+    Note:
+        The number of threads is derived from the list of `pipeline_components`. It makes no sense to create various
+        components.
 
-    Think of the pipeline component as an asynchronous process. Because the entire data flow is loaded into memory
-    before the process is started, storage capacity must be guaranteed.
+        Think of the pipeline component as an asynchronous process. Because the entire data flow is loaded into memory
+        before the process is started, storage capacity must be guaranteed.
 
-    If pre- and post-processing are to be carried out before the task within the wrapped pipeline component, this can
-    also be transferred as a function. These tasks are also assigned to the threads.
+        If pre- and post-processing are to be carried out before the task within the wrapped pipeline component, this
+        can also be transferred as a function. These tasks are also assigned to the threads.
 
-    Note that the order in the dataflow and when returning lists is generally is no longer retained.
+        The order in the dataflow and when returning lists is generally no longer retained.
 
+    Example:
+        ```python
         some_component = SubImageLayoutService(some_predictor, some_category)
-        some_component:clone = some_component.clone()
+        some_component_clone = some_component.clone()
 
-        multi_thread_comp = MultiThreadPipelineComponent(pipeline_components=[some_component,some_component_clone],
-                                                         pre_proc_func=maybe_load_image,
-                                                         post_proc_func=maybe_remove_image)
+        multi_thread_comp = MultiThreadPipelineComponent(
+            pipeline_components=[some_component, some_component_clone],
+            pre_proc_func=maybe_load_image,
+            post_proc_func=maybe_remove_image
+        )
 
         multi_thread_comp.put_task(some_dataflow)
         output_list = multi_thread_comp.start()
+        ```
 
-    You cannot run `MultiThreadPipelineComponent` in `DoctectionPipe` as this requires batching datapoints and neither
-    can you run `MultiThreadPipelineComponent` in combination with a humble 'PipelineComponent` unless you take care
-    of batching/unbatching between each component by yourself. The easiest way to build a pipeline with
-    `MultiThreadPipelineComponent` can be accomplished as follows:
+    Info:
+        You cannot run `MultiThreadPipelineComponent` in `DoctectionPipe` as this requires batching datapoints and
+        neither can you run `MultiThreadPipelineComponent` in combination with a humble `PipelineComponent` unless you
+        take care of batching/unbatching between each component by yourself. The easiest way to build a pipeline with
+        `MultiThreadPipelineComponent` can be accomplished as follows:
 
+    Example:
+        ```python
         # define the pipeline component
-        ome_component = SubImageLayoutService(some_predictor, some_category)
-        some_component:clone = some_component.clone()
+        some_component = SubImageLayoutService(some_predictor, some_category)
+        some_component_clone = some_component.clone()
 
         # creating two threads, one for each component
-        multi_thread_comp = MultiThreadPipelineComponent(pipeline_components=[some_component,some_component_clone],
-                                                         pre_proc_func=maybe_load_image,
-                                                         post_proc_func=maybe_remove_image)
+        multi_thread_comp = MultiThreadPipelineComponent(
+            pipeline_components=[some_component, some_component_clone],
+            pre_proc_func=maybe_load_image,
+            post_proc_func=maybe_remove_image
+        )
 
         # currying `to_image`, so that you can call it in `MapData`.
         @curry
-        def _to_image(dp,dpi):
-            return to_image(dp,dpi)
+        def _to_image(dp, dpi):
+            return to_image(dp, dpi)
 
         # set-up the dataflow/stream, e.g.
         df = SerializerPdfDoc.load(path, max_datapoints=max_datapoints)
         df = MapData(df, to_image(dpi=300))
-        df = BatchData(df, batch_size=32,remainder=True)
+        df = BatchData(df, batch_size=32, remainder=True)
         df = multi_thread_comp.predict_dataflow(df)
         df = FlattenData(df)
         df = MapData(df, lambda x: x[0])
@@ -95,7 +106,8 @@ class MultiThreadPipelineComponent(PipelineComponent):
         df.reset_state()
 
         for dp in df:
-            ...
+            ...
+        ```
     """
 
     def __init__(
@@ -106,13 +118,12 @@ class MultiThreadPipelineComponent(PipelineComponent):
         max_datapoints: Optional[int] = None,
     ) -> None:
         """
-        :param pipeline_components: list of identical pipeline component. Number of threads created is determined by
-                                    `len`
-        :param pre_proc_func: pass a function, that reads and returns an image. Will execute before entering the pipe
-                              component
-        :param post_proc_func: pass a function, that reads and returns an image. Will execute after entering the pipe
-                               component
-        :param max_datapoints: max datapoints to process
+        Args:
+            pipeline_components: List of identical `PipelineComponent`. Number of threads created is determined by
+                `len`.
+            pre_proc_func: Function that reads and returns an image. Will execute before entering the pipe component.
+            post_proc_func: Function that reads and returns an image. Will execute after entering the pipe component.
+            max_datapoints: Maximum datapoints to process.
         """
 
         self.pipe_components = pipeline_components
@@ -125,20 +136,29 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
     def put_task(self, df: Union[DataFlow, list[Image]]) -> None:
         """
-        Put a dataflow or a list of datapoints to the queue. Note, that the process will not start before `start`
-        is called. If you do not know how many datapoints will be cached, use max_datapoint to ensure no oom.
+        Put a `DataFlow` or a list of datapoints to the queue.
+
+        Note:
+            The process will not start before `start` is called. If you do not know how many datapoints will be
+            cached, use `max_datapoints` to ensure no OOM.
 
-        :param df: A list or a dataflow of Image
+        Args:
+            df: A list or a `DataFlow` of `Image`.
         """

         self._put_datapoints_to_queue(df)
 
     def start(self) -> list[Image]:
         """
-        Creates a worker for each component and starts processing the data points of the queue. A list of the results
-        is returned once all points in the queue have been processed.
+        Creates a worker for each component and starts processing the datapoints of the queue.
 
-        :return: A list of Images
+        Example:
+            ```python
+            output_list = multi_thread_comp.start()
+            ```
+
+        Returns:
+            A list of `Image` objects.
         """
         with ThreadPoolExecutor(
             max_workers=len(self.pipe_components), thread_name_prefix="EvalWorker"
@@ -195,11 +215,15 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
     def pass_datapoints(self, dpts: list[Image]) -> list[Image]:
         """
-        Putting the list of datapoints into a thread-save queue and start for each pipeline
-        component a separate thread. It will return a list of datapoints where the order of appearance
-        of the output might be not the same as the input.
-        :param dpts:
-        :return:
+        Put the list of datapoints into a thread-safe queue and start a separate thread for each pipeline component.
+
+        The order of appearance of the output might not be the same as the input.
+
+        Args:
+            dpts: List of `Image` datapoints.
+
+        Returns:
+            List of processed `Image` datapoints.
         """
         for dp in dpts:
             self.input_queue.put(dp)
@@ -212,10 +236,13 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
     def predict_dataflow(self, df: DataFlow) -> DataFlow:
         """
-        Mapping a datapoint via `pass_datapoint` within a dataflow pipeline
+        Map a datapoint via `pass_datapoints` within a dataflow pipeline.
+
+        Args:
+            df: An input `DataFlow`.
 
-        :param df: An input dataflow
-        :return: A output dataflow
+        Returns:
+            An output `DataFlow`.
         """
         return MapData(df, self.pass_datapoints)
 
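The hunks above move the `MultiThreadPipelineComponent` docstrings to `Args:`/`Returns:` sections. To make the documented workflow concrete, here is a minimal sketch of the batching pattern the class docstring describes: two cloned components, a batched dataflow, and a flattened result. It assumes deepdoctection re-exports these names in its flat `dd` namespace and uses placeholder arguments for the predictor and category; it is an illustration, not code taken from the package.

```python
import deepdoctection as dd  # flat-namespace re-exports assumed


def run_multithreaded(pdf_path, predictor, category, max_datapoints=None):
    """Sketch of the pattern from the MultiThreadPipelineComponent docstring above."""
    component = dd.SubImageLayoutService(predictor, category)
    component_clone = component.clone()

    multi_thread_comp = dd.MultiThreadPipelineComponent(
        pipeline_components=[component, component_clone],  # one thread per component
        pre_proc_func=dd.maybe_load_image,
        post_proc_func=dd.maybe_remove_image,
    )

    df = dd.SerializerPdfDoc.load(pdf_path, max_datapoints=max_datapoints)
    df = dd.MapData(df, lambda dp: dd.to_image(dp, dpi=300))  # page -> Image
    df = dd.BatchData(df, batch_size=8, remainder=True)       # the component consumes batches
    df = multi_thread_comp.predict_dataflow(df)
    df = dd.FlattenData(df)                                   # unbatch again
    df = dd.MapData(df, lambda x: x[0])
    df.reset_state()
    return list(df)
```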
@@ -16,7 +16,7 @@
 # limitations under the License.
 
 """
-Module for pipeline with Tensorpack predictors
+Module for document processing pipeline
 """
 
 import os
@@ -42,6 +42,21 @@ from .common import PageParsingService
 def _collect_from_kwargs(
     **kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
 ) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
+    """
+    Collects and validates keyword arguments for dataflow construction.
+
+    Args:
+        **kwargs: Keyword arguments that may include `path`, `bytes`, `dataset_dataflow`, `shuffle`, `file_type`, and
+            `max_datapoints`.
+
+    Returns:
+        Tuple containing `path`, `file_type`, `shuffle`, `max_datapoints`, `doc_path`, `dataset_dataflow`, and
+        `b_bytes`.
+
+    Raises:
+        ValueError: If neither `path` nor `dataset_dataflow` is provided, or if required arguments are missing.
+        TypeError: If argument types are incorrect.
+    """
     b_bytes = kwargs.get("bytes")
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")
@@ -115,10 +130,35 @@ def _to_image(
     width: Optional[int] = None,
     height: Optional[int] = None,
 ) -> Optional[Image]:
+    """
+    Converts a data point to an `Image` object.
+
+    Args:
+        dp: The data point, which can be a string or a mapping.
+        dpi: Dots per inch for the image.
+        width: Width of the image.
+        height: Height of the image.
+
+    Returns:
+        An `Image` object or None.
+    """
     return to_image(dp, dpi, width, height)
 
 
 def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
+    """
+    Creates a dataflow from a PDF document.
+
+    Args:
+        path: Path to the PDF document.
+        max_datapoints: Maximum number of data points to consider.
+
+    Returns:
+        A `DataFlow` object.
+
+    Raises:
+        FileExistsError: If the file does not exist.
+    """
     if not os.path.isfile(path):
         raise FileExistsError(f"{path} not a file")
 
@@ -129,28 +169,24 @@ def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None)
 
 class DoctectionPipe(Pipeline):
     """
-    Prototype for a document layout pipeline. Contains implementation for loading document types (images in directory,
-    single PDF document, dataflow from datasets), conversions in dataflows and building a pipeline.
-
+    Prototype for a document layout pipeline.
 
+    Contains implementation for loading document types (images in directory, single PDF document, dataflow from
+    datasets), conversions in dataflows, and building a pipeline.
 
     See `deepdoctection.analyzer.dd` for a concrete implementation.
 
     See also the explanations in `base.Pipeline`.
 
-    By default, `DoctectionPipe` will instantiate a default `PageParsingService`
-
-        PageParsingService(text_container=LayoutType.word,
-                           text_block_categories=[LayoutType.title,
-                                                  LayoutType.text,
-                                                  LayoutType.list,
-                                                  LayoutType.table])
-
-    but you can overwrite the current setting:
+    By default, `DoctectionPipe` will instantiate a default `PageParsingService`:
 
-    **Example:**
+    Example:
+        ```python
+        pipe = DoctectionPipe([comp_1, com_2], PageParsingService(text_container= my_custom_setting))
+        ```
 
-        pipe = DoctectionPipe([comp_1, com_2], PageParsingService(text_container= my_custom_setting))
+    Note:
+        You can overwrite the current setting by providing a custom `PageParsingService`.
     """
 
     def __init__(
@@ -158,8 +194,17 @@ class DoctectionPipe(Pipeline):
         pipeline_component_list: List[PipelineComponent],
         page_parsing_service: Optional[PageParsingService] = None,
     ):
+        """
+        Initializes the `DoctectionPipe`.
+
+        Args:
+            pipeline_component_list: List of pipeline components.
+            page_parsing_service: Optional custom `PageParsingService`.
+        """
         self.page_parser = (
-            PageParsingService(text_container=IMAGE_DEFAULTS["text_container"])
+            PageParsingService(
+                text_container=IMAGE_DEFAULTS.TEXT_CONTAINER,
+            )
             if page_parsing_service is None
             else page_parsing_service
        )
@@ -216,13 +261,19 @@ class DoctectionPipe(Pipeline):
         shuffle: bool = False,
     ) -> DataFlow:
         """
-        Processing method for directories
+        Processing method for directories.
 
-        :param path: path to directory
-        :param file_type: file type to consider (single str or list of strings)
-        :param max_datapoints: max number of datapoints to consider
-        :param shuffle: Shuffle file names in order to stream them randomly
-        :return: dataflow
+        Args:
+            path: Path to directory.
+            file_type: File type to consider (single string or list of strings).
+            max_datapoints: Maximum number of data points to consider.
+            shuffle: Whether to shuffle file names for random streaming.
+
+        Returns:
+            A `DataFlow` object.
+
+        Raises:
+            NotADirectoryError: If the path is not a directory.
         """
         if not os.path.isdir(path):
             raise NotADirectoryError(f"{os.fspath(path)} not a directory")
@@ -232,11 +283,14 @@ class DoctectionPipe(Pipeline):
     @staticmethod
     def doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
-        Processing method for documents
+        Processing method for documents.
 
-        :param path: path to directory
-        :param max_datapoints: max number of datapoints to consider
-        :return: dataflow
+        Args:
+            path: Path to the document.
+            max_datapoints: Maximum number of data points to consider.
+
+        Returns:
+            A `DataFlow` object.
         """
         return _doc_to_dataflow(path, max_datapoints)
 
@@ -245,13 +299,19 @@ class DoctectionPipe(Pipeline):
         path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
     ) -> DataFlow:
         """
-        Converts a bytes object to a dataflow
+        Converts a bytes object to a dataflow.
+
+        Args:
+            path: Path to directory or an image file.
+            b_bytes: Bytes object.
+            file_type: File type, e.g., `.pdf`, `.jpg`, or a list of image file types.
+            max_datapoints: Maximum number of data points to consider.
 
-        :param path: path to directory or an image file
-        :param b_bytes: bytes object
-        :param file_type: e.g. ".pdf", ".jpg" or [".jpg", ".png", ".jpeg", ".tif"]
-        :param max_datapoints: max number of datapoints to consider
-        :return: DataFlow
+        Returns:
+            A `DataFlow` object.
+
+        Raises:
+            ValueError: If the combination of arguments is not supported.
         """
 
         file_name = os.path.split(path)[1]
@@ -280,10 +340,13 @@ class DoctectionPipe(Pipeline):
 
     def dataflow_to_page(self, df: DataFlow) -> DataFlow:
         """
-        Converts a dataflow of images to a dataflow of pages
+        Converts a dataflow of images to a dataflow of pages.
+
+        Args:
+            df: Dataflow.
 
-        :param df: Dataflow
-        :return: Dataflow
+        Returns:
+            A dataflow of pages.
         """
         return self.page_parser.predict_dataflow(df)
 
@@ -291,18 +354,16 @@ class DoctectionPipe(Pipeline):
         self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
     ) -> DataFlow:
         """
-        `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
-
-        `kwargs key path:` A path to a directory in which either image documents or pdf files are located. It is
-                           assumed that the pdf documents consist of only one page. If there are multiple pages,
-                           only the first page is processed through the pipeline.
-                           Alternatively, a path to a pdf document with multiple pages.
-
-        `kwargs key bytes:` A bytes object of an image
-
-        `kwargs key file_type:` Selection of the file type, if: args:`file_type` is passed
-
-        `kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed
+        Args:
+            `kwargs:
+                dataset_dataflow (Dataflow):` Transfer a dataflow of a dataset via its dataflow builder
+                path (TypeOrStr):` A path to a directory in which either image documents or pdf files are located. It
+                    is assumed that the pdf documents consist of only one page. If there are multiple pages,
+                    only the first page is processed through the pipeline.
+                    Alternatively, a path to a pdf document with multiple pages.
+                bytes:` A bytes object of an image
+                file_type:` Selection of the file type, if: args:`file_type` is passed
+                max_datapoints:` Stops processing as soon as max_datapoints images have been processed
 
         :return: dataflow
         """
@@ -21,7 +21,7 @@ Module for language detection pipeline component
 from typing import Optional, Sequence
 
 from ..datapoint.image import Image
-from ..datapoint.view import Page
+from ..datapoint.view import ImageDefaults, Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
@@ -36,20 +36,22 @@ class LanguageDetectionService(PipelineComponent):
 
     There are two ways to use this component:
 
-    - By analyzing the already extracted and ordered text. For this purpose, a page object is parsed internally and
-      the full text is passed to the language_detector. This approach provides the greatest precision.
-
-    - By previous text extraction with an object detector and subsequent transfer of concatenated word elements to the
-      language_detector. Only one OCR detector can be used here. This method can be used, for example, to select an OCR
-      detector that specializes in a language using. Although the word recognition is less accurate
-      when choosing any detector, the results are confident enough to rely on the results, especially when extracting
-      longer text passages. So, a TextExtractionService, for example, can be selected as the subsequent pipeline
-      component. The words determined by the OCR detector are not transferred to the image object.
-
-        lang_detector = FasttextLangDetector(path_weights,profile.categories)
-        component = LanguageDetectionService(lang_detector, text_container="word",
-                                             text_block_names=["text","title","table"])
-
+    1. By analyzing the already extracted and ordered text. For this purpose, a `Page` object is parsed internally and
+       the full text is passed to the `language_detector`. This approach provides the greatest precision.
+
+    2. By previous text extraction with an object detector and subsequent transfer of concatenated word elements to the
+       `language_detector`. Only one OCR detector can be used here. This method can be used, for example, to select an OCR
+       detector that specializes in a language. Although the word recognition is less accurate
+       when choosing any detector, the results are confident enough to rely on, especially when extracting
+       longer text passages. So, a `TextExtractionService`, for example, can be selected as the subsequent pipeline
+       component. The words determined by the OCR detector are not transferred to the image object.
+
+    Example:
+        ```python
+        lang_detector = FasttextLangDetector(path_weights, profile.categories)
+        component = LanguageDetectionService(lang_detector, text_container="word",
+                                             text_block_names=["text", "title", "table"])
+        ```
     """
 
     def __init__(
@@ -60,18 +62,20 @@ class LanguageDetectionService(PipelineComponent):
         floating_text_block_categories: Optional[Sequence[TypeOrStr]] = None,
     ):
         """
-        :param language_detector: Detector to determine text
-        :param text_container: text container, needed to generate the reading order. Not necessary when passing a
-                               text detector.
-        :param text_detector: Object detector to extract text. You cannot use a Pdfminer here.
-
-        :param floating_text_block_categories: text blocks, needed for generating the reading order. Not necessary
-                                               when passing a text detector.
+        Initializes a `LanguageDetectionService` instance.
+
+        Args:
+            language_detector: Detector to determine text.
+            text_container: Text container, needed to generate the reading order. Not necessary when passing a
+                `text_detector`.
+            text_detector: Object detector to extract text. You cannot use a Pdfminer here.
+            floating_text_block_categories: Text blocks, needed for generating the reading order. Not necessary
+                when passing a `text_detector`.
         """
 
         self.predictor = language_detector
         self.text_detector = text_detector
-        self.text_container = get_type(text_container) if text_container is not None else text_container
+        self.text_container = get_type(text_container) if text_container is not None else ImageDefaults.TEXT_CONTAINER
         self.floating_text_block_categories = (
             tuple(get_type(text_block) for text_block in floating_text_block_categories)
             if (floating_text_block_categories is not None)
@@ -81,8 +85,21 @@ class LanguageDetectionService(PipelineComponent):
         super().__init__(self._get_name(self.predictor.name))
 
     def serve(self, dp: Image) -> None:
+        """
+        Serves the language detection on the given `Image`.
+
+        Args:
+            dp: The `Image` datapoint to process.
+
+        Raises:
+            ImageError: If `dp.image` is `None` and a `text_detector` is used.
+        """
         if self.text_detector is None:
-            page = Page.from_image(dp, self.text_container, self.floating_text_block_categories)
+            page = Page.from_image(
+                image_orig=dp,
+                text_container=self.text_container,
+                floating_text_block_categories=self.floating_text_block_categories,
+            )
             text = page.text_no_line_break
         else:
             if dp.image is None:
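The `LanguageDetectionService` hunks renumber the two usage modes, default the text container to `ImageDefaults.TEXT_CONTAINER`, and call `Page.from_image` with keyword arguments. Below is a short sketch of wiring the component, following the docstring's own `FasttextLangDetector` example; the weights path, the category mapping, the keyword `floating_text_block_categories` (taken from the new signature rather than the docstring's `text_block_names`), and the flat `dd` imports are assumptions.

```python
import deepdoctection as dd  # flat-namespace re-exports assumed


def build_language_component(path_weights, categories):
    """Sketch following the LanguageDetectionService docstring example above."""
    lang_detector = dd.FasttextLangDetector(path_weights, categories)
    return dd.LanguageDetectionService(
        lang_detector,
        text_container="word",
        floating_text_block_categories=["text", "title", "table"],  # keyword name per the new signature
    )
```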
@@ -43,11 +43,20 @@ def skip_if_category_or_service_extracted(
     """
     Skip the processing of the pipeline component if the category or service is already extracted.
 
-    **Example**
-
+    Example:
+        ```python
         detector = # some detector
         item_component = ImageLayoutService(detector)
         item_component.set_inbound_filter(skip_if_category_or_service_extracted(detector.get_categories(as_dict=False)))
+        ```
+
+    Args:
+        dp: The `Image` datapoint to check.
+        category_names: Optional category names or sequence of `ObjectTypes` to check for.
+        service_ids: Optional service IDs or sequence of service IDs to check for.
+
+    Returns:
+        Whether to skip processing based on existing annotation.
     """
 
     if dp.get_annotation(category_names=category_names, service_ids=service_ids):
@@ -58,18 +67,21 @@
 @pipeline_component_registry.register("ImageLayoutService")
 class ImageLayoutService(PipelineComponent):
     """
-    Pipeline component for determining the layout. Which layout blocks are determined depends on the Detector and thus
-    usually on the data set on which the Detector was pre-trained. If the Detector has been trained on Publaynet, these
-    are layouts such as text, title, table, list and figure. If the Detector has been trained on DocBank, these are
-    rather Abstract, Author, Caption, Equation, Figure, Footer, List, Paragraph, Reference, Section, Table, Title.
+    Pipeline component for determining the layout.
 
-    The component is usually at the beginning of the pipeline. Cropping of the layout blocks can be selected to simplify
-    further processing.
+    Which layout blocks are determined depends on the `Detector` and thus usually on the data set on which the
+    `Detector` was pre-trained. If the `Detector` has been trained on Publaynet, these are layouts such as text, title
+    , table, list and figure. If the `Detector` has been trained on DocBank, these are rather Abstract, Author,
+    Caption, Equation, Figure, Footer, List, Paragraph, Reference, Section, Table, Title.
 
-    **Example**
+    The component is usually at the beginning of the pipeline. Cropping of the layout blocks can be selected to
+    simplify further processing.
 
-        d_items = TPFrcnnDetector(item_config_path, item_weights_path, {1: 'row', 2: 'column'})
-        item_component = ImageLayoutService(d_items)
+    Example:
+        ```python
+        d_items = TPFrcnnDetector(item_config_path, item_weights_path, {1: 'row', 2: 'column'})
+        item_component = ImageLayoutService(d_items)
+        ```
     """
 
     def __init__(
@@ -80,13 +92,19 @@ class ImageLayoutService(PipelineComponent):
         padder: Optional[PadTransform] = None,
     ):
         """
-        :param layout_detector: object detector
-        :param to_image: Generate an image for each detected block, e.g. populate `ImageAnnotation.image`. Useful,
-                         if you want to process only some blocks in a subsequent pipeline component.
-        :param crop_image: Do not only populate `ImageAnnotation.image` but also crop the detected block according
-                           to its bounding box and populate the resulting sub image to
-                           `ImageAnnotation.image.image`.
-        :param padder: If not `None`, will apply the padder to the image before prediction and inverse apply the padder
+        Initializes the `ImageLayoutService`.
+
+        Args:
+            layout_detector: The object detector.
+            to_image: Whether to generate an image for each detected block, e.g. populate `ImageAnnotation.image`.
+                Useful if you want to process only some blocks in a subsequent pipeline component.
+            crop_image: Whether to crop the detected block according to its bounding box and populate the resulting sub
+                image to `ImageAnnotation.image.image`.
+            padder: If not `None`, will apply the padder to the image before prediction and inverse apply the padder.
+
+        Note:
+            If `padder` is provided, it will be applied before prediction and inversely applied to the coordinates
+            after prediction.
         """
         self.to_image = to_image
         self.crop_image = crop_image
@@ -95,6 +113,15 @@ class ImageLayoutService(PipelineComponent):
         super().__init__(self._get_name(layout_detector.name), self.predictor.model_id)
 
     def serve(self, dp: Image) -> None:
+        """
+        Serve the pipeline component on the given `Image`.
+
+        Args:
+            dp: The `Image` datapoint to process.
+
+        Raises:
+            ImageError: If `dp.image` is `None`.
+        """
         if dp.image is None:
             raise ImageError("image cannot be None")
         np_image = dp.image
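Finally, the `ImageLayoutService` hunks convert the class and `serve` docstrings and document the `padder` round trip. The sketch below combines the docstring example with the `skip_if_category_or_service_extracted` inbound-filter example shown above; the config and weights paths are placeholders, the `TPFrcnnDetector` requires the TensorFlow backend, and the flat `dd` imports are assumptions rather than part of the diff.

```python
import deepdoctection as dd  # flat-namespace re-exports assumed


def build_layout_component(config_path, weights_path):
    """Sketch based on the ImageLayoutService docstring example above."""
    detector = dd.TPFrcnnDetector(config_path, weights_path, {1: "row", 2: "column"})
    component = dd.ImageLayoutService(detector, to_image=True, crop_image=True)
    # Skip datapoints whose layout categories were already extracted, mirroring the
    # inbound-filter example in the diff above.
    component.set_inbound_filter(
        dd.skip_if_category_or_service_extracted(detector.get_categories(as_dict=False))
    )
    return component
```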