deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/lm.py CHANGED
@@ -37,31 +37,34 @@ if TYPE_CHECKING:
 @pipeline_component_registry.register("LMTokenClassifierService")
 class LMTokenClassifierService(PipelineComponent):
     """
-    Pipeline component for token classification
+    Module for token classification pipeline.

-    **Example**
+    This module provides pipeline components for token and sequence classification using language models.

-        # setting up compulsory ocr service
-        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-        tess = TesseractOcrDetector(tesseract_config_path)
-        ocr_service = TextExtractionService(tess)
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)

-        # hf tokenizer and token classifier
-        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
-        layoutlm = HFLayoutLmTokenClassifier(categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
-                                                          'E-header', 'E-question', 'I-answer', 'I-header',
-                                                          'I-question', 'O', 'S-answer', 'S-header', 'S-question'])
+        # hf tokenizer and token classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmTokenClassifier(categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                          'E-header', 'E-question', 'I-answer', 'I-header',
+                                                          'I-question', 'O', 'S-answer', 'S-header', 'S-question'])

-        # token classification service
-        layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
+        # token classification service
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)

-        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, layoutlm_service])

-        path = "path/to/some/form"
-        df = pipe.analyze(path=path)
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)

-        for dp in df:
-            ...
+        for dp in df:
+            ...
+        ```
     """

     def __init__(
@@ -76,32 +79,65 @@ class LMTokenClassifierService(PipelineComponent):
         sliding_window_stride: int = 0,
     ) -> None:
         """
-        :param tokenizer: Token classifier, typing allows currently anything. This will be changed in the future
-        :param language_model: language model token classifier
-        :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
-                        `do_not_pad`.
-        :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
-                           maximum acceptable input length for the model if that argument is not provided. This will
-                           truncate token by token, removing a token from the longest sequence in the pair if a pair of
-                           sequences (or a batch of pairs) is provided.
-                           If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
-                           model maximum admissible input size).
-        :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
-                           can be returned as an additional batch element. Not that in this case, the number of input
-                           batch samples will be smaller than the output batch samples.
-        :param use_other_as_default_category: When predicting token classes, it might be possible that some words might
-                                              not get sent to the model because they are categorized as not eligible
-                                              token (e.g. empty string). If set to `True` it will assign all words
-                                              without token the `BioTag.outside` token.
-        :param segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly
-                                  for models that have been trained on segments rather than words.
-                                  Choose a single or a sequence of layout segments to use their bounding boxes. Note, that
-                                  the layout segments need to have a child-relationship with words. If a word does not
-                                  appear as child, it will use the word bounding box.
-        :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length, a sliding
-                                      windows will be created with each window having max_length sequence input. When using
-                                      `sliding_window_stride=0` no strides will be created, otherwise it will create slides
-                                      with windows shifted `sliding_window_stride` to the right.
+        Pipeline component for token classification.
+
+        Example:
+            ```python
+            # setting up compulsory ocr service
+            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+            tess = TesseractOcrDetector(tesseract_config_path)
+            ocr_service = TextExtractionService(tess)
+
+            # hf tokenizer and token classifier
+            tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+            layoutlm = HFLayoutLmTokenClassifier(categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                              'E-header', 'E-question', 'I-answer', 'I-header',
+                                                              'I-question', 'O', 'S-answer', 'S-header', 'S-question'])
+
+            # token classification service
+            layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+
+            pipe = DoctectionPipe(pipeline_component_list=[ocr_service, layoutlm_service])
+
+            path = "path/to/some/form"
+            df = pipe.analyze(path=path)
+
+            for dp in df:
+                ...
+            ```
+
+        Args:
+            tokenizer: `Token classifier`, typing allows currently anything. This will be changed in the future.
+            language_model: `language model token classifier`.
+            padding: A padding strategy to be passed to the `tokenizer`. Must be either `max_length`, `longest` or
+                     `do_not_pad`.
+            truncation: If `True` will truncate to a maximum length specified with the argument `max_length` or to
+                        the maximum acceptable input length for the model if that argument is not provided. This
+                        will truncate token by token, removing a token from the longest sequence in the pair if a
+                        pair of sequences (or a batch of pairs) is provided. If `False` then no truncation (i.e.,
+                        can output batch with sequence lengths greater than the model maximum admissible input
+                        size).
+            return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                                       can be returned as an additional batch element. Note that in this case, the
+                                       number of input batch samples will be smaller than the output batch samples.
+            use_other_as_default_category: When predicting token classes, it might be possible that some words
+                                           might not get sent to the model because they are categorized as not
+                                           eligible token (e.g. empty string). If set to `True` it will assign all
+                                           words without token the `BioTag.outside` token.
+            segment_positions: Using bounding boxes of segment instead of words improves model accuracy
+                               significantly for models that have been trained on segments rather than words.
+                               Choose a single or a sequence of layout segments to use their bounding boxes. Note,
+                               that the layout segments need to have a child-relationship with words. If a word
+                               does not appear as child, it will use the word bounding box.
+            sliding_window_stride: If the output of the `tokenizer` exceeds the `max_length` sequence length, a
+                                   sliding window will be created with each window having `max_length` sequence
+                                   input. When using `sliding_window_stride=0` no strides will be created,
+                                   otherwise it will create slides with windows shifted `sliding_window_stride` to
+                                   the right.
+
+        Note:
+            If `use_other_as_default_category` is set, words without eligible tokens will be assigned the
+            `BioTag.outside` token.
         """
         self.language_model = language_model
         self.padding = padding
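For orientation, the keyword arguments documented in the `Args` block above can be combined roughly as follows. This is an illustrative sketch, not part of the diff: the argument values and the category list are examples, and the import paths merely follow the module layout shown in the file list above.

```python
from transformers import LayoutLMTokenizerFast

from deepdoctection.extern.hflayoutlm import HFLayoutLmTokenClassifier
from deepdoctection.pipe.lm import LMTokenClassifierService

# Tokenizer and token classifier, as in the docstring example above.
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
layoutlm = HFLayoutLmTokenClassifier(categories=['B-answer', 'B-header', 'B-question', 'E-answer',
                                                 'E-header', 'E-question', 'I-answer', 'I-header',
                                                 'I-question', 'O', 'S-answer', 'S-header', 'S-question'])

# Illustrative values for the keyword arguments documented in the new Args block.
layoutlm_service = LMTokenClassifierService(
    tokenizer,
    layoutlm,
    padding="max_length",                # one of `max_length`, `longest`, `do_not_pad`
    truncation=True,                     # truncate to the model's maximum admissible input length
    return_overflowing_tokens=False,     # keep one output sample per input sample
    use_other_as_default_category=True,  # words never sent to the model get `BioTag.outside`
    sliding_window_stride=0,             # 0 disables sliding windows over long sequences
)
```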
@@ -134,6 +170,15 @@ class LMTokenClassifierService(PipelineComponent):
         self._init_sanity_checks()

     def serve(self, dp: Image) -> None:
+        """
+        Serve the token classification pipeline on a given `Image`.
+
+        Args:
+            dp: The `Image` to process.
+
+        Returns:
+            None
+        """
         lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
         if lm_input is None:
             return
@@ -231,30 +276,32 @@ class LMTokenClassifierService(PipelineComponent):
 @pipeline_component_registry.register("LMSequenceClassifierService")
 class LMSequenceClassifierService(PipelineComponent):
     """
-    Pipeline component for sequence classification
+    Pipeline component for sequence classification.

-    **Example**
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)

-        # setting up compulsory ocr service
-        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-        tess = TesseractOcrDetector(tesseract_config_path)
-        ocr_service = TextExtractionService(tess)
+        # hf tokenizer and token classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json", "path/to/model.bin",
+                                                categories=["handwritten", "presentation", "resume"])

-        # hf tokenizer and token classifier
-        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
-        layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json","path/to/model.bin",
-                                                categories=["handwritten", "presentation", "resume"])
+        # token classification service
+        layoutlm_service = LMSequenceClassifierService(tokenizer, layoutlm)

-        # token classification service
-        layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, layoutlm_service])

-        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)

-        path = "path/to/some/form"
-        df = pipe.analyze(path=path)
+        for dp in df:
+            ...
+        ```

-        for dp in df:
-            ...


     """
@@ -268,22 +315,26 @@ class LMSequenceClassifierService(PipelineComponent):
         use_other_as_default_category: bool = False,
     ) -> None:
         """
-        :param tokenizer: Tokenizer, typing allows currently anything. This will be changed in the future
-        :param language_model: language model sequence classifier
-        :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
-                        `do_not_pad`.
-        :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
-                           maximum acceptable input length for the model if that argument is not provided. This will
-                           truncate token by token, removing a token from the longest sequence in the pair if a pair of
-                           sequences (or a batch of pairs) is provided.
-                           If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
-                           model maximum admissible input size).
-        :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
-                           can be returned as an additional batch element. Not that in this case, the number of input
-                           batch samples will be smaller than the output batch samples.
-        :param use_other_as_default_category: When predicting document classes, it might be possible that some pages
-                                              do not get sent to the model because they are empty. If set to `True` it
-                                              will assign images with no features the category `TokenClasses.OTHER`.
+        Args:
+            tokenizer: `Tokenizer`, typing allows currently anything. This will be changed in the future.
+            language_model: `language model sequence classifier`.
+            padding: A padding strategy to be passed to the `tokenizer`. Must be either `max_length`, `longest` or
+                     `do_not_pad`.
+            truncation: If `True` will truncate to a maximum length specified with the argument `max_length` or to the
+                        maximum acceptable input length for the model if that argument is not provided. This will
+                        truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                        sequences (or a batch of pairs) is provided. If `False` then no truncation (i.e., can output
+                        batch with sequence lengths greater than the model maximum admissible input size).
+            return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                                       can be returned as an additional batch element. Note that in this case, the
+                                       number of input batch samples will be smaller than the output batch samples.
+            use_other_as_default_category: When predicting document classes, it might be possible that some pages do
+                                           not get sent to the model because they are empty. If set to `True` it will
+                                           assign images with no features the category `TokenClasses.OTHER`.
+
+        Note:
+            If `use_other_as_default_category` is set, images with no features will be assigned the `TokenClasses.OTHER`
+            category.

         """
         self.language_model = language_model
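Analogously, a hedged sketch of wiring the sequence classifier with the arguments documented above. The config and weight paths are the same placeholders used in the docstring example, and the argument values are illustrative rather than recommended settings.

```python
from transformers import LayoutLMTokenizerFast

from deepdoctection.extern.hflayoutlm import HFLayoutLmSequenceClassifier
from deepdoctection.pipe.lm import LMSequenceClassifierService

# Tokenizer and sequence classifier, following the docstring example (paths are placeholders).
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json", "path/to/model.bin",
                                        categories=["handwritten", "presentation", "resume"])

# Illustrative values for the documented keyword arguments.
sequence_service = LMSequenceClassifierService(
    tokenizer,
    layoutlm,
    padding="max_length",
    truncation=True,
    return_overflowing_tokens=False,
    use_other_as_default_category=True,  # empty pages fall back to `TokenClasses.OTHER`
)
```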
@@ -305,6 +356,15 @@ class LMSequenceClassifierService(PipelineComponent):
         self._init_sanity_checks()

     def serve(self, dp: Image) -> None:
+        """
+        Serve the sequence classification pipeline on a given `Image`.
+
+        Args:
+            dp: The `Image` to process.
+
+        Returns:
+            None
+        """
         lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
         lm_output = None
         if lm_input is None:
@@ -347,7 +407,15 @@ class LMSequenceClassifierService(PipelineComponent):

     @staticmethod
     def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
-        """Replacing eval functions"""
+        """
+        Get the function to map images to features for the language model.
+
+        Args:
+            mapping_str: The mapping function name as a string.
+
+        Returns:
+            A callable that maps an `Image` to features.
+        """
         return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
             mapping_str
         ]
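The method body above is a plain string-keyed dictionary lookup that replaces the earlier eval-style resolution hinted at by the removed docstring. Below is a self-contained sketch of the same dispatch pattern; the two mapping functions here are hypothetical stand-ins (the real `image_to_layoutlm_features` and `image_to_lm_features` are imported in `lm.py`), included only so the snippet runs on its own.

```python
from typing import Any, Callable, Dict, Optional

# Hypothetical stand-ins for the package's mapping factories: each binds keyword
# arguments and returns a callable that maps a datapoint to model features.
def image_to_layoutlm_features(**kwargs: Any) -> Callable[[Any], Optional[Any]]:
    return lambda dp: {"mapping": "layoutlm", "datapoint": dp}

def image_to_lm_features(**kwargs: Any) -> Callable[[Any], Optional[Any]]:
    return lambda dp: {"mapping": "lm", "datapoint": dp}

_MAPPINGS: Dict[str, Callable[..., Callable[[Any], Optional[Any]]]] = {
    "image_to_layoutlm_features": image_to_layoutlm_features,
    "image_to_lm_features": image_to_lm_features,
}

def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Any], Optional[Any]]]:
    # Explicit dict lookup instead of eval(): unknown names fail fast with a KeyError.
    return _MAPPINGS[mapping_str]

# Resolve the factory by name, bind its keyword arguments, then apply it to a datapoint,
# mirroring `self.mapping_to_lm_input_func(**self.required_kwargs)(dp)` in `serve`.
factory = image_to_features_func("image_to_lm_features")
to_features = factory()
print(to_features("dummy_datapoint"))
```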