deepdoctection-0.31-py3-none-any.whl → deepdoctection-0.33-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +16 -29
- deepdoctection/analyzer/dd.py +70 -59
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +41 -56
- deepdoctection/datapoint/box.py +9 -8
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +56 -44
- deepdoctection/datapoint/view.py +245 -150
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +35 -26
- deepdoctection/datasets/base.py +14 -12
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +24 -26
- deepdoctection/datasets/instances/doclaynet.py +51 -51
- deepdoctection/datasets/instances/fintabnet.py +46 -46
- deepdoctection/datasets/instances/funsd.py +25 -24
- deepdoctection/datasets/instances/iiitar13k.py +13 -10
- deepdoctection/datasets/instances/layouttest.py +4 -3
- deepdoctection/datasets/instances/publaynet.py +5 -5
- deepdoctection/datasets/instances/pubtables1m.py +24 -21
- deepdoctection/datasets/instances/pubtabnet.py +32 -30
- deepdoctection/datasets/instances/rvlcdip.py +30 -30
- deepdoctection/datasets/instances/xfund.py +26 -26
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +15 -13
- deepdoctection/eval/eval.py +41 -37
- deepdoctection/eval/tedsmetric.py +30 -23
- deepdoctection/eval/tp_eval_callback.py +16 -19
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +85 -113
- deepdoctection/extern/deskew.py +14 -11
- deepdoctection/extern/doctrocr.py +141 -130
- deepdoctection/extern/fastlang.py +27 -18
- deepdoctection/extern/hfdetr.py +71 -62
- deepdoctection/extern/hflayoutlm.py +504 -211
- deepdoctection/extern/hflm.py +230 -0
- deepdoctection/extern/model.py +488 -302
- deepdoctection/extern/pdftext.py +23 -19
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +29 -19
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +18 -18
- deepdoctection/extern/tp/tfutils.py +57 -9
- deepdoctection/extern/tp/tpcompat.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +45 -53
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/cats.py +27 -29
- deepdoctection/mapper/cocostruct.py +10 -10
- deepdoctection/mapper/d2struct.py +27 -26
- deepdoctection/mapper/hfstruct.py +13 -8
- deepdoctection/mapper/laylmstruct.py +178 -37
- deepdoctection/mapper/maputils.py +12 -11
- deepdoctection/mapper/match.py +2 -2
- deepdoctection/mapper/misc.py +11 -9
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +5 -5
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +5 -5
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +12 -14
- deepdoctection/pipe/base.py +52 -106
- deepdoctection/pipe/common.py +72 -59
- deepdoctection/pipe/concurrency.py +16 -11
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +20 -16
- deepdoctection/pipe/lm.py +75 -105
- deepdoctection/pipe/order.py +194 -89
- deepdoctection/pipe/refine.py +111 -124
- deepdoctection/pipe/segment.py +156 -161
- deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +48 -41
- deepdoctection/train/hf_detr_train.py +41 -30
- deepdoctection/train/hf_layoutlm_train.py +153 -135
- deepdoctection/train/tp_frcnn_train.py +32 -31
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +87 -125
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +22 -18
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +16 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +11 -11
- deepdoctection/utils/settings.py +185 -181
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +74 -72
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
- deepdoctection-0.33.dist-info/RECORD +146 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.31.dist-info/RECORD +0 -144
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
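Two entries in the list above amount to an import-path migration: `deepdoctection/utils/detection_types.py` (68 lines) is deleted and `deepdoctection/utils/types.py` (104 lines) is added, and the `lm.py` diff below drops `from ..utils.detection_types import JsonDict` accordingly. A minimal compatibility sketch for code that must run against both versions; that `JsonDict` kept its name in `utils.types` is an assumption inferred from the removed import, not verified here:

```python
# Hedged compatibility shim: `utils.detection_types` existed in 0.31, `utils.types`
# in 0.33. Whether every alias (here: JsonDict) survived the move unchanged is an
# assumption based on the removed import in pipe/lm.py.
try:
    from deepdoctection.utils.types import JsonDict  # 0.33 layout
except ImportError:
    from deepdoctection.utils.detection_types import JsonDict  # 0.31 layout
```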
deepdoctection/pipe/lm.py
CHANGED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# File:
+# File: lm.py
 
 # Copyright 2021 Dr. Janis Meyer. All rights reserved.
 #
@@ -18,60 +18,23 @@
 """
 Module for token classification pipeline
 """
+from __future__ import annotations
 
 from copy import copy
-from typing import Any,
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
 
 from ..datapoint.image import Image
-from ..
-from ..mapper.laylmstruct import image_to_layoutlm_features
-from ..utils.detection_types import JsonDict
-from ..utils.file_utils import transformers_available
+from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
-from .base import
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
-if
-    from
-
-    _ARCHITECTURES_TO_TOKENIZER = {
-        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-    }
-
-
-def get_tokenizer_from_architecture(architecture_name: str, use_xlm_tokenizer: bool) -> Any:
-    """
-    We do not use the tokenizer for a particular model that the transformer library provides. Thie mapping therefore
-    returns the tokenizer that should be used for a particular model.
-
-    :param architecture_name: The model as stated in the transformer library.
-    :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-    :return: Tokenizer instance to use.
-    """
-    return _ARCHITECTURES_TO_TOKENIZER[(architecture_name, use_xlm_tokenizer)]
+if TYPE_CHECKING:
+    from ..extern.hflayoutlm import HfLayoutSequenceModels, HfLayoutTokenModels
 
 
 @pipeline_component_registry.register("LMTokenClassifierService")
-class LMTokenClassifierService(LanguageModelPipelineComponent):
+class LMTokenClassifierService(PipelineComponent):
     """
     Pipeline component for token classification
 
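The removed block above hard-coded a `(architecture, use_xlm_tokenizer) → tokenizer` table and instantiated every tokenizer eagerly at import time; 0.33 instead derives the expected tokenizer from the model config (see the rewritten `_init_sanity_checks` further down). A minimal sketch of the new check, assuming `model.config.tokenizer_class` is populated for the checkpoints deepdoctection wraps; `model` and `tokenizer` stand for any Hugging Face model/tokenizer pair:

```python
# Sketch of the 0.33-style check: compare the tokenizer class name recorded in
# the model config against the tokenizer actually passed in. Nothing is
# instantiated at import time, unlike the removed _ARCHITECTURES_TO_TOKENIZER table.
def check_tokenizer_matches(model, tokenizer) -> None:
    expected = model.config.tokenizer_class  # assumption: set on the wrapped checkpoints
    actual = tokenizer.__class__.__name__
    if expected != actual:
        raise TypeError(f"You want to use {actual} but you should use {expected} in this framework")
```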
@@ -103,7 +66,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model:
+        language_model: HfLayoutTokenModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -147,14 +110,16 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         self.segment_positions = segment_positions
         self.sliding_window_stride = sliding_window_stride
         if self.use_other_as_default_category:
-            categories_name_as_key = {val: key for key, val in self.language_model.categories.items()}
+            categories_name_as_key = {val: key for key, val in self.language_model.categories.categories.items()}
             self.default_key: ObjectTypes
-            if BioTag.
-                self.default_key = BioTag.
+            if BioTag.OUTSIDE in categories_name_as_key:
+                self.default_key = BioTag.OUTSIDE
             else:
-                self.default_key = TokenClasses.
+                self.default_key = TokenClasses.OTHER
             self.other_name_as_key = {self.default_key: categories_name_as_key[self.default_key]}
-
+        self.tokenizer = tokenizer
+        self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), self.language_model.model_id)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -164,7 +129,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             "segment_positions": self.segment_positions,
             "sliding_window_stride": self.sliding_window_stride,
         }
-        self.required_kwargs.update(self.language_model.
+        self.required_kwargs.update(self.language_model.default_kwargs_for_image_to_features_mapping())
         self._init_sanity_checks()
 
     def serve(self, dp: Image) -> None:
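Two `__init__` changes are visible here: `language_model.categories` is now a container whose raw mapping sits one level deeper (`categories.categories`), and the keyword defaults for the feature mapping now come from the model itself via `default_kwargs_for_image_to_features_mapping()`. A sketch of the default-key resolution under the new layout; `language_model` stands for any `HfLayoutTokenModels` instance:

```python
from deepdoctection.utils.settings import BioTag, TokenClasses

def resolve_default_key(language_model):
    # 0.33: the id -> name mapping moved from `categories` to `categories.categories`
    name_as_key = {val: key for key, val in language_model.categories.categories.items()}
    # prefer the BIO outside tag if the model was trained with one, else the OTHER class
    default = BioTag.OUTSIDE if BioTag.OUTSIDE in name_as_key else TokenClasses.OTHER
    return default, name_as_key[default]
```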
@@ -182,7 +147,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             and not token.token.startswith("##")
         ]
 
-        words_populated:
+        words_populated: list[str] = []
         for token in lm_output:
             if token.uuid not in words_populated:
                 if token.class_name == token.semantic_name:
@@ -190,35 +155,37 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             else:
                 token_class_name_id = None
             self.dp_manager.set_category_annotation(
-                token.semantic_name, token_class_name_id, WordType.
+                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid
             )
-            self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.
+            self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.TAG, token.uuid)
             self.dp_manager.set_category_annotation(
-                token.class_name, token.class_id, WordType.
+                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid
             )
             words_populated.append(token.uuid)
 
         if self.use_other_as_default_category:
-            word_anns = dp.get_annotation(LayoutType.
+            word_anns = dp.get_annotation(LayoutType.WORD)
             for word in word_anns:
-                if WordType.
+                if WordType.TOKEN_CLASS not in word.sub_categories:
                     self.dp_manager.set_category_annotation(
-                        TokenClasses.
+                        TokenClasses.OTHER,
                         self.other_name_as_key[self.default_key],
-                        WordType.
+                        WordType.TOKEN_CLASS,
                         word.annotation_id,
                     )
-                if WordType.
-                    self.dp_manager.set_category_annotation(BioTag.
-                if WordType.
+                if WordType.TAG not in word.sub_categories:
+                    self.dp_manager.set_category_annotation(BioTag.OUTSIDE, None, WordType.TAG, word.annotation_id)
+                if WordType.TOKEN_TAG not in word.sub_categories:
                     self.dp_manager.set_category_annotation(
                         self.default_key,
                         self.other_name_as_key[self.default_key],
-                        WordType.
+                        WordType.TOKEN_TAG,
                         word.annotation_id,
                     )
 
-    def clone(self) ->
+    def clone(self) -> LMTokenClassifierService:
+        # ToDo: replace copying of tokenizer with a proper clone method. Otherwise we cannot run the evaluation with
+        # multiple threads
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
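The `serve` rewrite also documents the settings overhaul (`utils/settings.py +185 -181` in the list above): enum members are now spelled UPPER_CASE. The removed lines are truncated in this rendering right after `WordType.`, but `PageType.document_type` surviving on a removed line further down supports the lowercase-to-uppercase reading. A sketch of reading the new sub-categories back from a datapoint; that the `get_sub_category` accessor is unchanged in 0.33 is an assumption:

```python
from deepdoctection.datapoint.image import Image
from deepdoctection.utils.settings import LayoutType, WordType

def token_classes(dp: Image):
    """Collect token-class sub-categories, 0.33 spelling (0.31 used
    LayoutType.word / WordType.token_class instead)."""
    return [
        word.get_sub_category(WordType.TOKEN_CLASS)  # assumption: accessor kept in 0.33
        for word in dp.get_annotation(LayoutType.WORD)
        if WordType.TOKEN_CLASS in word.sub_categories
    ]
```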
@@ -230,36 +197,38 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             self.sliding_window_stride,
         )
 
-    def get_meta_annotation(self) ->
-        return
-
-
-
-
-            ("summaries", []),
-        ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(
+            image_annotations=(),
+            sub_categories={LayoutType.WORD: {WordType.TOKEN_CLASS, WordType.TAG, WordType.TOKEN_TAG}},
+            relationships={},
+            summaries=(),
         )
 
     def _get_name(self) -> str:
         return f"lm_token_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-
-
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
             raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
 
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
+    def clear_predictor(self) -> None:
+        self.language_model.clear_model()
 
 @pipeline_component_registry.register("LMSequenceClassifierService")
-class LMSequenceClassifierService(LanguageModelPipelineComponent):
+class LMSequenceClassifierService(PipelineComponent):
     """
     Pipeline component for sequence classification
 
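`get_meta_annotation` now returns a typed `MetaAnnotation` (imported from `.base` at the top of the diff) instead of the old list of key/value pairs. For a custom component the contract would look like the following, modeled directly on the hunk above; field types beyond what the diff shows are assumptions:

```python
from deepdoctection.pipe.base import MetaAnnotation
from deepdoctection.utils.settings import LayoutType, WordType

# Mirrors LMTokenClassifierService.get_meta_annotation: the component announces
# which sub-categories it will attach to WORD annotations; empty containers
# elsewhere. Tuple-vs-dict defaults are taken from the diff, not the class definition.
meta = MetaAnnotation(
    image_annotations=(),
    sub_categories={LayoutType.WORD: {WordType.TOKEN_CLASS, WordType.TAG, WordType.TOKEN_TAG}},
    relationships={},
    summaries=(),
)
```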
@@ -291,7 +260,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model:
+        language_model: HfLayoutSequenceModels,
        padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -315,7 +284,9 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
-
+        self.tokenizer = tokenizer
+        self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), self.language_model.model_id)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -323,7 +294,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             "return_overflowing_tokens": self.return_overflowing_tokens,
             "return_tensors": "pt",
         }
-        self.required_kwargs.update(self.language_model.
+        self.required_kwargs.update(self.language_model.default_kwargs_for_image_to_features_mapping())
         self._init_sanity_checks()
 
     def serve(self, dp: Image) -> None:
@@ -332,10 +303,10 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             return
         lm_output = self.language_model.predict(**lm_input)
         self.dp_manager.set_summary_annotation(
-            PageType.
+            PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
         )
 
-    def clone(self) ->
+    def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
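The sequence classifier now writes its prediction as a summary annotation keyed by `PageType.DOCUMENT_TYPE`, including the score as a fifth argument. Reading it back might look like the following; that `Image.summary` exposes the stored sub-category through `get_sub_category` is an assumption based on the 0.31 datapoint API, not something this diff shows:

```python
from deepdoctection.datapoint.image import Image
from deepdoctection.utils.settings import PageType

def predicted_document_class(dp: Image):
    # assumption: the summary written by serve() is retrievable this way in 0.33
    cat = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
    return cat.category_name, cat.score
```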
@@ -344,29 +315,28 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             self.return_overflowing_tokens,
         )
 
-    def get_meta_annotation(self) ->
-        return
-            ("image_annotations", []),
-            ("sub_categories", {}),
-            ("relationships", {}),
-            ("summaries", [PageType.document_type]),
-        ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(
+            image_annotations=(), sub_categories={}, relationships={}, summaries=(PageType.DOCUMENT_TYPE,)
         )
 
     def _get_name(self) -> str:
         return f"lm_sequence_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-
-
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
             raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
+    def clear_predictor(self) -> None:
+        self.language_model.clear_model()
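Both services gain the same pair of methods: `clear_predictor`, which releases model state via `clear_model()`, and `image_to_features_func`, whose docstring ("Replacing eval functions") names the intent: the model reports its feature mapper as a string, and the service resolves it through an explicit whitelist rather than `eval`. The pattern in isolation, as a generic framework-free sketch:

```python
from typing import Callable, Dict

# Only names present in the registry resolve, so an arbitrary string can never
# execute code the way eval(mapping_str) could.
def make_resolver(registry: Dict[str, Callable]) -> Callable[[str], Callable]:
    def resolve(name: str) -> Callable:
        try:
            return registry[name]
        except KeyError:
            raise KeyError(f"unknown mapping {name!r}; allowed: {sorted(registry)}") from None
    return resolve

# Usage, mirroring lm.py (the two mapper names are real, taken from the diff):
# resolve = make_resolver({"image_to_layoutlm_features": image_to_layoutlm_features,
#                          "image_to_lm_features": image_to_lm_features})
```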
|