PyPI - deepdoctection - Versions diffs - 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl - Mend

deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (124) hide show

deepdoctection/__init__.py +4 -2
deepdoctection/analyzer/__init__.py +2 -1
deepdoctection/analyzer/config.py +919 -0
deepdoctection/analyzer/dd.py +36 -62
deepdoctection/analyzer/factory.py +311 -141
deepdoctection/configs/conf_dd_one.yaml +100 -44
deepdoctection/configs/profiles.jsonl +32 -0
deepdoctection/dataflow/__init__.py +9 -6
deepdoctection/dataflow/base.py +33 -15
deepdoctection/dataflow/common.py +96 -75
deepdoctection/dataflow/custom.py +36 -29
deepdoctection/dataflow/custom_serialize.py +135 -91
deepdoctection/dataflow/parallel_map.py +33 -31
deepdoctection/dataflow/serialize.py +15 -10
deepdoctection/dataflow/stats.py +41 -28
deepdoctection/datapoint/__init__.py +4 -6
deepdoctection/datapoint/annotation.py +104 -66
deepdoctection/datapoint/box.py +190 -130
deepdoctection/datapoint/convert.py +66 -39
deepdoctection/datapoint/image.py +151 -95
deepdoctection/datapoint/view.py +383 -236
deepdoctection/datasets/__init__.py +2 -6
deepdoctection/datasets/adapter.py +11 -11
deepdoctection/datasets/base.py +118 -81
deepdoctection/datasets/dataflow_builder.py +18 -12
deepdoctection/datasets/info.py +76 -57
deepdoctection/datasets/instances/__init__.py +6 -2
deepdoctection/datasets/instances/doclaynet.py +17 -14
deepdoctection/datasets/instances/fintabnet.py +16 -22
deepdoctection/datasets/instances/funsd.py +11 -6
deepdoctection/datasets/instances/iiitar13k.py +9 -9
deepdoctection/datasets/instances/layouttest.py +9 -9
deepdoctection/datasets/instances/publaynet.py +9 -9
deepdoctection/datasets/instances/pubtables1m.py +13 -13
deepdoctection/datasets/instances/pubtabnet.py +13 -15
deepdoctection/datasets/instances/rvlcdip.py +8 -8
deepdoctection/datasets/instances/xfund.py +11 -9
deepdoctection/datasets/registry.py +18 -11
deepdoctection/datasets/save.py +12 -11
deepdoctection/eval/__init__.py +3 -2
deepdoctection/eval/accmetric.py +72 -52
deepdoctection/eval/base.py +29 -10
deepdoctection/eval/cocometric.py +14 -12
deepdoctection/eval/eval.py +56 -41
deepdoctection/eval/registry.py +6 -3
deepdoctection/eval/tedsmetric.py +24 -9
deepdoctection/eval/tp_eval_callback.py +13 -12
deepdoctection/extern/__init__.py +1 -1
deepdoctection/extern/base.py +176 -97
deepdoctection/extern/d2detect.py +127 -92
deepdoctection/extern/deskew.py +19 -10
deepdoctection/extern/doctrocr.py +162 -108
deepdoctection/extern/fastlang.py +25 -17
deepdoctection/extern/hfdetr.py +137 -60
deepdoctection/extern/hflayoutlm.py +329 -248
deepdoctection/extern/hflm.py +67 -33
deepdoctection/extern/model.py +108 -762
deepdoctection/extern/pdftext.py +37 -12
deepdoctection/extern/pt/nms.py +15 -1
deepdoctection/extern/pt/ptutils.py +13 -9
deepdoctection/extern/tessocr.py +87 -54
deepdoctection/extern/texocr.py +29 -14
deepdoctection/extern/tp/tfutils.py +36 -8
deepdoctection/extern/tp/tpcompat.py +54 -16
deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
deepdoctection/extern/tpdetect.py +4 -2
deepdoctection/mapper/__init__.py +1 -1
deepdoctection/mapper/cats.py +117 -76
deepdoctection/mapper/cocostruct.py +35 -17
deepdoctection/mapper/d2struct.py +56 -29
deepdoctection/mapper/hfstruct.py +32 -19
deepdoctection/mapper/laylmstruct.py +221 -185
deepdoctection/mapper/maputils.py +71 -35
deepdoctection/mapper/match.py +76 -62
deepdoctection/mapper/misc.py +68 -44
deepdoctection/mapper/pascalstruct.py +13 -12
deepdoctection/mapper/prodigystruct.py +33 -19
deepdoctection/mapper/pubstruct.py +42 -32
deepdoctection/mapper/tpstruct.py +39 -19
deepdoctection/mapper/xfundstruct.py +20 -13
deepdoctection/pipe/__init__.py +1 -2
deepdoctection/pipe/anngen.py +104 -62
deepdoctection/pipe/base.py +226 -107
deepdoctection/pipe/common.py +206 -123
deepdoctection/pipe/concurrency.py +74 -47
deepdoctection/pipe/doctectionpipe.py +108 -47
deepdoctection/pipe/language.py +41 -24
deepdoctection/pipe/layout.py +45 -18
deepdoctection/pipe/lm.py +146 -78
deepdoctection/pipe/order.py +205 -119
deepdoctection/pipe/refine.py +111 -63
deepdoctection/pipe/registry.py +1 -1
deepdoctection/pipe/segment.py +213 -142
deepdoctection/pipe/sub_layout.py +76 -46
deepdoctection/pipe/text.py +52 -33
deepdoctection/pipe/transform.py +8 -6
deepdoctection/train/d2_frcnn_train.py +87 -69
deepdoctection/train/hf_detr_train.py +72 -40
deepdoctection/train/hf_layoutlm_train.py +85 -46
deepdoctection/train/tp_frcnn_train.py +56 -28
deepdoctection/utils/concurrency.py +59 -16
deepdoctection/utils/context.py +40 -19
deepdoctection/utils/develop.py +26 -17
deepdoctection/utils/env_info.py +86 -37
deepdoctection/utils/error.py +16 -10
deepdoctection/utils/file_utils.py +246 -71
deepdoctection/utils/fs.py +162 -43
deepdoctection/utils/identifier.py +29 -16
deepdoctection/utils/logger.py +49 -32
deepdoctection/utils/metacfg.py +83 -21
deepdoctection/utils/pdf_utils.py +119 -62
deepdoctection/utils/settings.py +24 -10
deepdoctection/utils/tqdm.py +10 -5
deepdoctection/utils/transform.py +182 -46
deepdoctection/utils/utils.py +61 -28
deepdoctection/utils/viz.py +150 -104
deepdoctection-0.43.1.dist-info/METADATA +376 -0
deepdoctection-0.43.1.dist-info/RECORD +149 -0
deepdoctection/analyzer/_config.py +0 -146
deepdoctection-0.42.1.dist-info/METADATA +0 -431
deepdoctection-0.42.1.dist-info/RECORD +0 -148
{deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
{deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
{deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0

deepdoctection/extern/hflm.py CHANGED Viewed

@@ -16,7 +16,7 @@
 # limitations under the License.
 """
-Wrapper for the Hugging Face Language Model for sequence and token  classification
+Wrapper for the HF Language Model for sequence and token classification
 """
 from __future__ import annotations
@@ -48,11 +48,14 @@ def predict_sequence_classes(
     model: Union[XLMRobertaForSequenceClassification],
 ) -> SequenceClassResult:
     """
-    :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
-    :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
-    :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
-    :param model: layoutlm model for sequence classification
-    :return: SequenceClassResult
+    Args:
+        input_ids: Token converted to ids to be taken from `XLMRobertaTokenizer`
+        attention_mask: The associated attention masks from padded sequences taken from `XLMRobertaTokenizer`
+        token_type_ids: Torch tensor of token type ids taken from `XLMRobertaTokenizer`
+        model: `XLMRobertaForSequenceClassification` model for sequence classification
+    Returns:
+        `SequenceClassResult`
     """
     outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
@@ -65,7 +68,7 @@ def predict_sequence_classes(
 class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     """
-    Abstract base class for wrapping Bert-type models  for sequence classification into the deepdoctection framework.
+    Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
     """
     def __init__(
@@ -115,27 +118,51 @@ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     @staticmethod
     def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
-        """Returns the name of the model"""
+        """
+        Returns the name of the model
+        Args:
+            path_weights: Path to model weights
+            architecture: Architecture name
+        Returns:
+            str: Model name
+        """
         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
     @staticmethod
     def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
-        """A refinement for adding the tokenizer class name to the model configs.
+        """
+        A refinement for adding the tokenizer class name to the model configs.
+        Args:
+            model_class_name: The model name, e.g. `model.__class__.__name__`
+            use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
-        :param model_class_name: The model name, e.g. model.__class__.__name__
-        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+        Returns:
+            str: Tokenizer class name
         """
         tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
         return tokenizer.__class__.__name__
     @staticmethod
     def image_to_raw_features_mapping() -> str:
-        """Returns the mapping function to convert images into raw features."""
+        """
+        Returns the mapping function to convert images into raw features.
+        Returns:
+            str: Name of the mapping function
+        """
         return "image_to_raw_lm_features"
     @staticmethod
     def image_to_features_mapping() -> str:
-        """Returns the mapping function to convert images into features."""
+        """
+        Returns the mapping function to convert images into features.
+        Returns:
+            str: Name of the mapping function
+        """
         return "image_to_lm_features"
@@ -147,28 +174,29 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
     Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
     classification and other things please use another model of the family.
-    **Example**
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
-            # setting up compulsory ocr service
-            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-            tess = TesseractOcrDetector(tesseract_config_path)
-            ocr_service = TextExtractionService(tess)
+        # hf tokenizer and token classifier
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
+        roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
+                                              categories=["handwritten", "presentation", "resume"])
-            # hf tokenizer and token classifier
-            tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
-            roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
-                                                  categories=["handwritten", "presentation", "resume"])
+        # token classification service
+        roberta_service = LMSequenceClassifierService(tokenizer,roberta)
-            # token classification service
-            roberta_service = LMSequenceClassifierService(tokenizer,roberta)
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
-            pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
-            path = "path/to/some/form"
-            df = pipe.analyze(path=path)
-            for dp in df:
-                ...
+        for dp in df:
+            ...
+        ```
     """
     def __init__(
@@ -209,9 +237,12 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
         """
         Get the inner (wrapped) model.
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :return: 'nn.Module'
+        Args:
+            path_config_json: path to .json config file
+            path_weights: path to model artifact
+        Returns:
+            `XLMRobertaForSequenceClassification`
         """
         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
         return XLMRobertaForSequenceClassification.from_pretrained(
@@ -223,6 +254,9 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
         """
         Add some default arguments that might be necessary when preparing a sample. Overwrite this method
         for some custom setting.
+        Returns:
+            JsonDict: Dictionary with default arguments
         """
         return {}

deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl