deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions as released to one of the supported registries; it is provided for informational purposes only.


Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,7 @@
  """
  HF Layoutlm model for diverse downstream tasks.
  """
+ from __future__ import annotations

  from abc import ABC
  from collections import defaultdict
@@ -26,14 +27,10 @@ from pathlib import Path
  from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union

  import numpy as np
+ from lazy_imports import try_import

  from ..utils.detection_types import JsonDict, Requirement
- from ..utils.file_utils import (
-     get_pytorch_requirement,
-     get_transformers_requirement,
-     pytorch_available,
-     transformers_available,
- )
+ from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
  from ..utils.settings import (
      BioTag,
      ObjectTypes,
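
Note on the import refactor: the `try_import` guard (from the `lazy_imports` package) replaces the old `pytorch_available()` / `transformers_available()` feature flags used below. Imports are still attempted at module load, but a missing dependency no longer has to be branched around; failures surface later, e.g. through `get_requirements()`. A minimal illustrative stand-in for the guard, not the library's actual implementation:

    from contextlib import contextmanager

    @contextmanager
    def try_import_sketch():
        # swallow ImportError so that importing the module never fails;
        # the missing dependency is reported later, e.g. via get_requirements()
        try:
            yield
        except ImportError:
            pass

    with try_import_sketch():
        import torch  # only usable if PyTorch is actually installed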
@@ -44,39 +41,85 @@ from ..utils.settings import (
      token_class_with_tag_to_token_class_and_tag,
  )
  from .base import LMSequenceClassifier, LMTokenClassifier, SequenceClassResult, TokenClassResult
- from .pt.ptutils import set_torch_auto_device
+ from .pt.ptutils import get_torch_device

- if pytorch_available():
+ with try_import() as pt_import_guard:
      import torch
      import torch.nn.functional as F
-     from torch import Tensor  # pylint: disable=W0611

- if transformers_available():
+ with try_import() as tr_import_guard:
      from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # type: ignore
      from transformers import (
          LayoutLMForSequenceClassification,
          LayoutLMForTokenClassification,
+         LayoutLMTokenizerFast,
          LayoutLMv2Config,
          LayoutLMv2ForSequenceClassification,
          LayoutLMv2ForTokenClassification,
          LayoutLMv3Config,
          LayoutLMv3ForSequenceClassification,
          LayoutLMv3ForTokenClassification,
+         LiltForSequenceClassification,
+         LiltForTokenClassification,
          PretrainedConfig,
+         RobertaTokenizerFast,
+         XLMRobertaTokenizerFast,
      )


+ def get_tokenizer_from_model_class(model_class: str, use_xlm_tokenizer: bool) -> Any:
+     """
+     We do not use the tokenizer for a particular model that the transformers library provides. This mapping
+     therefore returns the tokenizer that should be used for a particular model.
+
+     :param model_class: The model class as named in the transformers library.
+     :param use_xlm_tokenizer: True if one uses LayoutXLM. (The model cannot be distinguished from LayoutLMv2.)
+     :return: Tokenizer instance to use.
+     """
+     return {
+         ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+             "microsoft/layoutlm-base-uncased"
+         ),
+         ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+             "microsoft/layoutlm-base-uncased"
+         ),
+         ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+             "microsoft/layoutlm-base-uncased"
+         ),
+         ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+             "microsoft/layoutlm-base-uncased"
+         ),
+         ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+         ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+         ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+             "roberta-base", add_prefix_space=True
+         ),
+         ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+             "roberta-base", add_prefix_space=True
+         ),
+         ("LiltForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+         ("LiltForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+             "roberta-base", add_prefix_space=True
+         ),
+         ("LiltForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+         ("LiltForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+             "roberta-base", add_prefix_space=True
+         ),
+         ("XLMRobertaForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained(
+             "FacebookAI/xlm-roberta-base"
+         ),
+     }[(model_class, use_xlm_tokenizer)]
+
+
  def predict_token_classes(
      uuids: List[List[str]],
-     input_ids: "Tensor",
-     attention_mask: "Tensor",
-     token_type_ids: "Tensor",
-     boxes: "Tensor",
+     input_ids: torch.Tensor,
+     attention_mask: torch.Tensor,
+     token_type_ids: torch.Tensor,
+     boxes: torch.Tensor,
      tokens: List[List[str]],
-     model: Union[
-         "LayoutLMForTokenClassification", "LayoutLMv2ForTokenClassification", "LayoutLMv3ForTokenClassification"
-     ],
-     images: Optional["Tensor"] = None,
+     model: Union[LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification, LayoutLMv3ForTokenClassification],
+     images: Optional[torch.Tensor] = None,
  ) -> List[TokenClassResult]:
      """
      :param uuids: A list of uuids that correspond to a word that induces the resulting token
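
The mapping above is keyed by the pair `(model class name, use_xlm_tokenizer)`, so resolving a tokenizer is a plain dictionary lookup:

    # resolves to an XLMRobertaTokenizerFast loaded from "xlm-roberta-base"
    tokenizer = get_tokenizer_from_model_class("LiltForTokenClassification", use_xlm_tokenizer=True)

    # resolves to a RobertaTokenizerFast loaded from "roberta-base" with add_prefix_space=True
    tokenizer = get_tokenizer_from_model_class("LiltForTokenClassification", use_xlm_tokenizer=False)

Note that the dictionary literal is evaluated eagerly, so every call instantiates (and, on first use, downloads) all tokenizers in the table before the lookup selects one.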
@@ -129,26 +172,28 @@ def predict_token_classes(


  def predict_sequence_classes(
-     input_ids: "Tensor",
-     attention_mask: "Tensor",
-     token_type_ids: "Tensor",
-     boxes: "Tensor",
+     input_ids: torch.Tensor,
+     attention_mask: torch.Tensor,
+     token_type_ids: torch.Tensor,
+     boxes: torch.Tensor,
      model: Union[
-         "LayoutLMForSequenceClassification",
-         "LayoutLMv2ForSequenceClassification",
-         "LayoutLMv3ForSequenceClassification",
+         LayoutLMForSequenceClassification,
+         LayoutLMv2ForSequenceClassification,
+         LayoutLMv3ForSequenceClassification,
+         LiltForSequenceClassification,
      ],
-     images: Optional["Tensor"] = None,
+     images: Optional[torch.Tensor] = None,
  ) -> SequenceClassResult:
      """
      :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
      :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
      :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
      :param boxes: Torch tensor of bounding boxes of type 'xyxy'
-     :param model: layoutlm model for token classification
+     :param model: layoutlm model for sequence classification
      :param images: A list of torch image tensors or None
      :return: SequenceClassResult
      """
+
      if images is None:
          outputs = model(input_ids=input_ids, bbox=boxes, attention_mask=attention_mask, token_type_ids=token_type_ids)
      elif isinstance(model, LayoutLMv2ForSequenceClassification):
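
`predict_sequence_classes` dispatches on the model type because the vision-enabled variants expect the image tensor under different keyword names. A condensed sketch of the branching (the first branch is taken verbatim from the diff; the v2 and v3 kwarg names, `image` and `pixel_values`, follow the transformers API):

    if images is None:
        outputs = model(input_ids=input_ids, bbox=boxes, attention_mask=attention_mask, token_type_ids=token_type_ids)
    elif isinstance(model, LayoutLMv2ForSequenceClassification):
        # LayoutLMv2/LayoutXLM take the visual input as `image`
        outputs = model(input_ids=input_ids, bbox=boxes, image=images,
                        attention_mask=attention_mask, token_type_ids=token_type_ids)
    else:
        # LayoutLMv3 takes the visual input as `pixel_values`
        outputs = model(input_ids=input_ids, bbox=boxes, pixel_values=images,
                        attention_mask=attention_mask, token_type_ids=token_type_ids)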
@@ -177,7 +222,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
      Abstract base class for wrapping LayoutLM models for token classification into the deepdoctection framework.
      """

-     model: Union["LayoutLMForTokenClassification", "LayoutLMv2ForTokenClassification"]
+     model: Union[LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification]

      def __init__(
          self,
@@ -186,7 +231,8 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
          categories_semantics: Optional[Sequence[TypeOrStr]] = None,
          categories_bio: Optional[Sequence[TypeOrStr]] = None,
          categories: Optional[Mapping[str, TypeOrStr]] = None,
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
          """
          :param path_config_json: path to .json config file
@@ -198,9 +244,10 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
                                 consistent with detectors use only values>0. Conversion will be done internally.
          :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
          :param device: The device (cpu,"cuda"), where to place the model.
+         :param use_xlm_tokenizer: True if one uses LayoutXLM or a LiLT model built with an XLM language model, e.g.
+                                   info-xlm or roberta-xlm. (LayoutXLM cannot be distinguished from LayoutLMv2.)
          """

-         self.name = "_".join(Path(path_weights).parts[-3:])
          if categories is None:
              if categories_semantics is None:
                  raise ValueError("If categories is None then categories_semantics cannot be None")
@@ -219,11 +266,9 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
              self.categories = self._categories_orig_to_categories(
                  self.categories_semantics, self.categories_bio  # type: ignore
              )
-         if device is not None:
-             self.device = device
-         else:
-             self.device = set_torch_auto_device()
+         self.device = get_torch_device(device)
          self.model.to(self.device)
+         self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)

      @classmethod
      def get_requirements(cls) -> List[Requirement]:
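
`get_torch_device` folds the removed four-line branch into one helper and, per the new type hint, also accepts an already-constructed `torch.device`. An illustrative sketch of the assumed behavior (the actual helper lives in deepdoctection/extern/pt/ptutils.py):

    import torch

    def get_torch_device_sketch(device=None):
        # normalize None / "cpu" / "cuda" / torch.device into a torch.device,
        # preferring CUDA when nothing is specified
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        return torch.device(device)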
@@ -257,9 +302,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):

      def _validate_encodings(
          self, **encodings: Any
-     ) -> Tuple[
-         List[List[str]], List[str], "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", List[List[str]]
-     ]:
+     ) -> Tuple[List[List[str]], List[str], torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, List[List[str]]]:
          image_ids = encodings.get("image_ids", [])
          ann_ids = encodings.get("ann_ids")
          input_ids = encodings.get("input_ids")
@@ -292,7 +335,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):

          return ann_ids, image_ids, input_ids, attention_mask, token_type_ids, boxes, tokens

-     def clone(self) -> "HFLayoutLmTokenClassifierBase":
+     def clone(self) -> HFLayoutLmTokenClassifierBase:
          return self.__class__(
              self.path_config,
              self.path_weights,
@@ -302,6 +345,29 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
              self.device,
          )

+     @staticmethod
+     def get_name(path_weights: str, architecture: str) -> str:
+         """Returns the name of the model"""
+         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+     def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+         """A refinement for adding the tokenizer class name to the model configs.
+
+         :param use_xlm_tokenizer: Whether to use an XLM tokenizer.
+         """
+         tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+         return tokenizer.__class__.__name__
+
+     @staticmethod
+     def image_to_raw_features_mapping() -> str:
+         """Returns the mapping function to convert images into raw features."""
+         return "image_to_raw_layoutlm_features"
+
+     @staticmethod
+     def image_to_features_mapping() -> str:
+         """Returns the mapping function to convert images into features."""
+         return "image_to_layoutlm_features"
+

  class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
      """
@@ -344,7 +410,8 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
          categories_semantics: Optional[Sequence[TypeOrStr]] = None,
          categories_bio: Optional[Sequence[TypeOrStr]] = None,
          categories: Optional[Mapping[str, TypeOrStr]] = None,
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
          """
          :param path_config_json: path to .json config file
@@ -356,14 +423,17 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
                                 consistent with detectors use only values>0. Conversion will be done internally.
          :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
          :param device: The device (cpu,"cuda"), where to place the model.
+         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
+                                   tokenizer.
          """
-         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMForTokenClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLM")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(
+             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+         )

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
          """
          Launch inference on LayoutLm for token classification. Pass the following arguments

@@ -388,6 +458,18 @@

          return self._map_category_names(results)

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+

  class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
      """
@@ -432,7 +514,8 @@
          categories_semantics: Optional[Sequence[TypeOrStr]] = None,
          categories_bio: Optional[Sequence[TypeOrStr]] = None,
          categories: Optional[Mapping[str, TypeOrStr]] = None,
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
          """
          :param path_config_json: path to .json config file
@@ -444,14 +527,17 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
                                 consistent with detectors use only values>0. Conversion will be done internally.
          :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
          :param device: The device (cpu,"cuda"), where to place the model.
+         :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
+                                   default value.
          """
-         config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMv2ForTokenClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLMv2")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(
+             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+         )

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
          """
          Launch inference on LayoutLm for token classification. Pass the following arguments

@@ -489,6 +575,20 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
          """
          return {"image_width": 224, "image_height": 224}

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMv2ForTokenClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+

  class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
      """
@@ -533,7 +633,8 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
          categories_semantics: Optional[Sequence[TypeOrStr]] = None,
          categories_bio: Optional[Sequence[TypeOrStr]] = None,
          categories: Optional[Mapping[str, TypeOrStr]] = None,
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
          """
          :param path_config_json: path to .json config file
@@ -545,14 +646,17 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
                                 consistent with detectors use only values>0. Conversion will be done internally.
          :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
          :param device: The device (cpu,"cuda"), where to place the model.
+         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLMv3 model with a different
+                                   tokenizer.
          """
-         config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMv3ForTokenClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLMv3")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(
+             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+         )

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
          """
          Launch inference on LayoutLm for token classification. Pass the following arguments

@@ -592,77 +696,54 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
              "pixel_std": np.array(IMAGENET_DEFAULT_STD, dtype=np.float32),
          }

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMv3ForTokenClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+

  class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
      """
      Abstract base class for wrapping LayoutLM models for sequence classification into the deepdoctection framework.
      """

-     model: Union["LayoutLMForSequenceClassification", "LayoutLMv2ForSequenceClassification"]
+     model: Union[LayoutLMForSequenceClassification, LayoutLMv2ForSequenceClassification]

      def __init__(
          self,
          path_config_json: str,
          path_weights: str,
          categories: Mapping[str, TypeOrStr],
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
-         self.name = "_".join(Path(path_weights).parts[-3:])
          self.path_config = path_config_json
          self.path_weights = path_weights
          self.categories = copy(categories)  # type: ignore

-         if device is not None:
-             self.device = device
-         else:
-             self.device = set_torch_auto_device()
+         self.device = get_torch_device(device)
          self.model.to(self.device)
-
-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
-         input_ids = encodings.get("input_ids")
-         attention_mask = encodings.get("attention_mask")
-         token_type_ids = encodings.get("token_type_ids")
-         boxes = encodings.get("bbox")
-
-         if isinstance(input_ids, torch.Tensor):
-             input_ids = input_ids.to(self.device)
-         else:
-             raise ValueError(f"input_ids must be list but is {type(input_ids)}")
-         if isinstance(attention_mask, torch.Tensor):
-             attention_mask = attention_mask.to(self.device)
-         else:
-             raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
-         if isinstance(token_type_ids, torch.Tensor):
-             token_type_ids = token_type_ids.to(self.device)
-         else:
-             raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
-         if isinstance(boxes, torch.Tensor):
-             boxes = boxes.to(self.device)
-         else:
-             raise ValueError(f"boxes must be list but is {type(boxes)}")
-
-         result = predict_sequence_classes(
-             input_ids,
-             attention_mask,
-             token_type_ids,
-             boxes,
-             self.model,
-         )
-
-         result.class_id += 1
-         result.class_name = self.categories[str(result.class_id)]
-         return result
+         self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)

      @classmethod
      def get_requirements(cls) -> List[Requirement]:
          return [get_pytorch_requirement(), get_transformers_requirement()]

-     def clone(self) -> "HFLayoutLmSequenceClassifierBase":
+     def clone(self) -> HFLayoutLmSequenceClassifierBase:
          return self.__class__(self.path_config, self.path_weights, self.categories, self.device)

      def _validate_encodings(
-         self, **encodings: Union[List[List[str]], "torch.Tensor"]
-     ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+         self, **encodings: Union[List[List[str]], torch.Tensor]
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
          input_ids = encodings.get("input_ids")
          attention_mask = encodings.get("attention_mask")
          token_type_ids = encodings.get("token_type_ids")
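
The tensor checks that previously lived in `predict` are now concentrated in `_validate_encodings`, so `predict` shrinks to validate, forward, post-process. The per-kwarg idiom, shown for a single tensor (the 0.30 error messages read "must be list", although the check actually requires a tensor):

    input_ids = encodings.get("input_ids")
    if isinstance(input_ids, torch.Tensor):
        input_ids = input_ids.to(self.device)  # move onto the classifier's device
    else:
        raise ValueError(f"input_ids must be a torch.Tensor but is {type(input_ids)}")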
@@ -691,6 +772,29 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
              boxes = boxes.to(self.device)
          return input_ids, attention_mask, token_type_ids, boxes

+     @staticmethod
+     def get_name(path_weights: str, architecture: str) -> str:
+         """Returns the name of the model"""
+         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+     def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+         """A refinement for adding the tokenizer class name to the model configs.
+
+         :param use_xlm_tokenizer: Whether to use an XLM tokenizer.
+         """
+         tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+         return tokenizer.__class__.__name__
+
+     @staticmethod
+     def image_to_raw_features_mapping() -> str:
+         """Returns the mapping function to convert images into raw features."""
+         return "image_to_raw_layoutlm_features"
+
+     @staticmethod
+     def image_to_features_mapping() -> str:
+         """Returns the mapping function to convert images into features."""
+         return "image_to_layoutlm_features"
+

  class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
      """
@@ -728,15 +832,15 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
          path_config_json: str,
          path_weights: str,
          categories: Mapping[str, TypeOrStr],
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
-         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMForSequenceClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLM")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
          input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)

          result = predict_sequence_classes(
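
The `class_id += 1` after prediction bridges two conventions: transformers heads emit 0-based label ids, while deepdoctection category dicts are keyed by 1-based string indices. Worked through with the document classes from the example docstrings further below:

    categories = {"1": "handwritten", "2": "presentation", "3": "resume"}
    class_id = 0                            # argmax over the model's logits
    class_id += 1                           # shift to deepdoctection's 1-based ids
    class_name = categories[str(class_id)]  # -> "handwritten"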
@@ -751,6 +855,20 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
          result.class_name = self.categories[str(result.class_id)]
          return result

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMForSequenceClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+

  class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
      """
@@ -788,15 +906,15 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
          path_config_json: str,
          path_weights: str,
          categories: Mapping[str, TypeOrStr],
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
-         config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMv2ForSequenceClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLMv2")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
          input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
          images = encodings.get("image")
          if isinstance(images, torch.Tensor):
@@ -818,6 +936,20 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
          """
          return {"image_width": 224, "image_height": 224}

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMv2ForSequenceClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+

  class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
      """
@@ -855,15 +987,15 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
          path_config_json: str,
          path_weights: str,
          categories: Mapping[str, TypeOrStr],
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
-         config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMv3ForSequenceClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLMv3")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
          input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
          images = encodings.get("pixel_values")
          if isinstance(images, torch.Tensor):
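
Note the asymmetry between the two `predict` bodies: v2 reads the visual input from `encodings["image"]`, v3 from `encodings["pixel_values"]`, mirroring the respective transformers signatures. The IMAGENET constants exposed through `default_kwargs` are timm's standard normalization values; a sketch of how such constants are typically applied to a resized page image (illustrative, not deepdoctection's mapper code):

    import numpy as np

    pixel_mean = np.array((0.485, 0.456, 0.406), dtype=np.float32)  # IMAGENET_DEFAULT_MEAN
    pixel_std = np.array((0.229, 0.224, 0.225), dtype=np.float32)   # IMAGENET_DEFAULT_STD

    def normalize_sketch(image: np.ndarray) -> np.ndarray:
        # image: HxWx3 RGB, uint8
        return (image.astype(np.float32) / 255.0 - pixel_mean) / pixel_std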
@@ -890,3 +1022,190 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
              "pixel_mean": np.array(IMAGENET_DEFAULT_MEAN, dtype=np.float32),
              "pixel_std": np.array(IMAGENET_DEFAULT_STD, dtype=np.float32),
          }
+
+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMv3ForSequenceClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+
+
+ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
+     """
+     A wrapper class for `transformers.LiltForTokenClassification` to use within a pipeline component.
+     Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+     Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
+     classification and other things please use another model of the family.
+
+     **Example**
+
+         # setting up compulsory ocr service
+         tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+         tess = TesseractOcrDetector(tesseract_config_path)
+         ocr_service = TextExtractionService(tess)
+
+         # hf tokenizer and token classifier
+         tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+         lilt = HFLiltTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                      categories=['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                  'E-header', 'E-question', 'I-answer', 'I-header',
+                                                  'I-question', 'O', 'S-answer', 'S-header',
+                                                  'S-question'])
+
+         # token classification service
+         lilt_service = LMTokenClassifierService(tokenizer, lilt)
+
+         pipe = DoctectionPipe(pipeline_component_list=[ocr_service, lilt_service])
+
+         path = "path/to/some/form"
+         df = pipe.analyze(path=path)
+
+         for dp in df:
+             ...
+     """
+
+     def __init__(
+         self,
+         path_config_json: str,
+         path_weights: str,
+         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+         categories_bio: Optional[Sequence[TypeOrStr]] = None,
+         categories: Optional[Mapping[str, TypeOrStr]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
+     ):
+         """
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e.
+                                      the entities themselves. To be consistent with detectors use only values >0.
+                                      Conversion will be done internally.
+         :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
+                                consistent with detectors use only values>0. Conversion will be done internally.
+         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
+         :param device: The device (cpu,"cuda"), where to place the model.
+         """
+         self.name = self.get_name(path_weights, "LiLT")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(
+             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+         )
+
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
+         """
+         Launch inference on LiLT for token classification. Pass the following arguments
+
+         `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
+
+         `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+
+         `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
+
+         `boxes:` Torch tensor of bounding boxes of type 'xyxy'
+
+         `tokens:` List of original tokens taken from `LayoutLMTokenizer`
+
+         :return: A list of TokenClassResults
+         """
+
+         ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
+
+         results = predict_token_classes(
+             ann_ids, input_ids, attention_mask, token_type_ids, boxes, tokens, self.model, None
+         )
+
+         return self._map_category_names(results)
+
+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LiltForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+
+
+ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
+     """
+     A wrapper class for `transformers.LiltForSequenceClassification` to use within a pipeline component.
+     Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+     Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
+     classification and other things please use another model of the family.
+
+     **Example**
+
+         # setting up compulsory ocr service
+         tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+         tess = TesseractOcrDetector(tesseract_config_path)
+         ocr_service = TextExtractionService(tess)
+
+         # hf tokenizer and sequence classifier
+         tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+         lilt = HFLiltSequenceClassifier("path/to/config.json",
+                                         "path/to/model.bin",
+                                         categories=["handwritten", "presentation", "resume"])
+
+         # sequence classification service
+         lilt_service = LMSequenceClassifierService(tokenizer, lilt)
+
+         pipe = DoctectionPipe(pipeline_component_list=[ocr_service, lilt_service])
+
+         path = "path/to/some/form"
+         df = pipe.analyze(path=path)
+
+         for dp in df:
+             ...
+     """
+
+     def __init__(
+         self,
+         path_config_json: str,
+         path_weights: str,
+         categories: Mapping[str, TypeOrStr],
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
+     ):
+         self.name = self.get_name(path_weights, "LiLT")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
+
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
+         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
+
+         result = predict_sequence_classes(
+             input_ids,
+             attention_mask,
+             token_type_ids,
+             boxes,
+             self.model,
+         )
+
+         result.class_id += 1
+         result.class_name = self.categories[str(result.class_id)]
+         return result
+
+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LiltForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)