deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflayoutlm.py
@@ -18,66 +18,138 @@
 """
 HF Layoutlm model for diverse downstream tasks.
 """
+from __future__ import annotations

+import os
 from abc import ABC
 from collections import defaultdict
-from copy import copy
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Literal, Mapping, Optional, Sequence, Union

 import numpy as np
-
-from ..utils.detection_types import JsonDict, Requirement
-from ..utils.file_utils import (
-    get_pytorch_requirement,
-    get_transformers_requirement,
-    pytorch_available,
-    transformers_available,
-)
-from ..utils.settings import (
-    BioTag,
-    ObjectTypes,
-    TokenClasses,
-    TypeOrStr,
-    get_type,
-    token_class_tag_to_token_class_with_tag,
-    token_class_with_tag_to_token_class_and_tag,
+from lazy_imports import try_import
+from typing_extensions import TypeAlias
+
+from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
+from ..utils.settings import TypeOrStr
+from ..utils.types import JsonDict, PathLikeOrStr, Requirement
+from .base import (
+    LMSequenceClassifier,
+    LMTokenClassifier,
+    ModelCategories,
+    NerModelCategories,
+    SequenceClassResult,
+    TokenClassResult,
 )
-from .base import LMSequenceClassifier, LMTokenClassifier, SequenceClassResult, TokenClassResult
-from .pt.ptutils import set_torch_auto_device
+from .pt.ptutils import get_torch_device

-if pytorch_available():
+with try_import() as pt_import_guard:
     import torch
     import torch.nn.functional as F
-    from torch import Tensor  # pylint: disable=W0611

-if transformers_available():
+with try_import() as tr_import_guard:
     from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # type: ignore
     from transformers import (
         LayoutLMForSequenceClassification,
         LayoutLMForTokenClassification,
+        LayoutLMTokenizerFast,
         LayoutLMv2Config,
         LayoutLMv2ForSequenceClassification,
         LayoutLMv2ForTokenClassification,
         LayoutLMv3Config,
         LayoutLMv3ForSequenceClassification,
         LayoutLMv3ForTokenClassification,
+        LiltForSequenceClassification,
+        LiltForTokenClassification,
         PretrainedConfig,
+        RobertaTokenizerFast,
+        XLMRobertaTokenizerFast,
     )

+if TYPE_CHECKING:
+    LayoutTokenModels: TypeAlias = Union[
+        LayoutLMForTokenClassification,
+        LayoutLMv2ForTokenClassification,
+        LayoutLMv3ForTokenClassification,
+        LiltForTokenClassification,
+    ]
+
+    LayoutSequenceModels: TypeAlias = Union[
+        LayoutLMForSequenceClassification,
+        LayoutLMv2ForSequenceClassification,
+        LayoutLMv3ForSequenceClassification,
+        LiltForSequenceClassification,
+    ]
+
+    HfLayoutTokenModels: TypeAlias = Union[
+        LayoutLMForTokenClassification,
+        LayoutLMv2ForTokenClassification,
+        LayoutLMv3ForTokenClassification,
+        LiltForTokenClassification,
+    ]
+
+    HfLayoutSequenceModels: TypeAlias = Union[
+        LayoutLMForSequenceClassification,
+        LayoutLMv2ForSequenceClassification,
+        LayoutLMv3ForSequenceClassification,
+        LiltForSequenceClassification,
+    ]
+
+
+def get_tokenizer_from_model_class(model_class: str, use_xlm_tokenizer: bool) -> Any:
+    """
+    We do not use the tokenizer for a particular model that the transformer library provides. This mapping therefore
+    returns the tokenizer that should be used for a particular model.
+
+    :param model_class: The model as stated in the transformer library.
+    :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
+    :return: Tokenizer instance to use.
+    """
+    return {
+        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("LiltForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LiltForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("LiltForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LiltForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("XLMRobertaForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained(
+            "FacebookAI/xlm-roberta-base"
+        ),
+    }[(model_class, use_xlm_tokenizer)]
+

 def predict_token_classes(
-    uuids: List[List[str]],
-    input_ids: "Tensor",
-    attention_mask: "Tensor",
-    token_type_ids: "Tensor",
-    boxes: "Tensor",
-    tokens: List[List[str]],
-    model: Union[
-        "LayoutLMForTokenClassification", "LayoutLMv2ForTokenClassification", "LayoutLMv3ForTokenClassification"
-    ],
-    images: Optional["Tensor"] = None,
-) -> List[TokenClassResult]:
+    uuids: list[list[str]],
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    boxes: torch.Tensor,
+    tokens: list[list[str]],
+    model: LayoutTokenModels,
+    images: Optional[torch.Tensor] = None,
+) -> list[TokenClassResult]:
     """
     :param uuids: A list of uuids that correspond to a word that induces the resulting token
     :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
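
Note on the hunk above: the new `get_tokenizer_from_model_class` helper hard-wires the tokenizer choice per model class instead of trusting whatever tokenizer `transformers` would resolve from the checkpoint. A minimal sketch of how a caller might use it (the model-class strings come straight from the mapping above; the sketch itself is illustrative and not part of the diff):

    from deepdoctection.extern.hflayoutlm import get_tokenizer_from_model_class

    # LayoutXLM shares the LayoutLMv2 architecture, so the flag selects the XLM tokenizer
    xlm_tokenizer = get_tokenizer_from_model_class("LayoutLMv2ForTokenClassification", use_xlm_tokenizer=True)
    # a plain LayoutLMv3 checkpoint pairs with a RoBERTa tokenizer using prefix spaces
    v3_tokenizer = get_tokenizer_from_model_class("LayoutLMv3ForTokenClassification", use_xlm_tokenizer=False)

As written, the dict literal is rebuilt on every call, so each lookup instantiates several tokenizers via `from_pretrained`; callers may want to cache the result.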
@@ -129,26 +201,23 @@ def predict_token_classes(


 def predict_sequence_classes(
-    input_ids: "Tensor",
-    attention_mask: "Tensor",
-    token_type_ids: "Tensor",
-    boxes: "Tensor",
-    model: Union[
-        "LayoutLMForSequenceClassification",
-        "LayoutLMv2ForSequenceClassification",
-        "LayoutLMv3ForSequenceClassification",
-    ],
-    images: Optional["Tensor"] = None,
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    boxes: torch.Tensor,
+    model: LayoutSequenceModels,
+    images: Optional[torch.Tensor] = None,
 ) -> SequenceClassResult:
     """
     :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
     :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
     :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
     :param boxes: Torch tensor of bounding boxes of type 'xyxy'
-    :param model: layoutlm model for token classification
+    :param model: layoutlm model for sequence classification
     :param images: A list of torch image tensors or None
     :return: SequenceClassResult
     """
+
     if images is None:
         outputs = model(input_ids=input_ids, bbox=boxes, attention_mask=attention_mask, token_type_ids=token_type_ids)
     elif isinstance(model, LayoutLMv2ForSequenceClassification):
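
With the string annotations gone, the helper now expects real `torch.Tensor` batches. A hedged sketch of a direct call with dummy inputs (batch of one and sequence length 512 are assumptions; `model` stands for any loaded `LayoutSequenceModels` instance):

    import torch

    seq_len = 512
    result = predict_sequence_classes(
        input_ids=torch.randint(0, 30000, (1, seq_len)),
        attention_mask=torch.ones(1, seq_len, dtype=torch.long),
        token_type_ids=torch.zeros(1, seq_len, dtype=torch.long),
        boxes=torch.zeros(1, seq_len, 4, dtype=torch.long),  # 'xyxy' boxes
        model=model,   # e.g. HFLayoutLmSequenceClassifier.get_wrapped_model(...)
        images=None,   # LayoutLM v1 path: no image tensor required
    )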
@@ -177,16 +246,14 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
     Abstract base class for wrapping LayoutLM models for token classification into the deepdoctection framework.
     """

-    model: Union["LayoutLMForTokenClassification", "LayoutLMv2ForTokenClassification"]
-
     def __init__(
         self,
-        path_config_json: str,
-        path_weights: str,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
-        categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
     ):
         """
         :param path_config_json: path to .json config file
@@ -198,6 +265,8 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
                                consistent with detectors use only values>0. Conversion will be done internally.
        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
        :param device: The device (cpu,"cuda"), where to place the model.
+       :param use_xlm_tokenizer: True if one uses the LayoutXLM or a lilt model built with a xlm language model, e.g.
+                                 info-xlm or roberta-xlm. (LayoutXLM cannot be distinguished from LayoutLMv2).
        """

        if categories is None:
@@ -206,45 +275,21 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
         if categories_bio is None:
             raise ValueError("If categories is None then categories_bio cannot be None")

-        self.path_config = path_config_json
-        self.path_weights = path_weights
-        self.categories_semantics = (
-            [get_type(cat_sem) for cat_sem in categories_semantics] if categories_semantics is not None else []
+        self.path_config = Path(path_config_json)
+        self.path_weights = Path(path_weights)
+        self.categories = NerModelCategories(
+            init_categories=categories, categories_semantics=categories_semantics, categories_bio=categories_bio
         )
-        self.categories_bio = [get_type(cat_bio) for cat_bio in categories_bio] if categories_bio is not None else []
-        if categories:
-            self.categories = copy(categories)  # type: ignore
-        else:
-            self.categories = self._categories_orig_to_categories(
-                self.categories_semantics, self.categories_bio  # type: ignore
-            )
-        if device is not None:
-            self.device = device
-        else:
-            self.device = set_torch_auto_device()
-        self.model.to(self.device)
+        self.device = get_torch_device(device)

     @classmethod
-    def get_requirements(cls) -> List[Requirement]:
+    def get_requirements(cls) -> list[Requirement]:
         return [get_pytorch_requirement(), get_transformers_requirement()]

-    @staticmethod
-    def _categories_orig_to_categories(
-        categories_semantics: List[TokenClasses], categories_bio: List[BioTag]
-    ) -> Dict[str, ObjectTypes]:
-        categories_list = sorted(
-            {
-                token_class_tag_to_token_class_with_tag(token, tag)
-                for token in categories_semantics
-                for tag in categories_bio
-            }
-        )
-        return {str(k): v for k, v in enumerate(categories_list, 1)}
-
-    def _map_category_names(self, token_results: List[TokenClassResult]) -> List[TokenClassResult]:
+    def _map_category_names(self, token_results: list[TokenClassResult]) -> list[TokenClassResult]:
         for result in token_results:
-            result.class_name = self.categories[str(result.class_id + 1)]
-            output = token_class_with_tag_to_token_class_and_tag(result.class_name)
+            result.class_name = self.categories.categories[result.class_id + 1]
+            output = self.categories.disentangle_token_class_and_tag(result.class_name)
             if output is not None:
                 token_class, tag = output
                 result.semantic_name = token_class
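
The removed `_categories_orig_to_categories` is a good reference for what `NerModelCategories` (imported from `.base`) now has to cover: deriving the full label set from the product of semantic token classes and BIO tags. Roughly, and only as an illustration of the old behaviour re-expressed with the new `int` keys (the actual `NerModelCategories` internals are not shown in this diff):

    def categories_from_semantics_and_bio(categories_semantics, categories_bio):
        # product of token classes and tags, deduplicated and sorted, as in the removed helper
        categories_list = sorted(
            {
                token_class_tag_to_token_class_with_tag(token, tag)
                for token in categories_semantics
                for tag in categories_bio
            }
        )
        # 0.33 keys categories by int; 0.31 used str(k)
        return dict(enumerate(categories_list, 1))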
@@ -256,9 +301,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):

     def _validate_encodings(
         self, **encodings: Any
-    ) -> Tuple[
-        List[List[str]], List[str], "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", List[List[str]]
-    ]:
+    ) -> tuple[list[list[str]], list[str], torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, list[list[str]]]:
         image_ids = encodings.get("image_ids", [])
         ann_ids = encodings.get("ann_ids")
         input_ids = encodings.get("input_ids")
@@ -291,21 +334,41 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):

         return ann_ids, image_ids, input_ids, attention_mask, token_type_ids, boxes, tokens

-    def clone(self) -> "HFLayoutLmTokenClassifierBase":
+    def clone(self) -> HFLayoutLmTokenClassifierBase:
         return self.__class__(
             self.path_config,
             self.path_weights,
-            self.categories_semantics,
-            self.categories_bio,
-            self.categories,
+            self.categories.categories_semantics,
+            self.categories.categories_bio,
+            self.categories.get_categories(),
             self.device,
         )

     @staticmethod
-    def get_name(path_weights: str, architecture: str) -> str:
+    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
         """Returns the name of the model"""
         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])

+    @staticmethod
+    def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
+        """A refinement for adding the tokenizer class name to the model configs.
+
+        :param model_class_name: The model name, e.g. model.__class__.__name__
+        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+        """
+        tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_layoutlm_features"
+
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_layoutlm_features"
+

 class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
     """
@@ -343,12 +406,13 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):

     def __init__(
         self,
-        path_config_json: str,
-        path_weights: str,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
-        categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -360,13 +424,19 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
                                consistent with detectors use only values>0. Conversion will be done internally.
        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
        :param device: The device (cpu,"cuda"), where to place the model.
+       :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
+                                 Tokenizer.
        """
+        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, use_xlm_tokenizer
+        )

-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments

@@ -392,7 +462,9 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         return self._map_category_names(results)

     @staticmethod
-    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> LayoutLMForTokenClassification:
         """
         Get the inner (wrapped) model.

@@ -400,8 +472,13 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         :param path_weights: path to model artifact
         :return: 'nn.Module'
         """
-        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
-        return LayoutLMForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
+        return LayoutLMForTokenClassification.from_pretrained(
+            pretrained_model_name_or_path=os.fspath(path_weights), config=config
+        )
+
+    def clear_model(self) -> None:
+        self.model = None


 class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
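
Taken together, the constructor hunks above change the caller contract in three ways: `categories` is keyed by `int` instead of `str`, paths may be any `os.PathLike`, and `super().__init__` now runs first so the freshly built model can be moved to the resolved device. A hedged construction sketch (paths and labels are placeholders):

    layoutlm = HFLayoutLmTokenClassifier(
        path_config_json="path/to/config.json",   # PathLikeOrStr: a pathlib.Path works too
        path_weights="path/to/model.bin",
        categories={1: "B-answer", 2: "I-answer", 3: "O"},  # int keys in 0.33, str keys in 0.31
        device="cuda",                             # a torch.device instance is also accepted
    )
    # new in 0.33: release the wrapped model reference when it is no longer needed
    layoutlm.clear_model()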
@@ -442,12 +519,13 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):

     def __init__(
         self,
-        path_config_json: str,
-        path_weights: str,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
-        categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -459,13 +537,19 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
                                consistent with detectors use only values>0. Conversion will be done internally.
        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
        :param device: The device (cpu,"cuda"), where to place the model.
+       :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
+                                 default value.
        """
+        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, use_xlm_tokenizer
+        )

-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments

@@ -496,7 +580,7 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         return self._map_category_names(results)

     @staticmethod
-    def default_kwargs_for_input_mapping() -> JsonDict:
+    def default_kwargs_for_image_to_features_mapping() -> JsonDict:
         """
         Add some default arguments that might be necessary when preparing a sample. Overwrite this method
         for some custom setting.
@@ -504,7 +588,9 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         return {"image_width": 224, "image_height": 224}

     @staticmethod
-    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> LayoutLMv2ForTokenClassification:
         """
         Get the inner (wrapped) model.

@@ -512,11 +598,14 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param path_weights: path to model artifact
         :return: 'nn.Module'
         """
-        config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
         return LayoutLMv2ForTokenClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
+            pretrained_model_name_or_path=os.fspath(path_weights), config=config
         )

+    def clear_model(self) -> None:
+        self.model = None
+

 class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
     """
@@ -556,12 +645,13 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):

     def __init__(
         self,
-        path_config_json: str,
-        path_weights: str,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
-        categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -573,13 +663,19 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
                                consistent with detectors use only values>0. Conversion will be done internally.
        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
        :param device: The device (cpu,"cuda"), where to place the model.
+       :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLMv3 model with a different
+                                 tokenizer.
        """
+        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, use_xlm_tokenizer
+        )

-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments

@@ -606,7 +702,7 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
         return self._map_category_names(results)

     @staticmethod
-    def default_kwargs_for_input_mapping() -> JsonDict:
+    def default_kwargs_for_image_to_features_mapping() -> JsonDict:
         """
         Add some default arguments that might be necessary when preparing a sample. Overwrite this method
         for some custom setting.
@@ -620,7 +716,9 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
         }

     @staticmethod
-    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> LayoutLMv3ForTokenClassification:
         """
         Get the inner (wrapped) model.

@@ -628,81 +726,43 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param path_weights: path to model artifact
         :return: 'nn.Module'
         """
-        config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
         return LayoutLMv3ForTokenClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
+            pretrained_model_name_or_path=os.fspath(path_weights), config=config
         )

+    def clear_model(self) -> None:
+        self.model = None
+

 class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     """
     Abstract base class for wrapping LayoutLM models for sequence classification into the deepdoctection framework.
     """

-    model: Union["LayoutLMForSequenceClassification", "LayoutLMv2ForSequenceClassification"]
-
     def __init__(
         self,
-        path_config_json: str,
-        path_weights: str,
-        categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories: Mapping[int, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
     ):
-        self.path_config = path_config_json
-        self.path_weights = path_weights
-        self.categories = copy(categories)  # type: ignore
-
-        if device is not None:
-            self.device = device
-        else:
-            self.device = set_torch_auto_device()
-        self.model.to(self.device)
+        self.path_config = Path(path_config_json)
+        self.path_weights = Path(path_weights)
+        self.categories = ModelCategories(init_categories=categories)

-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
-        input_ids = encodings.get("input_ids")
-        attention_mask = encodings.get("attention_mask")
-        token_type_ids = encodings.get("token_type_ids")
-        boxes = encodings.get("bbox")
-
-        if isinstance(input_ids, torch.Tensor):
-            input_ids = input_ids.to(self.device)
-        else:
-            raise ValueError(f"input_ids must be list but is {type(input_ids)}")
-        if isinstance(attention_mask, torch.Tensor):
-            attention_mask = attention_mask.to(self.device)
-        else:
-            raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
-        if isinstance(token_type_ids, torch.Tensor):
-            token_type_ids = token_type_ids.to(self.device)
-        else:
-            raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
-        if isinstance(boxes, torch.Tensor):
-            boxes = boxes.to(self.device)
-        else:
-            raise ValueError(f"boxes must be list but is {type(boxes)}")
-
-        result = predict_sequence_classes(
-            input_ids,
-            attention_mask,
-            token_type_ids,
-            boxes,
-            self.model,
-        )
-
-        result.class_id += 1
-        result.class_name = self.categories[str(result.class_id)]
-        return result
+        self.device = get_torch_device(device)

     @classmethod
-    def get_requirements(cls) -> List[Requirement]:
+    def get_requirements(cls) -> list[Requirement]:
         return [get_pytorch_requirement(), get_transformers_requirement()]

-    def clone(self) -> "HFLayoutLmSequenceClassifierBase":
-        return self.__class__(self.path_config, self.path_weights, self.categories, self.device)
+    def clone(self) -> HFLayoutLmSequenceClassifierBase:
+        return self.__class__(self.path_config, self.path_weights, self.categories.get_categories(), self.device)

     def _validate_encodings(
-        self, **encodings: Union[List[List[str]], "torch.Tensor"]
-    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        self, **encodings: Union[list[list[str]], torch.Tensor]
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         input_ids = encodings.get("input_ids")
         attention_mask = encodings.get("attention_mask")
         token_type_ids = encodings.get("token_type_ids")
@@ -732,10 +792,30 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
         return input_ids, attention_mask, token_type_ids, boxes

     @staticmethod
-    def get_name(path_weights: str, architecture: str) -> str:
+    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
         """Returns the name of the model"""
         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])

+    @staticmethod
+    def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
+        """A refinement for adding the tokenizer class name to the model configs.
+
+        :param model_class_name: The model name, e.g. model.__class__.__name__
+        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+        """
+        tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_layoutlm_features"
+
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_layoutlm_features"
+

 class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
     """
@@ -770,20 +850,22 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):

     def __init__(
         self,
-        path_config_json: str,
-        path_weights: str,
-        categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories: Mapping[int, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
+        super().__init__(path_config_json, path_weights, categories, device)
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
-        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
-        self.model = LayoutLMForSequenceClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, use_xlm_tokenizer
         )
-        super().__init__(path_config_json, path_weights, categories, device)

-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)

         result = predict_sequence_classes(
@@ -795,11 +877,13 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         )

         result.class_id += 1
-        result.class_name = self.categories[str(result.class_id)]
+        result.class_name = self.categories.categories[result.class_id]
         return result

     @staticmethod
-    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> LayoutLMForSequenceClassification:
         """
         Get the inner (wrapped) model.

@@ -807,11 +891,14 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         :param path_weights: path to model artifact
         :return: 'nn.Module'
         """
-        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
         return LayoutLMForSequenceClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
+            pretrained_model_name_or_path=os.fspath(path_weights), config=config
         )

+    def clear_model(self) -> None:
+        self.model = None
+

 class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
     """
@@ -846,17 +933,22 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):

     def __init__(
         self,
-        path_config_json: str,
-        path_weights: str,
-        categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories: Mapping[int, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
+        super().__init__(path_config_json, path_weights, categories, device)
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories, device)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, use_xlm_tokenizer
+        )

-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
         images = encodings.get("image")
         if isinstance(images, torch.Tensor):
@@ -867,11 +959,11 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         result = predict_sequence_classes(input_ids, attention_mask, token_type_ids, boxes, self.model, images)

         result.class_id += 1
-        result.class_name = self.categories[str(result.class_id)]
+        result.class_name = self.categories.categories[result.class_id]
         return result

     @staticmethod
-    def default_kwargs_for_input_mapping() -> JsonDict:
+    def default_kwargs_for_image_to_features_mapping() -> JsonDict:
         """
         Add some default arguments that might be necessary when preparing a sample. Overwrite this method
         for some custom setting.
@@ -879,7 +971,9 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         return {"image_width": 224, "image_height": 224}

     @staticmethod
-    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> LayoutLMv2ForSequenceClassification:
         """
         Get the inner (wrapped) model.

@@ -887,11 +981,14 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         :param path_weights: path to model artifact
         :return: 'nn.Module'
         """
-        config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
         return LayoutLMv2ForSequenceClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
+            pretrained_model_name_or_path=os.fspath(path_weights), config=config
         )

+    def clear_model(self) -> None:
+        self.model = None
+

 class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
     """
@@ -926,17 +1023,22 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):

     def __init__(
         self,
-        path_config_json: str,
-        path_weights: str,
-        categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories: Mapping[int, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
+        super().__init__(path_config_json, path_weights, categories, device)
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories, device)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, use_xlm_tokenizer
+        )

-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
         images = encodings.get("pixel_values")
         if isinstance(images, torch.Tensor):
@@ -947,11 +1049,11 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         result = predict_sequence_classes(input_ids, attention_mask, token_type_ids, boxes, self.model, images)

         result.class_id += 1
-        result.class_name = self.categories[str(result.class_id)]
+        result.class_name = self.categories.categories[result.class_id]
         return result

     @staticmethod
-    def default_kwargs_for_input_mapping() -> JsonDict:
+    def default_kwargs_for_image_to_features_mapping() -> JsonDict:
         """
         Add some default arguments that might be necessary when preparing a sample. Overwrite this method
         for some custom setting.
@@ -965,7 +1067,9 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         }

     @staticmethod
-    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> LayoutLMv3ForSequenceClassification:
         """
         Get the inner (wrapped) model.

@@ -973,7 +1077,196 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         :param path_weights: path to model artifact
         :return: 'nn.Module'
         """
-        config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
         return LayoutLMv3ForSequenceClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
+            pretrained_model_name_or_path=os.fspath(path_weights), config=config
+        )
+
+    def clear_model(self) -> None:
+        self.model = None
+
+
+class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
+    """
+    A wrapper class for `transformers.LiltForTokenClassification` to use within a pipeline component.
+    Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+    Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
+    classification and other things please use another model of the family.
+
+    **Example**
+
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+
+        # hf tokenizer and token classifier
+        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+        lilt = HFLiltTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                     categories=['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                 'E-header', 'E-question', 'I-answer', 'I-header',
+                                                 'I-question', 'O', 'S-answer', 'S-header',
+                                                 'S-question'])
+
+        # token classification service
+        lilt_service = LMTokenClassifierService(tokenizer, lilt)
+
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, lilt_service])
+
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+
+        for dp in df:
+            ...
+    """
+
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+        categories_bio: Optional[Sequence[TypeOrStr]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        """
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
+                                     entities self. To be consistent with detectors use only values >0. Conversion will
+                                     be done internally.
+        :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
+                               consistent with detectors use only values>0. Conversion will be done internally.
+        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
+        :param device: The device (cpu,"cuda"), where to place the model.
+        """
+
+        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        self.name = self.get_name(path_weights, "LiLT")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, use_xlm_tokenizer
         )
+
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
+        """
+        Launch inference on LayoutLm for token classification. Pass the following arguments
+
+        `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
+
+        `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+
+        `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
+
+        `boxes:` Torch tensor of bounding boxes of type 'xyxy'
+
+        `tokens:` List of original tokens taken from `LayoutLMTokenizer`
+
+        :return: A list of TokenClassResults
+        """
+
+        ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
+
+        results = predict_token_classes(
+            ann_ids, input_ids, attention_mask, token_type_ids, boxes, tokens, self.model, None
+        )
+
+        return self._map_category_names(results)
+
+    @staticmethod
+    def get_wrapped_model(path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr) -> LiltForTokenClassification:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LiltForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+
+    def clear_model(self) -> None:
+        self.model = None
+
+
+class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
+    """
+    A wrapper class for `transformers.LiLTForSequenceClassification` to use within a pipeline component.
+    Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+    Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
+    classification and other things please use another model of the family.
+
+    **Example**
+
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+
+        # hf tokenizer and sequence classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        lilt = HFLiltSequenceClassifier("path/to/config.json",
+                                        "path/to/model.bin",
+                                        categories=["handwritten", "presentation", "resume"])
+
+        # sequence classification service
+        lilt_service = LMSequenceClassifierService(tokenizer, lilt)
+
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, lilt_service])
+
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+
+        for dp in df:
+            ...
+    """
+
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories: Mapping[int, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        super().__init__(path_config_json, path_weights, categories, device)
+        self.name = self.get_name(path_weights, "LiLT")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, use_xlm_tokenizer
+        )
+
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
+        input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
+
+        result = predict_sequence_classes(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            boxes,
+            self.model,
+        )
+
+        result.class_id += 1
+        result.class_name = self.categories.categories[result.class_id]
+        return result
+
+    @staticmethod
+    def get_wrapped_model(path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LiltForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+
+    def clear_model(self) -> None:
+        self.model = None
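
The new LiLT wrappers slot into the same pipeline services as the LayoutLM family. A hedged sketch combining them with the tokenizer helper added at the top of this file (paths and labels are placeholders; an English LiLT checkpoint pairs with the RoBERTa tokenizer, an XLM-based one with XLM-RoBERTa):

    tokenizer = get_tokenizer_from_model_class("LiltForTokenClassification", use_xlm_tokenizer=False)
    lilt = HFLiltTokenClassifier(
        "path/to/config.json",
        "path/to/model.bin",
        categories={1: "B-answer", 2: "I-answer", 3: "O"},
    )
    lilt_service = LMTokenClassifierService(tokenizer, lilt)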