deepdoctection 0.31__py3-none-any.whl → 0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +35 -28
- deepdoctection/analyzer/dd.py +30 -24
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/datapoint/annotation.py +2 -1
- deepdoctection/datapoint/box.py +2 -1
- deepdoctection/datapoint/image.py +13 -7
- deepdoctection/datapoint/view.py +95 -24
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +5 -2
- deepdoctection/datasets/base.py +5 -3
- deepdoctection/datasets/info.py +2 -2
- deepdoctection/datasets/instances/doclaynet.py +3 -2
- deepdoctection/datasets/instances/fintabnet.py +2 -1
- deepdoctection/datasets/instances/funsd.py +2 -1
- deepdoctection/datasets/instances/iiitar13k.py +5 -2
- deepdoctection/datasets/instances/layouttest.py +2 -1
- deepdoctection/datasets/instances/publaynet.py +2 -2
- deepdoctection/datasets/instances/pubtables1m.py +6 -3
- deepdoctection/datasets/instances/pubtabnet.py +2 -1
- deepdoctection/datasets/instances/rvlcdip.py +2 -1
- deepdoctection/datasets/instances/xfund.py +2 -1
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/cocometric.py +2 -1
- deepdoctection/eval/eval.py +17 -13
- deepdoctection/eval/tedsmetric.py +14 -11
- deepdoctection/eval/tp_eval_callback.py +9 -3
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/d2detect.py +24 -32
- deepdoctection/extern/deskew.py +4 -2
- deepdoctection/extern/doctrocr.py +75 -81
- deepdoctection/extern/fastlang.py +4 -2
- deepdoctection/extern/hfdetr.py +22 -28
- deepdoctection/extern/hflayoutlm.py +335 -103
- deepdoctection/extern/hflm.py +225 -0
- deepdoctection/extern/model.py +56 -47
- deepdoctection/extern/pdftext.py +8 -4
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +27 -19
- deepdoctection/extern/texocr.py +4 -2
- deepdoctection/extern/tp/tfutils.py +43 -9
- deepdoctection/extern/tp/tpcompat.py +10 -7
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +5 -8
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/d2struct.py +8 -6
- deepdoctection/mapper/hfstruct.py +6 -1
- deepdoctection/mapper/laylmstruct.py +163 -20
- deepdoctection/mapper/maputils.py +3 -1
- deepdoctection/mapper/misc.py +6 -3
- deepdoctection/mapper/tpstruct.py +2 -2
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/common.py +11 -9
- deepdoctection/pipe/concurrency.py +2 -1
- deepdoctection/pipe/layout.py +3 -1
- deepdoctection/pipe/lm.py +32 -64
- deepdoctection/pipe/order.py +142 -35
- deepdoctection/pipe/refine.py +8 -14
- deepdoctection/pipe/{cell.py → sub_layout.py} +1 -1
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +21 -16
- deepdoctection/train/hf_detr_train.py +18 -11
- deepdoctection/train/hf_layoutlm_train.py +118 -101
- deepdoctection/train/tp_frcnn_train.py +21 -19
- deepdoctection/utils/env_info.py +41 -117
- deepdoctection/utils/logger.py +1 -0
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/settings.py +1 -0
- deepdoctection/utils/viz.py +4 -3
- {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/METADATA +27 -18
- deepdoctection-0.32.dist-info/RECORD +146 -0
- deepdoctection-0.31.dist-info/RECORD +0 -144
- {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/WHEEL +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/mapper/laylmstruct.py
CHANGED

@@ -20,32 +20,30 @@ Module for mapping annotations from image to layout lm input structure. Heavily
 <https://github.com/NielsRogge/Transformers-Tutorials>
 """
 
+from __future__ import annotations
+
 import random
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Literal, NewType, Optional, Sequence, Union
 
 import numpy as np
 import numpy.typing as npt
+from lazy_imports import try_import
 
 from ..datapoint.annotation import ContainerAnnotation
 from ..datapoint.convert import box_to_point4, point4_to_box
 from ..datapoint.image import Image
+from ..datapoint.view import Page
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import pytorch_available, transformers_available
 from ..utils.settings import DatasetType, LayoutType, PageType, Relationships, WordType
 from ..utils.transform import ResizeTransform, normalize_image
 from .maputils import curry
 
-if pytorch_available():
+with try_import() as import_guard:
     import torch
 
-if transformers_available():
-    from transformers import (
-        BatchEncoding,
-        PreTrainedTokenizerFast,
-        RobertaTokenizerFast,
-        XLMRobertaTokenizerFast,
-    )
+with try_import() as tr_import_guard:
+    from transformers import BatchEncoding, PreTrainedTokenizerFast  # pylint: disable=W0611
 
 __all__ = [
     "image_to_raw_layoutlm_features",
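Editor's note: the recurring change in this release swaps `*_available()` guards for the lazy-imports package. A minimal sketch of the `try_import` pattern, assuming the documented lazy-imports API in which the guard object defers a failed import until `check()` is called (the dependency name is hypothetical):

    from lazy_imports import try_import

    with try_import() as import_guard:
        import some_optional_dependency  # hypothetical package; an ImportError is captured, not raised

    def run_with_dependency() -> None:
        import_guard.check()  # re-raises the captured ImportError only when the feature is used
        some_optional_dependency.run()
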
@@ -54,12 +52,17 @@ __all__ = [
     "image_to_layoutlm_features",
     "DataCollator",
     "LayoutLMFeatures",
+    "image_to_raw_lm_features",
+    "image_to_lm_features",
 ]
 
 RawLayoutLMFeatures = NewType("RawLayoutLMFeatures", JsonDict)
+RawLMFeatures = NewType("RawLMFeatures", JsonDict)
 LayoutLMFeatures = NewType("LayoutLMFeatures", JsonDict)
+LMFeatures = NewType("LMFeatures", JsonDict)
 InputDataClass = NewType("InputDataClass", JsonDict)
 
+
 """
 <https://github.com/huggingface/transformers/src/transformers/data/data_collator.py>
 A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
@@ -208,7 +211,7 @@ def image_to_raw_layoutlm_features(
     return raw_features
 
 
-def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
+def layoutlm_features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """
     Converting list of floats to pytorch tensors
     :param features: LayoutLMFeatures
@@ -216,7 +219,8 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """
 
     _image_key = "pixel_values" if "pixel_values" in features else "image"
-    features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
+    if "bbox" in features:
+        features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
     if "labels" in features:
         features["labels"] = torch.tensor(features["labels"], dtype=torch.long)
     if _image_key in features:
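The new guard around `bbox` matters because features built for plain text models (see `image_to_raw_lm_features` below) may have their boxes stripped before conversion. A minimal sketch with an illustrative feature dict:

    import torch

    features = {"input_ids": [[101, 2054, 102]], "labels": [0]}  # no "bbox" key present
    if "bbox" in features:  # skipped here; a LayoutLM feature dict would enter this branch
        features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
    if "labels" in features:
        features["labels"] = torch.tensor(features["labels"], dtype=torch.long)
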
@@ -230,12 +234,12 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
 
 
 def _tokenize_with_sliding_window(
-    raw_features: List[RawLayoutLMFeatures],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: List[Union[RawLayoutLMFeatures, RawLMFeatures]],
+    tokenizer: PreTrainedTokenizerFast,
     sliding_window_stride: int,
     max_batch_size: int,
     return_tensors: Optional[Literal["pt"]] = None,
-) -> Union[JsonDict, "BatchEncoding"]:
+) -> Union[JsonDict, BatchEncoding]:
     """
     Runs a tokenizer: If there are no overflowing tokens, the tokenizer output will be returned as it is.
     If there are overflowing tokens, sliding windows have to be built. As it is easier to prepare the sliding windows
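A hedged sketch of the sliding-window idea behind `_tokenize_with_sliding_window`: when a tokenized sequence exceeds `max_length`, windows of `max_length` tokens are built with start positions shifted by the stride. Function and variable names here are illustrative, not the package's internals:

    from typing import List

    def build_windows(token_ids: List[int], max_length: int, stride: int) -> List[List[int]]:
        # a single window suffices if the sequence fits or no stride is requested
        if len(token_ids) <= max_length or stride <= 0:
            return [token_ids[:max_length]]
        windows, start = [], 0
        while start < len(token_ids):
            windows.append(token_ids[start : start + max_length])
            if start + max_length >= len(token_ids):
                break
            start += stride
        return windows

    assert build_windows(list(range(10)), max_length=4, stride=2) == [
        [0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9],
    ]
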
@@ -398,8 +402,8 @@ def _tokenize_with_sliding_window(
 
 
 def raw_features_to_layoutlm_features(
-    raw_features: Union[RawLayoutLMFeatures, List[RawLayoutLMFeatures]],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: Union[RawLayoutLMFeatures, RawLMFeatures, List[Union[RawLayoutLMFeatures, RawLMFeatures]]],
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
@@ -407,6 +411,7 @@ def raw_features_to_layoutlm_features(
     remove_columns_for_training: bool = False,
     sliding_window_stride: int = 0,
     max_batch_size: int = 0,
+    remove_bounding_boxes: bool = False,
 ) -> LayoutLMFeatures:
     """
     Mapping raw features to tokenized input sequences for LayoutLM models.
@@ -563,8 +568,11 @@ def raw_features_to_layoutlm_features(
         input_dict.pop("ann_ids")
         input_dict.pop("tokens")
 
+    if remove_bounding_boxes:
+        input_dict.pop("bbox")
+
     if return_tensors == "pt":
-        return features_to_pt_tensors(LayoutLMFeatures(input_dict))
+        return layoutlm_features_to_pt_tensors(LayoutLMFeatures(input_dict))
     return LayoutLMFeatures(input_dict)
 
 
@@ -595,13 +603,14 @@ class LayoutLMDataCollator:
                                   with windows shifted `sliding_window_stride` to the right.
     """
 
-    tokenizer: "PreTrainedTokenizerFast"
+    tokenizer: PreTrainedTokenizerFast
     padding: Literal["max_length", "do_not_pad", "longest"] = field(default="max_length")
     truncation: bool = field(default=True)
     return_overflowing_tokens: bool = field(default=False)
     return_tensors: Optional[Literal["pt"]] = field(default=None)
     sliding_window_stride: int = field(default=0)
     max_batch_size: int = field(default=0)
+    remove_bounding_box_features: bool = field(default=False)
 
     def __post_init__(self) -> None:
         assert isinstance(self.tokenizer, PreTrainedTokenizerFast), "Tokenizer must be a fast tokenizer"
@@ -620,7 +629,7 @@ class LayoutLMDataCollator:
                  token_type_ids, attention_masks, boxes, labels`.
         """
         return raw_features_to_layoutlm_features(
-            raw_features,
+            raw_features,  # type: ignore
             self.tokenizer,
             self.padding,
             self.truncation,
@@ -629,13 +638,14 @@ class LayoutLMDataCollator:
             True,
             self.sliding_window_stride,
             self.max_batch_size,
+            self.remove_bounding_box_features,
         )
 
 
 @curry
 def image_to_layoutlm_features(
     dp: Image,
-    tokenizer: "PreTrainedTokenizerFast",
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
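A hedged usage sketch of the extended collator: the new `remove_bounding_box_features` field drops the `bbox` key so the same pipeline can feed a pure text model. The checkpoint name is illustrative, and the collator is assumed to be callable on raw feature dicts as in the diff above:

    from transformers import AutoTokenizer

    from deepdoctection.mapper.laylmstruct import LayoutLMDataCollator

    tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased", use_fast=True)
    collator = LayoutLMDataCollator(
        tokenizer,
        return_tensors="pt",
        remove_bounding_box_features=True,  # new in 0.32
    )
    # batch = collator(raw_features)  # raw_features, e.g. from image_to_raw_lm_features
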
@@ -724,3 +734,136 @@ def image_to_layoutlm_features(
         sliding_window_stride=sliding_window_stride,
     )
     return features
+
+
+@curry
+def image_to_raw_lm_features(
+    dp: Image,
+    dataset_type: Optional[Literal["sequence_classification", "token_classification"]] = None,
+    use_token_tag: bool = True,
+    text_container: Optional[LayoutType] = LayoutType.word,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[RawLMFeatures]:
+    """
+    Mapping a datapoint into an intermediate format for bert-like models. Features will be provided in a dict and
+    this mapping can be used for sequence or token classification as well as for inference. To generate input features
+    for the model please use `raw_features_to_layoutlm_features`.
+
+
+    :param dp: Image
+    :param dataset_type: Either SEQUENCE_CLASSIFICATION or TOKEN_CLASSIFICATION. When using a built-in dataset use
+    :param use_token_tag: Will only be used for dataset_type="token_classification". If use_token_tag=True, will use
+                          labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
+                          `WordType.token_class`.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: dictionary with the following arguments:
+             'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+    """
+
+    raw_features: RawLMFeatures = RawLMFeatures({})
+
+    page = Page.from_image(dp, text_container, floating_text_block_categories, include_residual_text_container)
+
+    text_ = page.text_
+
+    # pylint: disable=E1137 #3162
+    raw_features["image_id"] = page.image_id
+    raw_features["width"] = page.width
+    raw_features["height"] = page.height
+    raw_features["ann_ids"] = text_["ann_ids"]
+    raw_features["words"] = text_["words"]
+    # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
+    # raw_features_to_layoutlm_features
+    raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+    raw_features["dataset_type"] = dataset_type
+
+    if use_token_tag and text_["token_tags"]:
+        raw_features["labels"] = text_["token_tags"]
+    elif text_["token_classes"]:
+        raw_features["labels"] = text_["token_classes"]
+    elif page.document_type is not None:
+        document_type_id = (
+            int(page.image_orig.summary.get_sub_category(PageType.document_type).category_id) - 1  # type: ignore
+        )
+        raw_features["labels"] = [document_type_id]
+
+    raw_features["dataset_type"] = dataset_type
+    # pylint: enable=E1137
+    return raw_features
+
+
+@curry
+def image_to_lm_features(
+    dp: Image,
+    tokenizer: PreTrainedTokenizerFast,
+    padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
+    truncation: bool = True,
+    return_overflowing_tokens: bool = False,
+    return_tensors: Optional[Literal["pt"]] = "pt",
+    sliding_window_stride: int = 0,
+    text_container: Optional[LayoutType] = LayoutType.word,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[LayoutLMFeatures]:
+    """
+    Mapping function to generate layoutlm features from `Image` to be used for inference in a pipeline component.
+    `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
+    with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
+    used internally in `LMTokenClassifierService`.
+
+        tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                             categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+
+    :param dp: Image datapoint
+    :param tokenizer: Tokenizer compatible with the language model
+    :param padding: A padding strategy to be passed to the tokenizer. Must be either `max_length, longest` or
+                    `do_not_pad`.
+    :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
+                       maximum acceptable input length for the model if that argument is not provided. This will
+                       truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                       sequences (or a batch of pairs) is provided.
+                       If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
+                       model maximum admissible input size).
+    :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                                      can be returned as an additional batch element. Note that in this case, the
+                                      number of input batch samples will be smaller than the output batch samples.
+    :param return_tensors: Output tensor features. Either 'pt' for PyTorch models or None, if features should be
+                           returned in list objects.
+    :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length a sliding
+                                  windows will be created with each window having max_length sequence input. When using
+                                  `sliding_window_stride=0` no strides will be created, otherwise it will create slides
+                                  with windows shifted `sliding_window_stride` to the right.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: A dict of lm features
+    """
+    raw_features = image_to_raw_lm_features(  # pylint: disable=E1102
+        dataset_type=None,
+        use_token_tag=True,
+        text_container=text_container,
+        floating_text_block_categories=floating_text_block_categories,
+        include_residual_text_container=include_residual_text_container,
+    )(dp)
+    if raw_features is None:
+        return None
+    features = raw_features_to_layoutlm_features(
+        raw_features,
+        tokenizer,
+        padding,
+        truncation,
+        return_overflowing_tokens,
+        return_tensors=return_tensors,
+        sliding_window_stride=sliding_window_stride,
+    )
+    return features
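A hedged usage sketch of the new mapper, mirroring the curried call pattern visible in the body above (`image_to_raw_lm_features(...)(dp)`); the checkpoint name is illustrative:

    from transformers import LayoutLMTokenizerFast

    from deepdoctection.mapper.laylmstruct import image_to_lm_features

    tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
    features_func = image_to_lm_features(tokenizer=tokenizer, return_tensors="pt")  # curried: returns a callable
    # features = features_func(dp)  # dp: Image; may be None if the page yields no words
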
deepdoctection/mapper/maputils.py
CHANGED

@@ -18,6 +18,8 @@
 """
 Utility functions related to mapping tasks
 """
+from __future__ import annotations
+
 import functools
 import itertools
 import traceback
@@ -55,7 +57,7 @@ class MappingContextManager:
         self.context_error = True
         self.kwargs = kwargs
 
-    def __enter__(self) -> "MappingContextManager":
+    def __enter__(self) -> MappingContextManager:
        """
        context enter
        """
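The `from __future__ import annotations` additions throughout this release enable exactly this kind of change: with postponed evaluation, a method can annotate its return type with its own, not yet fully defined class without quoting it. A minimal illustration:

    from __future__ import annotations

    class Resource:
        def __enter__(self) -> Resource:  # without the future import (or quotes), this name
            return self                   # lookup would fail while the class body executes

        def __exit__(self, *exc) -> None:
            ...
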
deepdoctection/mapper/misc.py
CHANGED
@@ -19,19 +19,22 @@
 Module for small mapping functions
 """
 
+from __future__ import annotations
+
 import ast
 import os
 from typing import List, Mapping, Optional, Sequence, Union
 
+from lazy_imports import try_import
+
 from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
 from ..datapoint.image import Image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import lxml_available
 from ..utils.fs import get_load_image_func, load_image_from_file
 from ..utils.utils import is_file_extension
 from .maputils import MappingContextManager, curry
 
-if lxml_available():
+with try_import() as import_guard:
     from lxml import etree  # pylint: disable=W0611
 
 
@@ -175,7 +178,7 @@ def maybe_ann_to_sub_image(
 
 
 @curry
-def xml_to_dict(dp: JsonDict, xslt_obj: "etree.XSLT") -> JsonDict:
+def xml_to_dict(dp: JsonDict, xslt_obj: etree.XSLT) -> JsonDict:
     """
     Convert a xml object into a dict using a xsl style sheet.
 
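For reference, a hedged sketch of the `etree.XSLT` object that `xml_to_dict` now expects in its signature: an XSLT stylesheet compiled once with lxml and applied to parsed documents. The stylesheet here is inlined and purely illustrative:

    from io import BytesIO
    from lxml import etree

    # a tiny stylesheet, inlined so the sketch is self-contained
    xslt_root = etree.XML(
        b"""<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
              <xsl:template match="/"><out><xsl:value-of select="//word"/></out></xsl:template>
            </xsl:stylesheet>"""
    )
    stylesheet = etree.XSLT(xslt_root)
    doc = etree.parse(BytesIO(b"<page><word>hello</word></page>"))
    result = stylesheet(doc)  # apply the transform; str(result) serializes "<out>hello</out>"
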
deepdoctection/mapper/tpstruct.py
CHANGED

@@ -22,15 +22,15 @@ import os.path
 from typing import Optional, Sequence, Union
 
 import numpy as np
+from lazy_imports import try_import
 
 from ..datapoint.annotation import ImageAnnotation
 from ..datapoint.image import Image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import tf_available
 from ..utils.settings import ObjectTypes
 from .maputils import curry
 
-if tf_available():
+with try_import() as import_guard:
     from tensorflow import convert_to_tensor, uint8  # type: ignore # pylint: disable=E0401
     from tensorflow.image import non_max_suppression  # type: ignore # pylint: disable=E0401
 
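A hedged sketch of the TensorFlow call these lazily imported names feed into (presumably backing `tf_nms_image_annotations`): `non_max_suppression` takes boxes as `[y1, x1, y2, x2]` rows plus per-box scores and returns the indices of the boxes to keep:

    from tensorflow import convert_to_tensor
    from tensorflow.image import non_max_suppression

    boxes = convert_to_tensor([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 0.9, 0.9]])
    scores = convert_to_tensor([0.9, 0.8])
    # the second box overlaps the first with IoU 0.81 > 0.5, so only index 0 is kept
    keep = non_max_suppression(boxes, scores, max_output_size=2, iou_threshold=0.5)
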
deepdoctection/pipe/__init__.py
CHANGED
@@ -22,7 +22,6 @@ Contains pipeline components that can be plugged into each other and predictors
 
 from .anngen import *
 from .base import *
-from .cell import *
 from .common import *
 from .concurrency import *
 from .doctectionpipe import *
@@ -33,5 +32,6 @@ from .order import *
 from .refine import *
 from .registry import *
 from .segment import *
+from .sub_layout import *
 from .text import *
 from .transform import *
deepdoctection/pipe/common.py
CHANGED
@@ -18,6 +18,10 @@
 """
 Module for common pipeline components
 """
+from __future__ import annotations
+
+import os
+
 from copy import copy, deepcopy
 from typing import List, Literal, Mapping, Optional, Sequence, Union
 
@@ -30,16 +34,14 @@ from ..mapper.maputils import MappingContextManager
 from ..mapper.match import match_anns_by_intersection
 from ..mapper.misc import to_image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import detectron2_available, pytorch_available, tf_available
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
 from .base import PipelineComponent
 from .registry import pipeline_component_registry
 
-if tf_available():
-    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
-
-elif pytorch_available() and detectron2_available():
+if os.environ.get("DD_USE_TORCH"):
     from ..mapper.d2struct import pt_nms_image_annotations as nms_image_annotations
+elif os.environ.get("DD_USE_TF"):
+    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
 
 
 @pipeline_component_registry.register("ImageCroppingService")
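Framework selection is now explicit: instead of probing which packages are importable, the module reads the `DD_USE_TORCH` / `DD_USE_TF` environment variables. A hedged sketch of the intended usage (set the flag before the first deepdoctection import):

    import os

    os.environ["DD_USE_TORCH"] = "1"  # or: os.environ["DD_USE_TF"] = "1"

    import deepdoctection as dd  # common.py then binds pt_nms_image_annotations as nms_image_annotations
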
@@ -64,7 +66,7 @@ class ImageCroppingService(PipelineComponent):
         for ann in dp.get_annotation(category_names=self.category_names):
             dp.image_ann_to_image(ann.annotation_id, crop_image=True)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(self.category_names)
 
     def get_meta_annotation(self) -> JsonDict:
@@ -225,7 +227,7 @@ class PageParsingService:
         """
         return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
 
-    def clone(self) -> "PageParsingService":
+    def clone(self) -> PageParsingService:
         """clone"""
         return self.__class__(
             deepcopy(self.text_container),
@@ -292,7 +294,7 @@ class AnnotationNmsService(PipelineComponent):
             if ann.annotation_id not in ann_ids_to_keep:
                 self.dp_manager.deactivate_annotation(ann.annotation_id)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(deepcopy(self.nms_pairs), self.threshold)
 
     def get_meta_annotation(self) -> JsonDict:
@@ -326,7 +328,7 @@ class ImageParsingService:
         """
         return MapData(df, self.pass_datapoint)
 
-    def clone(self) -> "ImageParsingService":
+    def clone(self) -> ImageParsingService:
         """clone"""
         return self.__class__(self.dpi)
 

deepdoctection/pipe/concurrency.py
CHANGED

@@ -18,6 +18,7 @@
 """
 Module for multithreading tasks
 """
+from __future__ import annotations
 
 import itertools
 import queue
@@ -221,7 +222,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
     def serve(self, dp: Image) -> None:
         raise NotImplementedError("MultiThreadPipelineComponent does not follow the PipelineComponent implementation")
 
-    def clone(self) -> "MultiThreadPipelineComponent":
+    def clone(self) -> MultiThreadPipelineComponent:
         raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")
 
     def get_meta_annotation(self) -> JsonDict:
deepdoctection/pipe/layout.py
CHANGED
@@ -18,6 +18,8 @@
 """
 Module for layout pipeline component
 """
+from __future__ import annotations
+
 from typing import Optional
 
 import numpy as np
@@ -109,7 +111,7 @@ class ImageLayoutService(PredictorPipelineComponent):
     def _get_name(predictor_name: str) -> str:
         return f"image_{predictor_name}"
 
-    def clone(self) -> "PredictorPipelineComponent":
+    def clone(self) -> PredictorPipelineComponent:
         predictor = self.predictor.clone()
         padder_clone = None
         if self.padder:
deepdoctection/pipe/lm.py
CHANGED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# File:
+# File: lm.py
 
 # Copyright 2021 Dr. Janis Meyer. All rights reserved.
 #
@@ -18,57 +18,19 @@
 """
 Module for token classification pipeline
 """
+from __future__ import annotations
 
 from copy import copy
-from typing import Any, List, Literal, Optional, Sequence, Union
+from typing import Any, Callable, List, Literal, Optional, Sequence, Union
 
 from ..datapoint.image import Image
 from ..extern.hflayoutlm import HFLayoutLmSequenceClassifierBase, HFLayoutLmTokenClassifierBase
-from ..mapper.laylmstruct import image_to_layoutlm_features
+from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import transformers_available
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
 from .base import LanguageModelPipelineComponent
 from .registry import pipeline_component_registry
 
-if transformers_available():
-    from transformers import LayoutLMTokenizerFast, RobertaTokenizerFast, XLMRobertaTokenizerFast
-
-    _ARCHITECTURES_TO_TOKENIZER = {
-        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-    }
-
-
-def get_tokenizer_from_architecture(architecture_name: str, use_xlm_tokenizer: bool) -> Any:
-    """
-    We do not use the tokenizer for a particular model that the transformer library provides. Thie mapping therefore
-    returns the tokenizer that should be used for a particular model.
-
-    :param architecture_name: The model as stated in the transformer library.
-    :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-    :return: Tokenizer instance to use.
-    """
-    return _ARCHITECTURES_TO_TOKENIZER[(architecture_name, use_xlm_tokenizer)]
-
 
 @pipeline_component_registry.register("LMTokenClassifierService")
 class LMTokenClassifierService(LanguageModelPipelineComponent):
@@ -154,7 +116,8 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         else:
             self.default_key = TokenClasses.other
             self.other_name_as_key = {self.default_key: categories_name_as_key[self.default_key]}
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -218,7 +181,9 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
                     word.annotation_id,
                 )
 
-    def clone(self) -> "LMTokenClassifierService":
+    def clone(self) -> LMTokenClassifierService:
+        # ToDo: replace copying of tokenizer with a proper clone method. Otherwise we cannot run the evaluation with
+        # multiple threads
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -244,19 +209,20 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         return f"lm_token_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        use_xlm_tokenizer = False
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
             raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
 
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
 
 @pipeline_component_registry.register("LMSequenceClassifierService")
 class LMSequenceClassifierService(LanguageModelPipelineComponent):
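A hedged sketch of the new sanity check in isolation: the expected tokenizer class is read from the model's transformers config rather than a hardcoded architecture map. Whether `tokenizer_class` is populated depends on the checkpoint's config.json, so treat this as illustrative:

    from transformers import AutoConfig, AutoTokenizer

    name = "microsoft/layoutlm-base-uncased"  # illustrative checkpoint
    config = AutoConfig.from_pretrained(name)
    tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True)
    # config.tokenizer_class may be None if the checkpoint does not pin a tokenizer
    if config.tokenizer_class and config.tokenizer_class != tokenizer.__class__.__name__:
        raise TypeError(f"You want to use {type(tokenizer)} but you should use {config.tokenizer_class}")
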
@@ -315,7 +281,8 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -335,7 +302,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             PageType.document_type, lm_output.class_name, lm_output.class_id, None, lm_output.score
         )
 
-    def clone(self) -> "LMSequenceClassifierService":
+    def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -358,15 +325,16 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         return f"lm_sequence_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        use_xlm_tokenizer = False
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
             raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]