deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/base.py CHANGED
@@ -23,12 +23,14 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from copy import deepcopy
 from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional, Set, Union
+from uuid import uuid1
 
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner, TextRecognizer
 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict
+from ..utils.identifier import get_uuid_from_str
 from .anngen import DatapointManager
 
 
@@ -58,8 +60,9 @@ class PipelineComponent(ABC):
         pipeline. Use something that describe the task of the pipeline.
         """
         self.name = name
+        self.service_id = self.get_service_id()
         self._meta_has_all_types()
-        self.dp_manager = DatapointManager()
+        self.dp_manager = DatapointManager(self.service_id)
         self.timer_on = False
 
     @abstractmethod
@@ -75,7 +78,7 @@ class PipelineComponent(ABC):
         As a simplified interface `serve` does not have to return a dp. The data point is passed on within
         pipelines internally (via `pass_datapoint`).
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def pass_datapoint(self, dp: Image) -> Image:
         """
@@ -109,7 +112,7 @@ class PipelineComponent(ABC):
         """
         Clone an instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @abstractmethod
     def get_meta_annotation(self) -> JsonDict:
@@ -122,7 +125,7 @@ class PipelineComponent(ABC):
         `summaries` with values: A list of summary sub categories
         :return: Dict with meta infos as just described
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def _meta_has_all_types(self) -> None:
         if not {"image_annotations", "sub_categories", "relationships", "summaries"}.issubset(
@@ -133,6 +136,12 @@ class PipelineComponent(ABC):
                 f"Got {self.get_meta_annotation().keys()}"
             )
 
+    def get_service_id(self) -> str:
+        """
+        Get the generating model
+        """
+        return get_uuid_from_str(self.name)[:8]
+
 
 class PredictorPipelineComponent(PipelineComponent, ABC):
     """
@@ -151,10 +160,11 @@ class PredictorPipelineComponent(PipelineComponent, ABC):
         """
         self.predictor = predictor
         super().__init__(name)
+        self.dp_manager = DatapointManager(self.service_id, self.predictor.model_id)
 
     @abstractmethod
     def clone(self) -> "PredictorPipelineComponent":
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class LanguageModelPipelineComponent(PipelineComponent, ABC):
@@ -175,15 +185,15 @@ class LanguageModelPipelineComponent(PipelineComponent, ABC):
         """
 
         self.tokenizer = tokenizer
-        self.mapping_to_lm_input_func = mapping_to_lm_input_func
         super().__init__(name)
+        self.mapping_to_lm_input_func = mapping_to_lm_input_func
 
     @abstractmethod
     def clone(self) -> "LanguageModelPipelineComponent":
         """
         Clone an instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class ImageTransformPipelineComponent(PipelineComponent, ABC):
@@ -206,7 +216,7 @@ class ImageTransformPipelineComponent(PipelineComponent, ABC):
         """
         Clone an instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class Pipeline(ABC):
@@ -228,7 +238,7 @@ class Pipeline(ABC):
 
         layout = LayoutPipeComponent(layout_detector ...)
         text = TextExtractPipeComponent(text_detector ...)
-        simple_pipe = MyPipeline (pipeline_component = [layout, text])
+        simple_pipe = MyPipeline(pipeline_component = [layout, text])
         doc_dataflow = simple_pipe.analyze(input = path / to / dir)
 
         for page in doc_dataflow:
@@ -238,6 +248,18 @@ class Pipeline(ABC):
     model or already processed further).
 
     In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
+
+    It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
+    either passed to the pipeline via the `analyze` method or generated automatically.
+
+    To generate a session_id automatically:
+
+    **Example:**
+
+        pipe = MyPipeline(pipeline_component = [layout, text])
+        pipe.set_session_id = True
+
+        df = pipe.analyze(input = "path/to/dir")  # session_id is generated automatically
     """
 
     def __init__(self, pipeline_component_list: List[PipelineComponent]) -> None:
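The session id described above is read out further down in this diff by `DoctectionPipe.analyze` (via `kwargs.get("session_id")`). A usage sketch; `get_dd_analyzer`, `analyze(path=...)` and `reset_state()` are existing entry points of the package, only the session id handling is new in this release:

    import deepdoctection as dd

    analyzer = dd.get_dd_analyzer()
    analyzer.set_session_id = True             # auto-generate a session id per analyze() call
    df = analyzer.analyze(path="path/to/dir")  # alternatively: analyze(path=..., session_id="a1b2c3d4")
    df.reset_state()
    for page in df:
        print(page.file_name)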
@@ -245,6 +267,7 @@ class Pipeline(ABC):
         :param pipeline_component_list: A list of pipeline components.
         """
         self.pipe_component_list = pipeline_component_list
+        self.set_session_id = False
 
     @abstractmethod
     def _entry(self, **kwargs: Any) -> DataFlow:
@@ -254,14 +277,17 @@ class Pipeline(ABC):
 
         :param kwargs: Arguments, for dynamic customizing of the processing or for the transfer of processing types
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
-    def _build_pipe(self, df: DataFlow) -> DataFlow:
+    def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
         """
         Composition of the backbone
         """
+        if session_id is None and self.set_session_id:
+            session_id = self.get_session_id()
         for component in self.pipe_component_list:
             component.timer_on = True
+            component.dp_manager.session_id = session_id
             df = component.predict_dataflow(df)
         return df
 
@@ -277,7 +303,7 @@ class Pipeline(ABC):
 
         can be triggered.
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def get_meta_annotation(self) -> JsonDict:
         """
@@ -301,22 +327,30 @@ class Pipeline(ABC):
             for key, value in meta_anns["relationships"].items():
                 pipeline_populations["relationships"][key].update(value)
             pipeline_populations["summaries"].extend(meta_anns["summaries"])  # type: ignore
-
+        pipeline_populations["sub_categories"] = dict(pipeline_populations["sub_categories"])  # type: ignore
+        pipeline_populations["relationships"] = dict(pipeline_populations["relationships"])  # type: ignore
         return pipeline_populations
 
     def get_pipeline_info(
-        self, position: Optional[int] = None, name: Optional[str] = None
-    ) -> Union[Mapping[int, str], str, int]:
+        self, service_id: Optional[str] = None, name: Optional[str] = None
+    ) -> Union[str, Mapping[str, str]]:
         """Get pipeline information: Returns a dictionary with a description of each pipeline component
-        :param position: position of the pipeline component in the pipeline
+        :param service_id: service_id of the pipeline component to search for
        :param name: name of the pipeline component to search for
        :return: Either a full dictionary with position and name of all pipeline components or the name, if the position
        has been passed or the position if the name has been passed.
        """
-        comp_info = {key: comp.name for key, comp in enumerate(self.pipe_component_list)}
+        comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
         comp_info_name_as_key = {value: key for key, value in comp_info.items()}
-        if position is not None:
-            return comp_info[position]
+        if service_id is not None:
+            return comp_info[service_id]
         if name is not None:
             return comp_info_name_as_key[name]
         return comp_info
+
+    @staticmethod
+    def get_session_id() -> str:
+        """
+        Get the generating a session id
+        """
+        return str(uuid1())[:8]
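With components keyed by `service_id`, `get_pipeline_info` now maps ids to names instead of list positions. A small self-contained mirror of the new lookup logic; the ids and names below are hypothetical:

    from typing import Mapping, Optional, Union

    def pipeline_info(
        comp_info: Mapping[str, str], service_id: Optional[str] = None, name: Optional[str] = None
    ) -> Union[str, Mapping[str, str]]:
        # service_id -> name, name -> service_id, or the full mapping if neither is given
        name_as_key = {value: key for key, value in comp_info.items()}
        if service_id is not None:
            return comp_info[service_id]
        if name is not None:
            return name_as_key[name]
        return comp_info

    components = {"3f2a9c1d": "image_layout", "8b7e01aa": "text_extract"}  # hypothetical ids and names
    assert pipeline_info(components, service_id="3f2a9c1d") == "image_layout"
    assert pipeline_info(components, name="text_extract") == "8b7e01aa"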
deepdoctection/pipe/common.py CHANGED
@@ -18,6 +18,10 @@
 """
 Module for common pipeline components
 """
+from __future__ import annotations
+
+import os
+
 from copy import copy, deepcopy
 from typing import List, Literal, Mapping, Optional, Sequence, Union
 
@@ -30,16 +34,14 @@ from ..mapper.maputils import MappingContextManager
 from ..mapper.match import match_anns_by_intersection
 from ..mapper.misc import to_image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import detectron2_available, pytorch_available, tf_available
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
 from .base import PipelineComponent
 from .registry import pipeline_component_registry
 
-if tf_available():
-    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
-
-elif pytorch_available() and detectron2_available():
+if os.environ.get("DD_USE_TORCH"):
     from ..mapper.d2struct import pt_nms_image_annotations as nms_image_annotations
+elif os.environ.get("DD_USE_TF"):
+    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
 
 
 @pipeline_component_registry.register("ImageCroppingService")
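The NMS helper is now selected by environment variable instead of probing the installed packages. A usage sketch; the variable names come from the hunk above, and the value "1" is arbitrary since any non-empty value counts as set:

    import os

    # Choose the framework before the first deepdoctection import; the branch above
    # runs when deepdoctection.pipe.common is imported.
    os.environ["DD_USE_TORCH"] = "1"   # or: os.environ["DD_USE_TF"] = "1"

    import deepdoctection as dd  # noqa: E402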
@@ -64,7 +66,7 @@ class ImageCroppingService(PipelineComponent):
         for ann in dp.get_annotation(category_names=self.category_names):
             dp.image_ann_to_image(ann.annotation_id, crop_image=True)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(self.category_names)
 
     def get_meta_annotation(self) -> JsonDict:
@@ -93,8 +95,8 @@ class MatchingService(PipelineComponent):
 
     def __init__(
         self,
-        parent_categories: Union[TypeOrStr, List[TypeOrStr]],
-        child_categories: Union[TypeOrStr, List[TypeOrStr]],
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
         matching_rule: Literal["iou", "ioa"],
         threshold: float,
         use_weighted_intersections: bool = False,
@@ -112,8 +114,16 @@ class MatchingService(PipelineComponent):
                                value calibrate the ioa.
         :param max_parent_only: Will assign to each child at most one parent with maximum ioa
         """
-        self.parent_categories = parent_categories
-        self.child_categories = child_categories
+        self.parent_categories = (
+            [get_type(parent_categories)]  # type: ignore
+            if not isinstance(parent_categories, (list, set))
+            else [get_type(parent_category) for parent_category in parent_categories]
+        )
+        self.child_categories = (
+            [get_type(child_categories)]  # type: ignore
+            if not isinstance(child_categories, (list, set))
+            else [get_type(child_category) for child_category in child_categories]
+        )
         assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
         self.matching_rule = matching_rule
         self.threshold = threshold
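`MatchingService` now normalizes `parent_categories` and `child_categories` at construction time: a single value and a sequence both end up as a list of resolved types. A plain-string mirror of that branch (the real code additionally resolves every entry with `get_type`):

    from typing import List, Sequence, Union

    def normalize(categories: Union[str, Sequence[str]]) -> List[str]:
        # Mirrors the constructor logic above; only list and set trigger the element-wise
        # branch, so e.g. a tuple would be wrapped as a single value.
        return [categories] if not isinstance(categories, (list, set)) else [c for c in categories]

    assert normalize("table") == ["table"]
    assert normalize(["table", "figure"]) == ["table", "figure"]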
@@ -217,7 +227,7 @@ class PageParsingService:
         """
         return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
 
-    def clone(self) -> "PageParsingService":
+    def clone(self) -> PageParsingService:
         """clone"""
         return self.__class__(
             deepcopy(self.text_container),
@@ -284,7 +294,7 @@ class AnnotationNmsService(PipelineComponent):
             if ann.annotation_id not in ann_ids_to_keep:
                 self.dp_manager.deactivate_annotation(ann.annotation_id)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(deepcopy(self.nms_pairs), self.threshold)
 
     def get_meta_annotation(self) -> JsonDict:
@@ -318,7 +328,7 @@ class ImageParsingService:
         """
         return MapData(df, self.pass_datapoint)
 
-    def clone(self) -> "ImageParsingService":
+    def clone(self) -> ImageParsingService:
         """clone"""
         return self.__class__(self.dpi)
 
deepdoctection/pipe/concurrency.py CHANGED
@@ -18,6 +18,7 @@
 """
 Module for multithreading tasks
 """
+from __future__ import annotations
 
 import itertools
 import queue
@@ -221,7 +222,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
     def serve(self, dp: Image) -> None:
         raise NotImplementedError("MultiThreadPipelineComponent does not follow the PipelineComponent implementation")
 
-    def clone(self) -> "MultiThreadPipelineComponent":
+    def clone(self) -> MultiThreadPipelineComponent:
         raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")
 
     def get_meta_annotation(self) -> JsonDict:
deepdoctection/pipe/doctectionpipe.py CHANGED
@@ -82,7 +82,6 @@ def _proto_process(
     else:
         path_tmp = path
     logger.info(LoggingRecord(f"Processing {file_name}", {"path": path_tmp, "df": path_tmp, "file_name": file_name}))
-    # logger.info("Processing %s", file_name, {"path": path_tmp, "df": path_tmp, "file_name": file_name})
     return dp
 
 
@@ -221,9 +220,10 @@ class DoctectionPipe(Pipeline):
         """
 
         output = kwargs.get("output", "page")
+        session_id = kwargs.get("session_id")
         assert output in ("page", "image", "dict"), "output must be either page image or dict"
         df = self._entry(**kwargs)
-        df = self._build_pipe(df)
+        df = self._build_pipe(df, session_id=session_id)  # type: ignore
         if output == "page":
             df = self.dataflow_to_page(df)
         elif output == "dict":
deepdoctection/pipe/language.py CHANGED
@@ -25,6 +25,7 @@ from ..datapoint.image import Image
 from ..datapoint.view import Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.detection_types import JsonDict
+from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
 from .base import PipelineComponent
 from .registry import pipeline_component_registry
@@ -86,7 +87,7 @@ class LanguageDetectionService(PipelineComponent):
             text = page.text_no_line_break
         else:
             if dp.image is None:
-                raise ValueError("dp.image cannot be None")
+                raise ImageError("image cannot be None")
             detect_result_list = self.text_detector.predict(dp.image)
             # this is a concatenation of all detection result. No reading order
             text = " ".join([result.text for result in detect_result_list if result.text is not None])
@@ -98,7 +99,7 @@ class LanguageDetectionService(PipelineComponent):
     def clone(self) -> PipelineComponent:
         predictor = self.predictor.clone()
         if not isinstance(predictor, LanguageDetector):
-            raise ValueError(f"Predictor must be of type LanguageDetector, but is of type {type(predictor)}")
+            raise TypeError(f"Predictor must be of type LanguageDetector, but is of type {type(predictor)}")
         return self.__class__(
             predictor,
             copy(self.text_container),
deepdoctection/pipe/layout.py CHANGED
@@ -18,6 +18,8 @@
 """
 Module for layout pipeline component
 """
+from __future__ import annotations
+
 from typing import Optional
 
 import numpy as np
@@ -25,6 +27,7 @@ import numpy as np
 from ..datapoint.image import Image
 from ..extern.base import ObjectDetector, PdfMiner
 from ..utils.detection_types import JsonDict
+from ..utils.error import ImageError
 from ..utils.transform import PadTransform
 from .base import PredictorPipelineComponent
 from .registry import pipeline_component_registry
@@ -79,7 +82,7 @@ class ImageLayoutService(PredictorPipelineComponent):
         if anns:
             return
         if dp.image is None:
-            raise ValueError("image cannot be None")
+            raise ImageError("image cannot be None")
         np_image = dp.image
         if self.padder:
             np_image = self.padder.apply_image(np_image)
@@ -108,11 +111,11 @@ class ImageLayoutService(PredictorPipelineComponent):
     def _get_name(predictor_name: str) -> str:
         return f"image_{predictor_name}"
 
-    def clone(self) -> "PredictorPipelineComponent":
+    def clone(self) -> PredictorPipelineComponent:
         predictor = self.predictor.clone()
         padder_clone = None
         if self.padder:
             padder_clone = self.padder.clone()
         if not isinstance(predictor, ObjectDetector):
-            raise ValueError(f"predictor must be of type ObjectDetector, but is of type {type(predictor)}")
+            raise TypeError(f"predictor must be of type ObjectDetector, but is of type {type(predictor)}")
         return self.__class__(predictor, self.to_image, self.crop_image, padder_clone, self.skip_if_layout_extracted)
deepdoctection/pipe/lm.py CHANGED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# File: tokenclass.py
+# File: lm.py
 
 # Copyright 2021 Dr. Janis Meyer. All rights reserved.
 #
@@ -18,57 +18,19 @@
 """
 Module for token classification pipeline
 """
+from __future__ import annotations
 
 from copy import copy
-from typing import Any, List, Literal, Optional, Sequence, Union
+from typing import Any, Callable, List, Literal, Optional, Sequence, Union
 
 from ..datapoint.image import Image
 from ..extern.hflayoutlm import HFLayoutLmSequenceClassifierBase, HFLayoutLmTokenClassifierBase
-from ..mapper.laylmstruct import image_to_layoutlm_features
+from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import transformers_available
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
 from .base import LanguageModelPipelineComponent
 from .registry import pipeline_component_registry
 
-if transformers_available():
-    from transformers import LayoutLMTokenizerFast, RobertaTokenizerFast, XLMRobertaTokenizerFast
-
-    _ARCHITECTURES_TO_TOKENIZER = {
-        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-    }
-
-
-    def get_tokenizer_from_architecture(architecture_name: str, use_xlm_tokenizer: bool) -> Any:
-        """
-        We do not use the tokenizer for a particular model that the transformer library provides. Thie mapping therefore
-        returns the tokenizer that should be used for a particular model.
-
-        :param architecture_name: The model as stated in the transformer library.
-        :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-        :return: Tokenizer instance to use.
-        """
-        return _ARCHITECTURES_TO_TOKENIZER[(architecture_name, use_xlm_tokenizer)]
-
 
 @pipeline_component_registry.register("LMTokenClassifierService")
 class LMTokenClassifierService(LanguageModelPipelineComponent):
@@ -154,7 +116,8 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         else:
             self.default_key = TokenClasses.other
         self.other_name_as_key = {self.default_key: categories_name_as_key[self.default_key]}
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -218,7 +181,9 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
                 word.annotation_id,
             )
 
-    def clone(self) -> "LMTokenClassifierService":
+    def clone(self) -> LMTokenClassifierService:
+        # ToDo: replace copying of tokenizer with a proper clone method. Otherwise we cannot run the evaluation with
+        # multiple threads
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -244,19 +209,20 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         return f"lm_token_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        use_xlm_tokenizer = False
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
-            raise ValueError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
+            raise TypeError(
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
 
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
 
 @pipeline_component_registry.register("LMSequenceClassifierService")
 class LMSequenceClassifierService(LanguageModelPipelineComponent):
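The hard-coded architecture-to-tokenizer table is gone; both services now simply compare the tokenizer's class name with the `tokenizer_class` string recorded on the model config. A minimal mirror of the new check, with dummy classes standing in for a real model config and tokenizer:

    class DummyConfig:
        tokenizer_class = "LayoutLMTokenizerFast"

    class LayoutLMTokenizerFast:  # stand-in for the transformers class of the same name
        pass

    config, tokenizer = DummyConfig(), LayoutLMTokenizerFast()
    if config.tokenizer_class != tokenizer.__class__.__name__:
        raise TypeError(f"You want to use {type(tokenizer)} but you should use {config.tokenizer_class}")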
@@ -315,7 +281,8 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -335,7 +302,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             PageType.document_type, lm_output.class_name, lm_output.class_id, None, lm_output.score
         )
 
-    def clone(self) -> "LMSequenceClassifierService":
+    def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -358,15 +325,16 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         return f"lm_sequence_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        use_xlm_tokenizer = False
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
-            raise ValueError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
+            raise TypeError(
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]