deepdoctection 0.37.2__py3-none-any.whl → 0.37.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +1 -1
- deepdoctection/extern/hflayoutlm.py +51 -34
- deepdoctection/pipe/lm.py +7 -7
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.37.3.dist-info}/METADATA +1 -1
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.37.3.dist-info}/RECORD +8 -8
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.37.3.dist-info}/LICENSE +0 -0
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.37.3.dist-info}/WHEEL +0 -0
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.37.3.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflayoutlm.py
CHANGED
@@ -48,7 +48,7 @@ with try_import() as pt_import_guard:
     import torch.nn.functional as F
 
 with try_import() as tr_import_guard:
-    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
     from transformers import (
         LayoutLMForSequenceClassification,
         LayoutLMForTokenClassification,
@@ -67,20 +67,6 @@ with try_import() as tr_import_guard:
     )
 
 if TYPE_CHECKING:
-    LayoutTokenModels: TypeAlias = Union[
-        LayoutLMForTokenClassification,
-        LayoutLMv2ForTokenClassification,
-        LayoutLMv3ForTokenClassification,
-        LiltForTokenClassification,
-    ]
-
-    LayoutSequenceModels: TypeAlias = Union[
-        LayoutLMForSequenceClassification,
-        LayoutLMv2ForSequenceClassification,
-        LayoutLMv3ForSequenceClassification,
-        LiltForSequenceClassification,
-    ]
-
     HfLayoutTokenModels: TypeAlias = Union[
         LayoutLMForTokenClassification,
         LayoutLMv2ForTokenClassification,
@@ -147,7 +133,7 @@ def predict_token_classes(
     token_type_ids: torch.Tensor,
     boxes: torch.Tensor,
     tokens: list[list[str]],
-    model:
+    model: HfLayoutTokenModels,
     images: Optional[torch.Tensor] = None,
 ) -> list[TokenClassResult]:
     """
@@ -205,7 +191,7 @@ def predict_sequence_classes(
     attention_mask: torch.Tensor,
     token_type_ids: torch.Tensor,
     boxes: torch.Tensor,
-    model:
+    model: HfLayoutSequenceModels,
     images: Optional[torch.Tensor] = None,
 ) -> SequenceClassResult:
     """
@@ -254,6 +240,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[int, TypeOrStr]] = None,
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -281,6 +268,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             init_categories=categories, categories_semantics=categories_semantics, categories_bio=categories_bio
         )
         self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
 
     @classmethod
     def get_requirements(cls) -> list[Requirement]:
@@ -342,6 +330,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             self.categories.categories_bio,
             self.categories.get_categories(),
             self.device,
+            self.use_xlm_tokenizer,
         )
 
     @staticmethod
@@ -427,13 +416,15 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
                                   Tokenizer.
         """
-        super().__init__(
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )
 
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -540,13 +531,15 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
                                   default value.
         """
-        super().__init__(
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )
 
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -666,13 +659,15 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLMv3 model with a different
                                   tokenizer.
         """
-        super().__init__(
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
        )
 
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -746,19 +741,23 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
         path_weights: PathLikeOrStr,
         categories: Mapping[int, TypeOrStr],
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         self.path_config = Path(path_config_json)
         self.path_weights = Path(path_weights)
         self.categories = ModelCategories(init_categories=categories)
 
         self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
 
     @classmethod
     def get_requirements(cls) -> list[Requirement]:
         return [get_pytorch_requirement(), get_transformers_requirement()]
 
     def clone(self) -> HFLayoutLmSequenceClassifierBase:
-        return self.__class__(
+        return self.__class__(
+            self.path_config, self.path_weights, self.categories.get_categories(), self.device, self.use_xlm_tokenizer
+        )
 
     def _validate_encodings(
         self, **encodings: Union[list[list[str]], torch.Tensor]
@@ -856,13 +855,13 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )
 
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -939,13 +938,13 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )
 
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1029,13 +1028,13 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )
 
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1142,13 +1141,15 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
         :param device: The device (cpu,"cuda"), where to place the model.
         """
 
-        super().__init__(
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LiLT")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )
 
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -1232,13 +1233,13 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LiLT")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )
 
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1270,3 +1271,19 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
 
     def clear_model(self) -> None:
         self.model = None
+
+
+if TYPE_CHECKING:
+    LayoutTokenModels: TypeAlias = Union[
+        HFLayoutLmTokenClassifier,
+        HFLayoutLmv2TokenClassifier,
+        HFLayoutLmv3TokenClassifier,
+        HFLiltTokenClassifier,
+    ]
+
+    LayoutSequenceModels: TypeAlias = Union[
+        HFLayoutLmSequenceClassifier,
+        HFLayoutLmv2SequenceClassifier,
+        HFLayoutLmv3SequenceClassifier,
+        HFLiltSequenceClassifier,
+    ]
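The net effect of the hflayoutlm.py changes: use_xlm_tokenizer is now accepted and stored by the token- and sequence-classifier base classes, every subclass forwards it through super().__init__, clone() reproduces it, and the old TYPE_CHECKING aliases reappear at the end of the module as unions over the wrapper classes. A minimal sketch of how this surfaces to a user; the paths and the label map are placeholders, and the keyword names are assumed to match the base signature shown above:

# Sketch only: construct a LiLT token classifier whose checkpoint expects an XLM-style
# tokenizer and clone it. Paths and labels are illustrative, not values from this release.
from deepdoctection.extern.hflayoutlm import HFLiltTokenClassifier

lilt = HFLiltTokenClassifier(
    path_config_json="/models/lilt/config.json",       # placeholder path
    path_weights="/models/lilt/pytorch_model.bin",      # placeholder path
    categories={1: "B-header", 2: "I-header", 3: "O"},  # placeholder label map
    use_xlm_tokenizer=True,                             # e.g. a LiLT checkpoint paired with XLM-RoBERTa
)

# Because the flag is now stored as self.use_xlm_tokenizer and passed on by clone(),
# the copy resolves the same tokenizer class as the original.
lilt_copy = lilt.clone()
assert lilt_copy.use_xlm_tokenizer == lilt.use_xlm_tokenizer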
deepdoctection/pipe/lm.py
CHANGED
@@ -30,7 +30,7 @@ from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
 if TYPE_CHECKING:
-    from ..extern.hflayoutlm import
+    from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels
 
 
 @pipeline_component_registry.register("LMTokenClassifierService")
@@ -66,7 +66,7 @@ class LMTokenClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model:
+        language_model: LayoutTokenModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -155,11 +155,11 @@ class LMTokenClassifierService(PipelineComponent):
             else:
                 token_class_name_id = None
             self.dp_manager.set_category_annotation(
-                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid
+                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid, token.score
             )
             self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.TAG, token.uuid)
             self.dp_manager.set_category_annotation(
-                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid
+                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid, token.score
            )
             words_populated.append(token.uuid)
 
@@ -188,7 +188,7 @@ class LMTokenClassifierService(PipelineComponent):
         # multiple threads
         return self.__class__(
             copy(self.tokenizer),
-            self.language_model.clone(),
+            self.language_model.clone(),  # type: ignore
             self.padding,
             self.truncation,
             self.return_overflowing_tokens,
@@ -260,7 +260,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model:
+        language_model: LayoutSequenceModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -309,7 +309,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
-            self.language_model.clone(),
+            self.language_model.clone(),  # type: ignore
             self.padding,
             self.truncation,
             self.return_overflowing_tokens,
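In pipe/lm.py, the two service constructors pick up the new module-level aliases for their language_model argument, and the token confidence (token.score) is now stored on the TOKEN_CLASS and TOKEN_TAG category annotations alongside the class name and id. A short sketch of wiring the token-classification service, assuming only the constructor keywords visible in the hunks above; the model paths, label map and tokenizer choice are placeholders:

# Sketch only: pair a LayoutLM token classifier with LMTokenClassifierService.
# Paths, labels and the tokenizer are illustrative; the constructor shape follows the diff.
from transformers import LayoutLMTokenizerFast

from deepdoctection.extern.hflayoutlm import HFLayoutLmTokenClassifier
from deepdoctection.pipe.lm import LMTokenClassifierService

layoutlm = HFLayoutLmTokenClassifier(
    path_config_json="/models/layoutlm/config.json",    # placeholder path
    path_weights="/models/layoutlm/pytorch_model.bin",   # placeholder path
    categories={1: "B-answer", 2: "I-answer", 3: "O"},   # placeholder label map
)

service = LMTokenClassifierService(
    tokenizer=LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased"),
    language_model=layoutlm,   # annotated as LayoutTokenModels as of 0.37.3
    padding="max_length",
    truncation=True,
    return_overflowing_tokens=False,
)

# clone() copies the tokenizer and calls language_model.clone(); when the service runs,
# each predicted token's score is now written into the category annotations as well.
service_copy = service.clone()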
{deepdoctection-0.37.2.dist-info → deepdoctection-0.37.3.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-deepdoctection/__init__.py,sha256=
+deepdoctection/__init__.py,sha256=SSW2Y9uos3Mnkihi-H-CmRF5V8nHUARbToRp83yWJB0,12655
 deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
 deepdoctection/analyzer/_config.py,sha256=NZl_REM8Ge2xfxvHN-mZR5KURcHfZii3xfMlKQwckbA,4864
@@ -56,7 +56,7 @@ deepdoctection/extern/deskew.py,sha256=sPoixu8S9he-0wbs-jgxtPE2V9BiP4-3uZlb6F5Y1
 deepdoctection/extern/doctrocr.py,sha256=T3_tvlih22_dVCBZypS1Y8tjQQB1fkAxIbGdUGHIapQ,24473
 deepdoctection/extern/fastlang.py,sha256=F4gK-SEwcCujjxH327ZDzMGWToJ49xS_dCKcePQ9IlY,4780
 deepdoctection/extern/hfdetr.py,sha256=1NPW_u5eH2tP3ixZ91l4WR-O-wLVcrFsLWA7BqID0oM,12055
-deepdoctection/extern/hflayoutlm.py,sha256=
+deepdoctection/extern/hflayoutlm.py,sha256=tFaf90FRbZzhSycdp8rGkeiPywQa6UcTEEwbayIXkr0,57023
 deepdoctection/extern/hflm.py,sha256=kwS6kcSlY_2m9u0RzBLTRq-UMM7c1PhyUaDTvSdejus,9217
 deepdoctection/extern/model.py,sha256=ViHHKPvbGmLCPw7ZESv_rmjlkA90UiBU6oZiHOMqNSw,59869
 deepdoctection/extern/pdftext.py,sha256=KS_t27SUiYn_IOS_J2lF9lSSo22vLagxmxvYCY3CqXA,7228
@@ -109,7 +109,7 @@ deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrN
 deepdoctection/pipe/doctectionpipe.py,sha256=wCg96P9Pb54i5AVgG02b4FljobM64_qEML_GxiULy-4,11765
 deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
 deepdoctection/pipe/layout.py,sha256=xIhnJpyUSbvLbhTXyAKXY1hmG9352jihGYFSclTH_1g,5567
-deepdoctection/pipe/lm.py,sha256=
+deepdoctection/pipe/lm.py,sha256=tLuCtML-S_kTEYcDAEtM3NBYmR7Aovv9p5TcXYL_AAg,16693
 deepdoctection/pipe/order.py,sha256=PnJZiCnxFluJiECXLTZT0c1Rr66vIRBFraa_G41UA2k,40121
 deepdoctection/pipe/refine.py,sha256=RjaOv5dvVxVrMm_ze-yeAqc46Be2I--7UARLezYbNxA,22250
 deepdoctection/pipe/registry.py,sha256=aFx-Tn0xhVA5l5H18duNW5QoTNKQltybsEUEzsMgUfg,902
@@ -141,8 +141,8 @@ deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F
 deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
 deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
 deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
-deepdoctection-0.37.
-deepdoctection-0.37.
-deepdoctection-0.37.
-deepdoctection-0.37.
-deepdoctection-0.37.
+deepdoctection-0.37.3.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
+deepdoctection-0.37.3.dist-info/METADATA,sha256=aD8ZORAl_RBxZXaL0uojCx-bs8Xe3x9ocLSpvXn5nRc,19545
+deepdoctection-0.37.3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+deepdoctection-0.37.3.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
+deepdoctection-0.37.3.dist-info/RECORD,,
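Each RECORD line has the form path,sha256=<digest>,<size>, where the digest is the urlsafe base64 encoding of the raw SHA-256 hash with the trailing "=" padding stripped. A small, package-agnostic sketch for recomputing such a digest, for example to confirm that an installed file matches one of the hashes listed above:

# Sketch only: recompute a wheel RECORD-style sha256 digest for a local file.
import base64
import hashlib
from pathlib import Path

def record_digest(path: Path) -> str:
    # Raw SHA-256 digest, urlsafe-base64 encoded, "=" padding removed (wheel RECORD convention).
    digest = hashlib.sha256(path.read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Example call; the path below is a placeholder for wherever the package is installed:
# print(record_digest(Path("site-packages/deepdoctection/pipe/lm.py")))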
{deepdoctection-0.37.2.dist-info → deepdoctection-0.37.3.dist-info}/LICENSE
File without changes
{deepdoctection-0.37.2.dist-info → deepdoctection-0.37.3.dist-info}/WHEEL
File without changes
{deepdoctection-0.37.2.dist-info → deepdoctection-0.37.3.dist-info}/top_level.txt
File without changes