deepdoctection-0.32-py3-none-any.whl → deepdoctection-0.34-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +8 -25
- deepdoctection/analyzer/dd.py +84 -71
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +78 -56
- deepdoctection/datapoint/box.py +7 -7
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +157 -75
- deepdoctection/datapoint/view.py +175 -151
- deepdoctection/datasets/adapter.py +30 -24
- deepdoctection/datasets/base.py +10 -10
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +23 -25
- deepdoctection/datasets/instances/doclaynet.py +48 -49
- deepdoctection/datasets/instances/fintabnet.py +44 -45
- deepdoctection/datasets/instances/funsd.py +23 -23
- deepdoctection/datasets/instances/iiitar13k.py +8 -8
- deepdoctection/datasets/instances/layouttest.py +2 -2
- deepdoctection/datasets/instances/publaynet.py +3 -3
- deepdoctection/datasets/instances/pubtables1m.py +18 -18
- deepdoctection/datasets/instances/pubtabnet.py +30 -29
- deepdoctection/datasets/instances/rvlcdip.py +28 -29
- deepdoctection/datasets/instances/xfund.py +51 -30
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +13 -12
- deepdoctection/eval/eval.py +32 -26
- deepdoctection/eval/tedsmetric.py +16 -12
- deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +69 -89
- deepdoctection/extern/deskew.py +11 -10
- deepdoctection/extern/doctrocr.py +81 -64
- deepdoctection/extern/fastlang.py +23 -16
- deepdoctection/extern/hfdetr.py +53 -38
- deepdoctection/extern/hflayoutlm.py +216 -155
- deepdoctection/extern/hflm.py +35 -30
- deepdoctection/extern/model.py +433 -255
- deepdoctection/extern/pdftext.py +15 -15
- deepdoctection/extern/pt/ptutils.py +4 -2
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +14 -16
- deepdoctection/extern/tp/tfutils.py +16 -2
- deepdoctection/extern/tp/tpcompat.py +11 -7
- deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- deepdoctection/extern/tpdetect.py +40 -45
- deepdoctection/mapper/cats.py +36 -40
- deepdoctection/mapper/cocostruct.py +16 -12
- deepdoctection/mapper/d2struct.py +22 -22
- deepdoctection/mapper/hfstruct.py +7 -7
- deepdoctection/mapper/laylmstruct.py +22 -24
- deepdoctection/mapper/maputils.py +9 -10
- deepdoctection/mapper/match.py +33 -2
- deepdoctection/mapper/misc.py +6 -7
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +6 -6
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/anngen.py +39 -14
- deepdoctection/pipe/base.py +68 -99
- deepdoctection/pipe/common.py +181 -85
- deepdoctection/pipe/concurrency.py +14 -10
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +18 -16
- deepdoctection/pipe/lm.py +49 -47
- deepdoctection/pipe/order.py +63 -65
- deepdoctection/pipe/refine.py +102 -109
- deepdoctection/pipe/segment.py +157 -162
- deepdoctection/pipe/sub_layout.py +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/d2_frcnn_train.py +27 -25
- deepdoctection/train/hf_detr_train.py +22 -18
- deepdoctection/train/hf_layoutlm_train.py +49 -48
- deepdoctection/train/tp_frcnn_train.py +10 -11
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +52 -14
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +41 -14
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +15 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/pdf_utils.py +39 -14
- deepdoctection/utils/settings.py +188 -182
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
- deepdoctection-0.34.dist-info/RECORD +146 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.32.dist-info/RECORD +0 -146
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflm.py
CHANGED
|
@@ -21,16 +21,15 @@ Wrapper for the Hugging Face Language Model for sequence and token classificati
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
23
|
from abc import ABC
|
|
24
|
-
from copy import copy
|
|
25
24
|
from pathlib import Path
|
|
26
|
-
from typing import
|
|
25
|
+
from typing import Literal, Mapping, Optional, Union
|
|
27
26
|
|
|
28
27
|
from lazy_imports import try_import
|
|
29
28
|
|
|
30
|
-
from ..utils.detection_types import JsonDict, Requirement
|
|
31
29
|
from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
|
|
32
30
|
from ..utils.settings import TypeOrStr
|
|
33
|
-
from .
|
|
31
|
+
from ..utils.types import JsonDict, PathLikeOrStr, Requirement
|
|
32
|
+
from .base import LMSequenceClassifier, ModelCategories, SequenceClassResult
|
|
34
33
|
from .hflayoutlm import get_tokenizer_from_model_class
|
|
35
34
|
from .pt.ptutils import get_torch_device
|
|
36
35
|
|
|
@@ -69,34 +68,29 @@ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
|
|
|
69
68
|
Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
|
|
70
69
|
"""
|
|
71
70
|
|
|
72
|
-
model: Union[XLMRobertaForSequenceClassification]
|
|
73
|
-
|
|
74
71
|
def __init__(
|
|
75
72
|
self,
|
|
76
|
-
path_config_json:
|
|
77
|
-
path_weights:
|
|
78
|
-
categories: Mapping[
|
|
73
|
+
path_config_json: PathLikeOrStr,
|
|
74
|
+
path_weights: PathLikeOrStr,
|
|
75
|
+
categories: Mapping[int, TypeOrStr],
|
|
79
76
|
device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
|
|
80
|
-
use_xlm_tokenizer: bool = False,
|
|
81
77
|
):
|
|
82
|
-
self.path_config = path_config_json
|
|
83
|
-
self.path_weights = path_weights
|
|
84
|
-
self.categories =
|
|
78
|
+
self.path_config = Path(path_config_json)
|
|
79
|
+
self.path_weights = Path(path_weights)
|
|
80
|
+
self.categories = ModelCategories(init_categories=categories)
|
|
85
81
|
|
|
86
82
|
self.device = get_torch_device(device)
|
|
87
|
-
self.model.to(self.device)
|
|
88
|
-
self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)
|
|
89
83
|
|
|
90
84
|
@classmethod
|
|
91
|
-
def get_requirements(cls) ->
|
|
85
|
+
def get_requirements(cls) -> list[Requirement]:
|
|
92
86
|
return [get_pytorch_requirement(), get_transformers_requirement()]
|
|
93
87
|
|
|
94
88
|
def clone(self) -> HFLmSequenceClassifierBase:
|
|
95
|
-
return self.__class__(self.path_config, self.path_weights, self.categories, self.device)
|
|
89
|
+
return self.__class__(self.path_config, self.path_weights, self.categories.get_categories(), self.device)
|
|
96
90
|
|
|
97
91
|
def _validate_encodings(
|
|
98
|
-
self, **encodings: Union[
|
|
99
|
-
) ->
|
|
92
|
+
self, **encodings: Union[list[list[str]], torch.Tensor]
|
|
93
|
+
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
|
100
94
|
input_ids = encodings.get("input_ids")
|
|
101
95
|
attention_mask = encodings.get("attention_mask")
|
|
102
96
|
token_type_ids = encodings.get("token_type_ids")
|
|
@@ -120,16 +114,18 @@ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
|
|
|
120
114
|
return input_ids, attention_mask, token_type_ids
|
|
121
115
|
|
|
122
116
|
@staticmethod
|
|
123
|
-
def get_name(path_weights:
|
|
117
|
+
def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
|
|
124
118
|
"""Returns the name of the model"""
|
|
125
119
|
return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
|
|
126
120
|
|
|
127
|
-
|
|
121
|
+
@staticmethod
|
|
122
|
+
def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
|
|
128
123
|
"""A refinement for adding the tokenizer class name to the model configs.
|
|
129
124
|
|
|
125
|
+
:param model_class_name: The model name, e.g. model.__class__.__name__
|
|
130
126
|
:param use_xlm_tokenizer: Whether to use a XLM tokenizer.
|
|
131
127
|
"""
|
|
132
|
-
tokenizer = get_tokenizer_from_model_class(
|
|
128
|
+
tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
|
|
133
129
|
return tokenizer.__class__.__name__
|
|
134
130
|
|
|
135
131
|
@staticmethod
|
|
@@ -177,18 +173,22 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
|
|
|
177
173
|
|
|
178
174
|
def __init__(
|
|
179
175
|
self,
|
|
180
|
-
path_config_json:
|
|
181
|
-
path_weights:
|
|
182
|
-
categories: Mapping[
|
|
176
|
+
path_config_json: PathLikeOrStr,
|
|
177
|
+
path_weights: PathLikeOrStr,
|
|
178
|
+
categories: Mapping[int, TypeOrStr],
|
|
183
179
|
device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
|
|
184
180
|
use_xlm_tokenizer: bool = True,
|
|
185
181
|
):
|
|
182
|
+
super().__init__(path_config_json, path_weights, categories, device)
|
|
186
183
|
self.name = self.get_name(path_weights, "bert-like")
|
|
187
184
|
self.model_id = self.get_model_id()
|
|
188
185
|
self.model = self.get_wrapped_model(path_config_json, path_weights)
|
|
189
|
-
|
|
186
|
+
self.model.to(self.device)
|
|
187
|
+
self.model.config.tokenizer_class = self.get_tokenizer_class_name(
|
|
188
|
+
self.model.__class__.__name__, use_xlm_tokenizer
|
|
189
|
+
)
|
|
190
190
|
|
|
191
|
-
def predict(self, **encodings: Union[
|
|
191
|
+
def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
|
|
192
192
|
input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)
|
|
193
193
|
|
|
194
194
|
result = predict_sequence_classes(
|
|
@@ -199,11 +199,13 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
|
|
|
199
199
|
)
|
|
200
200
|
|
|
201
201
|
result.class_id += 1
|
|
202
|
-
result.class_name = self.categories[
|
|
202
|
+
result.class_name = self.categories.categories[result.class_id]
|
|
203
203
|
return result
|
|
204
204
|
|
|
205
205
|
@staticmethod
|
|
206
|
-
def get_wrapped_model(
|
|
206
|
+
def get_wrapped_model(
|
|
207
|
+
path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
|
|
208
|
+
) -> XLMRobertaForSequenceClassification:
|
|
207
209
|
"""
|
|
208
210
|
Get the inner (wrapped) model.
|
|
209
211
|
|
|
@@ -217,9 +219,12 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
|
|
|
217
219
|
)
|
|
218
220
|
|
|
219
221
|
@staticmethod
|
|
220
|
-
def
|
|
222
|
+
def default_kwargs_for_image_to_features_mapping() -> JsonDict:
|
|
221
223
|
"""
|
|
222
224
|
Add some default arguments that might be necessary when preparing a sample. Overwrite this method
|
|
223
225
|
for some custom setting.
|
|
224
226
|
"""
|
|
225
227
|
return {}
|
|
228
|
+
|
|
229
|
+
def clear_model(self) -> None:
|
|
230
|
+
self.model = None
|