deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +16 -29
- deepdoctection/analyzer/dd.py +70 -59
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +41 -56
- deepdoctection/datapoint/box.py +9 -8
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +56 -44
- deepdoctection/datapoint/view.py +245 -150
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +35 -26
- deepdoctection/datasets/base.py +14 -12
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +24 -26
- deepdoctection/datasets/instances/doclaynet.py +51 -51
- deepdoctection/datasets/instances/fintabnet.py +46 -46
- deepdoctection/datasets/instances/funsd.py +25 -24
- deepdoctection/datasets/instances/iiitar13k.py +13 -10
- deepdoctection/datasets/instances/layouttest.py +4 -3
- deepdoctection/datasets/instances/publaynet.py +5 -5
- deepdoctection/datasets/instances/pubtables1m.py +24 -21
- deepdoctection/datasets/instances/pubtabnet.py +32 -30
- deepdoctection/datasets/instances/rvlcdip.py +30 -30
- deepdoctection/datasets/instances/xfund.py +26 -26
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +15 -13
- deepdoctection/eval/eval.py +41 -37
- deepdoctection/eval/tedsmetric.py +30 -23
- deepdoctection/eval/tp_eval_callback.py +16 -19
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +85 -113
- deepdoctection/extern/deskew.py +14 -11
- deepdoctection/extern/doctrocr.py +141 -130
- deepdoctection/extern/fastlang.py +27 -18
- deepdoctection/extern/hfdetr.py +71 -62
- deepdoctection/extern/hflayoutlm.py +504 -211
- deepdoctection/extern/hflm.py +230 -0
- deepdoctection/extern/model.py +488 -302
- deepdoctection/extern/pdftext.py +23 -19
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +29 -19
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +18 -18
- deepdoctection/extern/tp/tfutils.py +57 -9
- deepdoctection/extern/tp/tpcompat.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +45 -53
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/cats.py +27 -29
- deepdoctection/mapper/cocostruct.py +10 -10
- deepdoctection/mapper/d2struct.py +27 -26
- deepdoctection/mapper/hfstruct.py +13 -8
- deepdoctection/mapper/laylmstruct.py +178 -37
- deepdoctection/mapper/maputils.py +12 -11
- deepdoctection/mapper/match.py +2 -2
- deepdoctection/mapper/misc.py +11 -9
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +5 -5
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +5 -5
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +12 -14
- deepdoctection/pipe/base.py +52 -106
- deepdoctection/pipe/common.py +72 -59
- deepdoctection/pipe/concurrency.py +16 -11
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +20 -16
- deepdoctection/pipe/lm.py +75 -105
- deepdoctection/pipe/order.py +194 -89
- deepdoctection/pipe/refine.py +111 -124
- deepdoctection/pipe/segment.py +156 -161
- deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +48 -41
- deepdoctection/train/hf_detr_train.py +41 -30
- deepdoctection/train/hf_layoutlm_train.py +153 -135
- deepdoctection/train/tp_frcnn_train.py +32 -31
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +87 -125
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +22 -18
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +16 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +11 -11
- deepdoctection/utils/settings.py +185 -181
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +74 -72
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
- deepdoctection-0.33.dist-info/RECORD +146 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.31.dist-info/RECORD +0 -144
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: hfml.py
|
|
3
|
+
|
|
4
|
+
# Copyright 2024 Dr. Janis Meyer. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
Wrapper for the Hugging Face Language Model for sequence and token classification
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from abc import ABC
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Literal, Mapping, Optional, Union
|
|
26
|
+
|
|
27
|
+
from lazy_imports import try_import
|
|
28
|
+
|
|
29
|
+
from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
|
|
30
|
+
from ..utils.settings import TypeOrStr
|
|
31
|
+
from ..utils.types import JsonDict, PathLikeOrStr, Requirement
|
|
32
|
+
from .base import LMSequenceClassifier, ModelCategories, SequenceClassResult
|
|
33
|
+
from .hflayoutlm import get_tokenizer_from_model_class
|
|
34
|
+
from .pt.ptutils import get_torch_device
|
|
35
|
+
|
|
36
|
+
with try_import() as pt_import_guard:
|
|
37
|
+
import torch
|
|
38
|
+
import torch.nn.functional as F
|
|
39
|
+
|
|
40
|
+
with try_import() as tr_import_guard:
|
|
41
|
+
from transformers import PretrainedConfig, XLMRobertaForSequenceClassification
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def predict_sequence_classes(
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    model: XLMRobertaForSequenceClassification,
) -> SequenceClassResult:
    """
    Run a sequence-classification forward pass and return the predicted class with its score.

    :param input_ids: Token ids produced by the model's tokenizer
    :param attention_mask: The associated attention masks from padded sequences taken from the tokenizer
    :param token_type_ids: Torch tensor of token type ids taken from the tokenizer
    :param model: Bert-like model for sequence classification
    :return: SequenceClassResult with `class_id` and `score` populated
    """

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    # Softmax over the class dimension. Passing dim=-1 explicitly avoids the
    # deprecated implicit-dim behavior of F.softmax. For a single sequence the
    # maximum probability is exactly the probability of the argmax class below.
    score = torch.max(F.softmax(outputs.logits, dim=-1)).tolist()
    sequence_class_predictions = outputs.logits.argmax(-1).squeeze().tolist()

    return SequenceClassResult(class_id=sequence_class_predictions, score=float(score))  # type: ignore
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
    """
    Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
    """

    def __init__(
        self,
        path_config_json: PathLikeOrStr,
        path_weights: PathLikeOrStr,
        categories: Mapping[int, TypeOrStr],
        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
    ):
        """
        :param path_config_json: path to the .json model config
        :param path_weights: path to the model artifact
        :param categories: mapping of category ids to category names
        :param device: "cpu", "cuda" or a torch.device. If None the device is selected automatically
        """
        self.path_config = Path(path_config_json)
        self.path_weights = Path(path_weights)
        self.categories = ModelCategories(init_categories=categories)

        self.device = get_torch_device(device)

    @classmethod
    def get_requirements(cls) -> list[Requirement]:
        return [get_pytorch_requirement(), get_transformers_requirement()]

    def clone(self) -> HFLmSequenceClassifierBase:
        return self.__class__(self.path_config, self.path_weights, self.categories.get_categories(), self.device)

    def _validate_encodings(
        self, **encodings: Union[list[list[str]], torch.Tensor]
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Validate that the required encodings are tensors and move them to the model device.

        :param encodings: must contain `input_ids`, `attention_mask` and `token_type_ids` as torch.Tensor
        :return: the three tensors, each placed on `self.device`
        :raises ValueError: if any required encoding is missing or not a torch.Tensor
        """
        input_ids = encodings.get("input_ids")
        attention_mask = encodings.get("attention_mask")
        token_type_ids = encodings.get("token_type_ids")

        # The original error messages claimed "must be list" although the check
        # requires a torch.Tensor; the messages now state the actual requirement.
        if not isinstance(input_ids, torch.Tensor):
            raise ValueError(f"input_ids must be torch.Tensor but is {type(input_ids)}")
        if not isinstance(attention_mask, torch.Tensor):
            raise ValueError(f"attention_mask must be torch.Tensor but is {type(attention_mask)}")
        if not isinstance(token_type_ids, torch.Tensor):
            raise ValueError(f"token_type_ids must be torch.Tensor but is {type(token_type_ids)}")

        # Move each tensor to the target device exactly once (the original code
        # performed every transfer twice).
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        token_type_ids = token_type_ids.to(self.device)
        return input_ids, attention_mask, token_type_ids

    @staticmethod
    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
        """Returns the name of the model"""
        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])

    @staticmethod
    def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
        """A refinement for adding the tokenizer class name to the model configs.

        :param model_class_name: The model name, e.g. model.__class__.__name__
        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
        """
        tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
        return tokenizer.__class__.__name__

    @staticmethod
    def image_to_raw_features_mapping() -> str:
        """Returns the mapping function to convert images into raw features."""
        return "image_to_raw_lm_features"

    @staticmethod
    def image_to_features_mapping() -> str:
        """Returns the mapping function to convert images into features."""
        return "image_to_lm_features"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
    """
    A wrapper class for `transformers.XLMRobertaForSequenceClassification` and similar models to use within a pipeline
    component. Check <https://huggingface.co/docs/transformers/model_doc/xlm-roberta> for documentation of the
    model itself.
    Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
    classification and other things please use another model of the family.

    **Example**

        # setting up compulsory ocr service
        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
        tess = TesseractOcrDetector(tesseract_config_path)
        ocr_service = TextExtractionService(tess)

        # hf tokenizer and sequence classifier
        tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
        roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
                                          categories=["handwritten", "presentation", "resume"])

        # sequence classification service
        roberta_service = LMSequenceClassifierService(tokenizer,roberta)

        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])

        path = "path/to/some/form"
        df = pipe.analyze(path=path)

        for dp in df:
            ...
    """

    def __init__(
        self,
        path_config_json: PathLikeOrStr,
        path_weights: PathLikeOrStr,
        categories: Mapping[int, TypeOrStr],
        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
        use_xlm_tokenizer: bool = True,
    ):
        """
        :param path_config_json: path to the .json model config
        :param path_weights: path to the model artifact
        :param categories: mapping of category ids to category names
        :param device: "cpu", "cuda" or a torch.device. If None the device is selected automatically
        :param use_xlm_tokenizer: whether the tokenizer class written into the model config is an XLM tokenizer
        """
        super().__init__(path_config_json, path_weights, categories, device)
        self.name = self.get_name(path_weights, "bert-like")
        self.model_id = self.get_model_id()
        self.model = self.get_wrapped_model(path_config_json, path_weights)
        self.model.to(self.device)
        # Record the matching tokenizer class in the model config so that
        # downstream consumers can recover the right tokenizer from the config.
        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
            self.model.__class__.__name__, use_xlm_tokenizer
        )

    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
        """
        Classify the input sequence.

        :param encodings: must contain `input_ids`, `attention_mask` and `token_type_ids` as torch.Tensor
        :return: SequenceClassResult with `class_id` shifted into the category mapping and `class_name` resolved
        """
        input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)

        result = predict_sequence_classes(
            input_ids,
            attention_mask,
            token_type_ids,
            self.model,
        )

        # Shift the model's 0-based label index into the 1-based category
        # mapping used by ModelCategories before resolving the class name.
        result.class_id += 1
        result.class_name = self.categories.categories[result.class_id]
        return result

    @staticmethod
    def get_wrapped_model(
        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
    ) -> XLMRobertaForSequenceClassification:
        """
        Get the inner (wrapped) model.

        :param path_config_json: path to .json config file
        :param path_weights: path to model artifact
        :return: 'nn.Module'
        """
        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
        return XLMRobertaForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=path_weights, config=config
        )

    @staticmethod
    def default_kwargs_for_image_to_features_mapping() -> JsonDict:
        """
        Add some default arguments that might be necessary when preparing a sample. Overwrite this method
        for some custom setting.
        """
        return {}

    def clear_model(self) -> None:
        # Drop the reference to the wrapped model so its memory can be reclaimed.
        self.model = None
|