deepdoctection 0.44.1__py3-none-any.whl → 0.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +7 -3
- deepdoctection/analyzer/config.py +44 -0
- deepdoctection/analyzer/factory.py +264 -7
- deepdoctection/configs/profiles.jsonl +2 -1
- deepdoctection/dataflow/parallel_map.py +7 -1
- deepdoctection/datapoint/box.py +5 -5
- deepdoctection/datapoint/image.py +5 -5
- deepdoctection/datapoint/view.py +73 -52
- deepdoctection/eval/cocometric.py +1 -0
- deepdoctection/extern/__init__.py +1 -0
- deepdoctection/extern/base.py +8 -1
- deepdoctection/extern/d2detect.py +1 -1
- deepdoctection/extern/doctrocr.py +18 -2
- deepdoctection/extern/fastlang.py +2 -2
- deepdoctection/extern/hflayoutlm.py +17 -10
- deepdoctection/extern/hflm.py +432 -7
- deepdoctection/extern/tessocr.py +17 -1
- deepdoctection/pipe/language.py +4 -4
- deepdoctection/pipe/lm.py +7 -3
- deepdoctection/pipe/order.py +12 -6
- deepdoctection/pipe/refine.py +10 -1
- deepdoctection/pipe/text.py +6 -0
- deepdoctection/pipe/transform.py +3 -0
- deepdoctection/utils/file_utils.py +34 -5
- deepdoctection/utils/logger.py +38 -1
- deepdoctection/utils/settings.py +2 -0
- deepdoctection/utils/transform.py +43 -18
- deepdoctection/utils/viz.py +24 -15
- {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/METADATA +16 -21
- {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/RECORD +33 -33
- {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/WHEEL +0 -0
- {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflm.py
CHANGED

@@ -20,16 +20,28 @@ Wrapper for the HF Language Model for sequence and token classification
 """
 from __future__ import annotations

+import os
 from abc import ABC
+from collections import defaultdict
 from pathlib import Path
-from typing import Literal, Mapping, Optional, Union
+from typing import TYPE_CHECKING, Any, Literal, Mapping, Optional, Sequence, Union

 from lazy_imports import try_import
+from typing_extensions import TypeAlias

 from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
 from ..utils.settings import TypeOrStr
 from ..utils.types import JsonDict, PathLikeOrStr, Requirement
-from .base import
+from .base import (
+    DetectionResult,
+    LanguageDetector,
+    LMSequenceClassifier,
+    LMTokenClassifier,
+    ModelCategories,
+    NerModelCategories,
+    SequenceClassResult,
+    TokenClassResult,
+)
 from .hflayoutlm import get_tokenizer_from_model_class
 from .pt.ptutils import get_torch_device

@@ -38,14 +50,63 @@ with try_import() as pt_import_guard:
     import torch.nn.functional as F

 with try_import() as tr_import_guard:
-    from transformers import
+    from transformers import (
+        PretrainedConfig,
+        XLMRobertaForSequenceClassification,
+        XLMRobertaForTokenClassification,
+        XLMRobertaTokenizerFast,
+    )


-def
+def predict_token_classes_from_lm(
+    uuids: list[list[str]],
     input_ids: torch.Tensor,
     attention_mask: torch.Tensor,
     token_type_ids: torch.Tensor,
-
+    tokens: list[list[str]],
+    model: XLMRobertaForTokenClassification,
+) -> list[TokenClassResult]:
+    """
+    Args:
+        uuids: A list of uuids that correspond to a word that induces the resulting token
+        input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+        attention_mask: The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+        token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+        tokens: List of original tokens taken from `LayoutLMTokenizer`
+        model: layoutlm model for token classification
+
+    Returns:
+        A list of `TokenClassResult`s
+    """
+
+    outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+
+    soft_max = F.softmax(outputs.logits, dim=2)
+    score = torch.max(soft_max, dim=2)[0].tolist()
+    token_class_predictions_ = outputs.logits.argmax(-1).tolist()
+    input_ids_list = input_ids.tolist()
+
+    all_results = defaultdict(list)
+    for idx, uuid_list in enumerate(uuids):
+        for pos, token in enumerate(uuid_list):
+            all_results[token].append(
+                (input_ids_list[idx][pos], token_class_predictions_[idx][pos], tokens[idx][pos], score[idx][pos])
+            )
+    all_token_classes = []
+    for uuid, res in all_results.items():
+        res.sort(key=lambda x: x[3], reverse=True)
+        output = res[0]
+        all_token_classes.append(
+            TokenClassResult(uuid=uuid, token_id=output[0], class_id=output[1], token=output[2], score=output[3])
+        )
+    return all_token_classes
+
+
+def predict_sequence_classes_from_lm(
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    model: XLMRobertaForSequenceClassification,
 ) -> SequenceClassResult:
     """
     Args:
@@ -66,6 +127,250 @@ def predict_sequence_classes(
     return SequenceClassResult(class_id=sequence_class_predictions, score=float(score))  # type: ignore


+class HFLmTokenClassifierBase(LMTokenClassifier, ABC):
+    """
+    Abstract base class for wrapping Bert-like models for token classification into the framework.
+    """
+
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+        categories_bio: Optional[Sequence[TypeOrStr]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        """
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
+                                  entities self. To be consistent with detectors use only values `>0`. Conversion will
+                                  be done internally.
+            categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. `BIO`). To be
+                            consistent with detectors use only `values>0`. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of NER categories
+            device: The device (cpu,"cuda"), where to place the model.
+            use_xlm_tokenizer: True if one uses the `LayoutXLM` or a lilt model built with a xlm language model, e.g.
+                               `info-xlm` or `roberta-xlm`. (`LayoutXLM` cannot be distinguished from LayoutLMv2).
+        """
+
+        if categories is None:
+            if categories_semantics is None:
+                raise ValueError("If categories is None then categories_semantics cannot be None")
+            if categories_bio is None:
+                raise ValueError("If categories is None then categories_bio cannot be None")
+
+        self.path_config = Path(path_config_json)
+        self.path_weights = Path(path_weights)
+        self.categories = NerModelCategories(
+            init_categories=categories, categories_semantics=categories_semantics, categories_bio=categories_bio
+        )
+        self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
+
+    @classmethod
+    def get_requirements(cls) -> list[Requirement]:
+        return [get_pytorch_requirement(), get_transformers_requirement()]
+
+    def _map_category_names(self, token_results: list[TokenClassResult]) -> list[TokenClassResult]:
+        for result in token_results:
+            result.class_name = self.categories.categories[result.class_id + 1]
+            output = self.categories.disentangle_token_class_and_tag(result.class_name)
+            if output is not None:
+                token_class, tag = output
+                result.semantic_name = token_class
+                result.bio_tag = tag
+            else:
+                result.semantic_name = result.class_name
+            result.class_id += 1
+        return token_results
+
+    def _validate_encodings(
+        self, **encodings: Any
+    ) -> tuple[list[list[str]], list[str], torch.Tensor, torch.Tensor, torch.Tensor, list[list[str]]]:
+        image_ids = encodings.get("image_ids", [])
+        ann_ids = encodings.get("ann_ids")
+        input_ids = encodings.get("input_ids")
+        attention_mask = encodings.get("attention_mask")
+        token_type_ids = encodings.get("token_type_ids")
+        tokens = encodings.get("tokens")
+
+        assert isinstance(ann_ids, list), type(ann_ids)
+        if len(set(image_ids)) > 1:
+            raise ValueError("HFLmTokenClassifier accepts for inference only one image.")
+        if isinstance(input_ids, torch.Tensor):
+            input_ids = input_ids.to(self.device)
+        else:
+            raise ValueError(f"input_ids must be list but is {type(input_ids)}")
+        if isinstance(attention_mask, torch.Tensor):
+            attention_mask = attention_mask.to(self.device)
+        else:
+            raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
+        if isinstance(token_type_ids, torch.Tensor):
+            token_type_ids = token_type_ids.to(self.device)
+        else:
+            raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
+        if not isinstance(tokens, list):
+            raise ValueError(f"tokens must be list but is {type(tokens)}")
+
+        return ann_ids, image_ids, input_ids, attention_mask, token_type_ids, tokens
+
+    def clone(self) -> HFLmTokenClassifierBase:
+        return self.__class__(
+            self.path_config,
+            self.path_weights,
+            self.categories.categories_semantics,
+            self.categories.categories_bio,
+            self.categories.get_categories(),
+            self.device,
+            self.use_xlm_tokenizer,
+        )
+
+    @staticmethod
+    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
+        """Returns the name of the model"""
+        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+    @staticmethod
+    def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
+        """
+        A refinement for adding the tokenizer class name to the model configs.
+
+        Args:
+            model_class_name: The model name, e.g. `model.__class__.__name__`
+            use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
+
+        Returns:
+            The name of the tokenizer class.
+        """
+        tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_lm_features"
+
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_lm_features"
+
+
+class HFLmTokenClassifier(HFLmTokenClassifierBase):
+    """
+    A wrapper class for `transformers.XLMRobertaForTokenClassification` and similar models to use within a pipeline
+    component. Check <https://huggingface.co/docs/transformers/model_doc/xlm-roberta> for documentation of the
+    model itself.
+    Note that this model is equipped with a head that is only useful for classifying the tokens. For sequence
+    classification and other things please use another model of the family.
+
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+
+        # hf tokenizer and token classifier
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
+        roberta = XLMRobertaForTokenClassification("path/to/config.json","path/to/model.bin",
+                                                    categories=["first_name", "surname", "street"])
+
+        # token classification service
+        roberta_service = LMTokenClassifierService(tokenizer,roberta)
+
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
+
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+
+        for dp in df:
+            ...
+        ```
+    """
+
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+        categories_bio: Optional[Sequence[TypeOrStr]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = True,
+    ):
+        """
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
+                                  entities self. To be consistent with detectors use only values `>0`. Conversion will
+                                  be done internally.
+            categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. BIO). To be
+                            consistent with detectors use only values>0. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of NER categories
+            device: The device (cpu,"cuda"), where to place the model.
+            use_xlm_tokenizer: Do not change this value unless you pre-trained a bert-like model with a different
+                               Tokenizer.
+        """
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
+        self.name = self.get_name(path_weights, "bert-like-token-classification")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, self.use_xlm_tokenizer
+        )
+
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
+        """
+        Launch inference on bert-like models for token classification. Pass the following arguments
+
+        Args:
+            encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+                       attention_mask: The associated attention masks from padded sequences taken from
+                                       `LayoutLMTokenizer`
+                       token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+                       boxes: Torch tensor of bounding boxes of type `xyxy`
+                       tokens: List of original tokens taken from `LayoutLMTokenizer`
+
+        Returns:
+            A list of `TokenClassResult`s
+        """
+
+        ann_ids, _, input_ids, attention_mask, token_type_ids, tokens = self._validate_encodings(**encodings)
+        results = predict_token_classes_from_lm(ann_ids, input_ids, attention_mask, token_type_ids, tokens, self.model)
+        return self._map_category_names(results)
+
+    @staticmethod
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> XLMRobertaForTokenClassification:
+        """
+        Get the inner (wrapped) model.
+
+        Args:
+            path_config_json: path to .json config file
+            path_weights: path to model artifact
+
+        Returns:
+            `nn.Module`
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
+        return XLMRobertaForTokenClassification.from_pretrained(
+            pretrained_model_name_or_path=os.fspath(path_weights), config=config
+        )
+
+    def clear_model(self) -> None:
+        self.model = None
+
+
 class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     """
     Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
@@ -208,10 +513,11 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
         use_xlm_tokenizer: bool = True,
     ):
         super().__init__(path_config_json, path_weights, categories, device)
-        self.name = self.get_name(path_weights, "bert-like")
+        self.name = self.get_name(path_weights, "bert-like-sequence-classification")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
             self.model.__class__.__name__, use_xlm_tokenizer
         )
@@ -219,7 +525,7 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)

-        result =
+        result = predict_sequence_classes_from_lm(
             input_ids,
             attention_mask,
             token_type_ids,
@@ -262,3 +568,122 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):

     def clear_model(self) -> None:
         self.model = None
+
+
+class HFLmLanguageDetector(LanguageDetector):
+    """
+    Language detector using HuggingFace's `XLMRobertaForSequenceClassification`.
+
+    This class wraps a multilingual sequence classification model (XLMRobertaForSequenceClassification)
+    for language detection tasks. Input text is tokenized and truncated/padded to a maximum length of 512 tokens.
+    The prediction returns a `DetectionResult` containing the detected language code and its confidence score.
+    """
+
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories: Mapping[int, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = True,
+    ):
+        super().__init__()
+        self.path_config = Path(path_config_json)
+        self.path_weights = Path(path_weights)
+        self.categories = ModelCategories(init_categories=categories)
+        self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        self.model.to(self.device)
+        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
+        self.name = self.get_name(path_weights, "bert-like-language-detection")
+        self.model_id = self.get_model_id()
+
+    def predict(self, text_string: str) -> DetectionResult:
+        """
+        Predict the language of the input sequence.
+
+        Args:
+            text_string: The input text sequence to classify.
+
+        Returns:
+            DetectionResult: The detected language and its confidence score.
+        """
+        encoding = self.tokenizer(
+            text_string,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512,
+        )
+        input_ids = encoding["input_ids"].to(self.device)
+        attention_mask = encoding["attention_mask"].to(self.device)
+        token_type_ids = encoding.get("token_type_ids")
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.to(self.device)
+        else:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        self.model.eval()
+        with torch.no_grad():
+            outputs = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+            )
+        probs = torch.softmax(outputs.logits, dim=-1)
+        score, class_id_tensor = torch.max(probs, dim=-1)
+        class_id = int(class_id_tensor.item() + 1)
+        lang = self.categories.categories[class_id]
+
+        return DetectionResult(class_name=lang, score=float(score.item()))
+
+    def clear_model(self) -> None:
+        self.model = None
+
+    @classmethod
+    def get_requirements(cls) -> list[Requirement]:
+        return [get_pytorch_requirement(), get_transformers_requirement()]
+
+    @staticmethod
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> XLMRobertaForSequenceClassification:
+        """
+        Get the inner (wrapped) model.
+
+        Args:
+            path_config_json: path to .json config file
+            path_weights: path to model artifact
+
+        Returns:
+            `XLMRobertaForSequenceClassification`
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return XLMRobertaForSequenceClassification.from_pretrained(
+            pretrained_model_name_or_path=path_weights, config=config
+        )
+
+    def clone(self) -> HFLmLanguageDetector:
+        return self.__class__(
+            self.path_config, self.path_weights, self.categories.get_categories(), self.device, self.use_xlm_tokenizer
+        )
+
+    @staticmethod
+    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
+        """
+        Returns the name of the model
+
+        Args:
+            path_weights: Path to model weights
+            architecture: Architecture name
+
+        Returns:
+            str: Model name
+        """
+        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+
+if TYPE_CHECKING:
+    LmTokenModels: TypeAlias = Union[HFLmTokenClassifier,]
+    LmSequenceModels: TypeAlias = Union[HFLmSequenceClassifier,]
deepdoctection/extern/tessocr.py
CHANGED

@@ -28,8 +28,9 @@ from errno import ENOENT
 from itertools import groupby
 from os import environ, fspath
 from pathlib import Path
-from typing import Any, Mapping, Optional, Union
+from typing import Any, Mapping, Optional, Sequence, Union

+import numpy as np
 from packaging.version import InvalidVersion, Version, parse

 from ..utils.context import save_tmp_file, timeout_manager
@@ -37,6 +38,7 @@ from ..utils.error import DependencyError, TesseractError
 from ..utils.file_utils import _TESS_PATH, get_tesseract_requirement
 from ..utils.metacfg import config_to_cli_str, set_config_by_yaml
 from ..utils.settings import LayoutType, ObjectTypes, PageType
+from ..utils.transform import RotationTransform
 from ..utils.types import PathLikeOrStr, PixelValues, Requirement
 from ..utils.viz import viz_handler
 from .base import DetectionResult, ImageTransformer, ModelCategories, ObjectDetector
@@ -450,6 +452,7 @@ class TesseractRotationTransformer(ImageTransformer):
         self.name = fspath(_TESS_PATH) + "-rotation"
         self.categories = ModelCategories(init_categories={1: PageType.ANGLE})
         self.model_id = self.get_model_id()
+        self.rotator = RotationTransform(360)

     def transform_image(self, np_img: PixelValues, specification: DetectionResult) -> PixelValues:
         """
@@ -465,6 +468,19 @@ class TesseractRotationTransformer(ImageTransformer):
         """
         return viz_handler.rotate_image(np_img, specification.angle)  # type: ignore

+    def transform_coords(self, detect_results: Sequence[DetectionResult]) -> Sequence[DetectionResult]:
+        if detect_results:
+            if detect_results[0].angle:
+                self.rotator.set_angle(detect_results[0].angle)  # type: ignore
+                self.rotator.set_image_width(detect_results[0].image_width)  # type: ignore
+                self.rotator.set_image_height(detect_results[0].image_height)  # type: ignore
+            transformed_coords = self.rotator.apply_coords(
+                np.asarray([detect_result.box for detect_result in detect_results], dtype=float)
+            )
+            for idx, detect_result in enumerate(detect_results):
+                detect_result.box = transformed_coords[idx, :].tolist()
+        return detect_results
+
     def predict(self, np_img: PixelValues) -> DetectionResult:
         """
         Determines the angle of the rotated image. It can only handle angles that are multiples of 90 degrees.
deepdoctection/pipe/language.py
CHANGED

@@ -21,7 +21,7 @@ Module for language detection pipeline component
 from typing import Optional, Sequence

 from ..datapoint.image import Image, MetaAnnotation
-from ..datapoint.view import
+from ..datapoint.view import IMAGE_DEFAULTS, Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
@@ -75,11 +75,11 @@ class LanguageDetectionService(PipelineComponent):

         self.predictor = language_detector
         self.text_detector = text_detector
-        self.text_container = get_type(text_container) if text_container is not None else
+        self.text_container = get_type(text_container) if text_container is not None else IMAGE_DEFAULTS.TEXT_CONTAINER
         self.floating_text_block_categories = (
             tuple(get_type(text_block) for text_block in floating_text_block_categories)
             if (floating_text_block_categories is not None)
-            else
+            else IMAGE_DEFAULTS.FLOATING_TEXT_BLOCK_CATEGORIES
         )

         super().__init__(self._get_name(self.predictor.name))
@@ -109,7 +109,7 @@ class LanguageDetectionService(PipelineComponent):
         text = " ".join((result.text for result in detect_result_list if result.text is not None))
         predict_result = self.predictor.predict(text)
         self.dp_manager.set_summary_annotation(
-            PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.
+            PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.class_name, predict_result.score
         )

     def clone(self) -> PipelineComponent:
deepdoctection/pipe/lm.py
CHANGED

@@ -20,6 +20,7 @@ Module for token classification pipeline
 """
 from __future__ import annotations

+import inspect
 from copy import copy
 from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union

@@ -32,6 +33,7 @@ from .registry import pipeline_component_registry

 if TYPE_CHECKING:
     from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels
+    from ..extern.hflm import LmSequenceModels, LmTokenModels


 @pipeline_component_registry.register("LMTokenClassifierService")
@@ -70,7 +72,7 @@ class LMTokenClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: LayoutTokenModels,
+        language_model: Union[LayoutTokenModels, LmTokenModels],
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -124,7 +126,7 @@ class LMTokenClassifierService(PipelineComponent):
                               might not get sent to the model because they are categorized as not
                               eligible token (e.g. empty string). If set to `True` it will assign all
                               words without token the `BioTag.outside` token.
-            segment_positions: Using bounding boxes of
+            segment_positions: Using bounding boxes of segments instead of words improves model accuracy
                                significantly for models that have been trained on segments rather than words.
                                Choose a single or a sequence of layout segments to use their bounding boxes. Note,
                                that the layout segments need to have a child-relationship with words. If a word
@@ -271,6 +273,8 @@ class LMTokenClassifierService(PipelineComponent):
                 f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+        func_params = inspect.signature(self.mapping_to_lm_input_func).parameters
+        self.required_kwargs = {k: v for k, v in self.required_kwargs.items() if k in func_params}

     @staticmethod
     def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
@@ -318,7 +322,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: LayoutSequenceModels,
+        language_model: Union[LayoutSequenceModels, LmSequenceModels],
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
deepdoctection/pipe/order.py
CHANGED

@@ -228,8 +228,8 @@ class OrderGenerator:
         columns: list[BoundingBox] = []
         anns.sort(
             key=lambda x: (
-                x.
-                x.
+                x.get_bounding_box(image_id).transform(image_width, image_height).cy,
+                x.get_bounding_box(image_id).transform(image_width, image_height).cx,
             )
         )
         for ann in anns:
@@ -309,7 +309,9 @@ class OrderGenerator:
         filtered_blocks: Sequence[tuple[int, str]]
         for idx in range(max_block_number + 1):
             filtered_blocks = list(filter(lambda x: x[0] == idx, blocks))  # type: ignore # pylint: disable=W0640
-            sorted_blocks.extend(
+            sorted_blocks.extend(
+                self._sort_anns_grouped_by_blocks(filtered_blocks, anns, image_width, image_height, image_id)
+            )
         reading_blocks = [(idx + 1, block[1]) for idx, block in enumerate(sorted_blocks)]

         if logger.isEnabledFor(DEBUG):
@@ -346,7 +348,11 @@ class OrderGenerator:

     @staticmethod
     def _sort_anns_grouped_by_blocks(
-        block: Sequence[tuple[int, str]],
+        block: Sequence[tuple[int, str]],
+        anns: Sequence[ImageAnnotation],
+        image_width: float,
+        image_height: float,
+        image_id: Optional[str] = None,
     ) -> list[tuple[int, str]]:
         if not block:
             return []
@@ -356,8 +362,8 @@ class OrderGenerator:
         block_anns = [ann for ann in anns if ann.annotation_id in ann_ids]
         block_anns.sort(
             key=lambda x: (
-                round(x.
-                round(x.
+                round(x.get_bounding_box(image_id).transform(image_width, image_height).uly, 2),
+                round(x.get_bounding_box(image_id).transform(image_width, image_height).ulx, 2),
             )
         )
         return [(block_number, ann.annotation_id) for ann in block_anns]