deepdoctection 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +6 -3
- deepdoctection/analyzer/config.py +41 -0
- deepdoctection/analyzer/factory.py +249 -1
- deepdoctection/configs/profiles.jsonl +2 -1
- deepdoctection/datapoint/image.py +1 -0
- deepdoctection/datapoint/view.py +162 -69
- deepdoctection/datasets/base.py +1 -0
- deepdoctection/extern/__init__.py +1 -0
- deepdoctection/extern/d2detect.py +1 -1
- deepdoctection/extern/fastlang.py +6 -4
- deepdoctection/extern/hflayoutlm.py +23 -10
- deepdoctection/extern/hflm.py +432 -7
- deepdoctection/mapper/laylmstruct.py +7 -7
- deepdoctection/pipe/language.py +4 -4
- deepdoctection/pipe/lm.py +7 -3
- deepdoctection/utils/file_utils.py +34 -0
- deepdoctection/utils/settings.py +2 -0
- deepdoctection/utils/types.py +0 -1
- deepdoctection/utils/viz.py +3 -3
- {deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/METADATA +15 -15
- {deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/RECORD +24 -24
- {deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/WHEEL +0 -0
- {deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflm.py
CHANGED
|
@@ -20,16 +20,28 @@ Wrapper for the HF Language Model for sequence and token classification
|
|
|
20
20
|
"""
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
|
+
import os
|
|
23
24
|
from abc import ABC
|
|
25
|
+
from collections import defaultdict
|
|
24
26
|
from pathlib import Path
|
|
25
|
-
from typing import Literal, Mapping, Optional, Union
|
|
27
|
+
from typing import TYPE_CHECKING, Any, Literal, Mapping, Optional, Sequence, Union
|
|
26
28
|
|
|
27
29
|
from lazy_imports import try_import
|
|
30
|
+
from typing_extensions import TypeAlias
|
|
28
31
|
|
|
29
32
|
from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
|
|
30
33
|
from ..utils.settings import TypeOrStr
|
|
31
34
|
from ..utils.types import JsonDict, PathLikeOrStr, Requirement
|
|
32
|
-
from .base import
|
|
35
|
+
from .base import (
|
|
36
|
+
DetectionResult,
|
|
37
|
+
LanguageDetector,
|
|
38
|
+
LMSequenceClassifier,
|
|
39
|
+
LMTokenClassifier,
|
|
40
|
+
ModelCategories,
|
|
41
|
+
NerModelCategories,
|
|
42
|
+
SequenceClassResult,
|
|
43
|
+
TokenClassResult,
|
|
44
|
+
)
|
|
33
45
|
from .hflayoutlm import get_tokenizer_from_model_class
|
|
34
46
|
from .pt.ptutils import get_torch_device
|
|
35
47
|
|
|
@@ -38,14 +50,63 @@ with try_import() as pt_import_guard:
|
|
|
38
50
|
import torch.nn.functional as F
|
|
39
51
|
|
|
40
52
|
with try_import() as tr_import_guard:
|
|
41
|
-
from transformers import
|
|
53
|
+
from transformers import (
|
|
54
|
+
PretrainedConfig,
|
|
55
|
+
XLMRobertaForSequenceClassification,
|
|
56
|
+
XLMRobertaForTokenClassification,
|
|
57
|
+
XLMRobertaTokenizerFast,
|
|
58
|
+
)
|
|
42
59
|
|
|
43
60
|
|
|
44
|
-
def
|
|
61
|
+
def predict_token_classes_from_lm(
    uuids: list[list[str]],
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    tokens: list[list[str]],
    model: XLMRobertaForTokenClassification,
) -> list[TokenClassResult]:
    """
    Run a token classification model on encoded sequences and return one result per uuid.

    Several token positions may share the same uuid. For every distinct uuid only the candidate with the
    highest softmax score is returned; on equal scores the candidate encountered first wins.

    Args:
        uuids: For each sequence, the uuid belonging to each token position. A uuid corresponds to the word that
               induces the resulting token.
        input_ids: Tensor of token ids, indexed as `[sequence][position]`.
        attention_mask: Attention mask passed to the model together with `input_ids`.
        token_type_ids: Tensor of token type ids passed to the model.
        tokens: For each sequence, the original token string at each token position.
        model: Token classification model whose output exposes `logits` indexed as `[sequence][position][class]`.

    Returns:
        A list of `TokenClassResult`s, one per distinct uuid, in order of first appearance of the uuid.
    """

    # Inference only: everything derived from the logits is converted to plain Python lists, so building an
    # autograd graph would only cost memory.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).logits
        scores = torch.max(F.softmax(logits, dim=2), dim=2)[0].tolist()
        class_ids = logits.argmax(-1).tolist()
    token_ids = input_ids.tolist()

    # uuid -> list of (token_id, class_id, token, score) candidates
    candidates_per_uuid: defaultdict[str, list[tuple[int, int, str, float]]] = defaultdict(list)
    for idx, uuid_list in enumerate(uuids):
        for pos, uuid in enumerate(uuid_list):
            candidates_per_uuid[uuid].append((token_ids[idx][pos], class_ids[idx][pos], tokens[idx][pos], scores[idx][pos]))

    all_token_classes = []
    for uuid, candidates in candidates_per_uuid.items():
        # `max` returns the first candidate with the highest score, which is what a stable descending sort
        # followed by taking element 0 would select.
        token_id, class_id, token, score = max(candidates, key=lambda candidate: candidate[3])
        all_token_classes.append(
            TokenClassResult(uuid=uuid, token_id=token_id, class_id=class_id, token=token, score=score)
        )
    return all_token_classes
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def predict_sequence_classes_from_lm(
|
|
106
|
+
input_ids: torch.Tensor,
|
|
107
|
+
attention_mask: torch.Tensor,
|
|
108
|
+
token_type_ids: torch.Tensor,
|
|
109
|
+
model: XLMRobertaForSequenceClassification,
|
|
49
110
|
) -> SequenceClassResult:
|
|
50
111
|
"""
|
|
51
112
|
Args:
|
|
@@ -66,6 +127,250 @@ def predict_sequence_classes(
|
|
|
66
127
|
return SequenceClassResult(class_id=sequence_class_predictions, score=float(score)) # type: ignore
|
|
67
128
|
|
|
68
129
|
|
|
130
|
+
class HFLmTokenClassifierBase(LMTokenClassifier, ABC):
    """
    Abstract base class for wrapping Bert-like models for token classification into the framework.
    """

    def __init__(
        self,
        path_config_json: PathLikeOrStr,
        path_weights: PathLikeOrStr,
        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
        categories_bio: Optional[Sequence[TypeOrStr]] = None,
        categories: Optional[Mapping[int, TypeOrStr]] = None,
        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
        use_xlm_tokenizer: bool = False,
    ):
        """
        Args:
            path_config_json: path to `.json` config file
            path_weights: path to model artifact
            categories_semantics: Sequence of category names for the `NER` semantics, i.e. the entities
                                  themselves. Required if `categories` is `None`.
            categories_bio: Sequence of category names for the `NER` tags (i.e. `BIO`). Required if `categories`
                            is `None`.
            categories: If you have a pre-trained model you can pass a complete dict of NER categories
            device: The device (cpu,"cuda"), where to place the model.
            use_xlm_tokenizer: Stored on the instance and cloned with it. Sub classes forward it to
                               `get_tokenizer_class_name` when resolving the tokenizer class.

        Raises:
            ValueError: If `categories` is `None` and `categories_semantics` or `categories_bio` is `None`.
        """

        if categories is None:
            if categories_semantics is None:
                raise ValueError("If categories is None then categories_semantics cannot be None")
            if categories_bio is None:
                raise ValueError("If categories is None then categories_bio cannot be None")

        self.path_config = Path(path_config_json)
        self.path_weights = Path(path_weights)
        self.categories = NerModelCategories(
            init_categories=categories, categories_semantics=categories_semantics, categories_bio=categories_bio
        )
        self.device = get_torch_device(device)
        self.use_xlm_tokenizer = use_xlm_tokenizer

    @classmethod
    def get_requirements(cls) -> list[Requirement]:
        """Returns the pytorch and the transformers requirement."""
        return [get_pytorch_requirement(), get_transformers_requirement()]

    def _map_category_names(self, token_results: list[TokenClassResult]) -> list[TokenClassResult]:
        """
        Add category names to raw token results (in place) and shift their `class_id` by one.

        Args:
            token_results: Results carrying the class index predicted by the model.

        Returns:
            The same list, with `class_name`, `semantic_name` and, if the class name can be split into a token
            class and a tag, `bio_tag` set on every result.
        """
        for result in token_results:
            # The categories are looked up with the model class index shifted by one.
            result.class_name = self.categories.categories[result.class_id + 1]
            output = self.categories.disentangle_token_class_and_tag(result.class_name)
            if output is not None:
                token_class, tag = output
                result.semantic_name = token_class
                result.bio_tag = tag
            else:
                result.semantic_name = result.class_name
            result.class_id += 1
        return token_results

    def _validate_encodings(
        self, **encodings: Any
    ) -> tuple[list[list[str]], list[str], torch.Tensor, torch.Tensor, torch.Tensor, list[list[str]]]:
        """
        Check the types of the encodings required for inference and move the tensors to `self.device`.

        Args:
            encodings: Must contain `ann_ids` (list), `input_ids`, `attention_mask`, `token_type_ids`
                       (all `torch.Tensor`) and `tokens` (list). `image_ids` is optional and defaults to an
                       empty list.

        Returns:
            The tuple `(ann_ids, image_ids, input_ids, attention_mask, token_type_ids, tokens)` where the three
            tensors have been moved to `self.device`.

        Raises:
            ValueError: If an entry has the wrong type or if `image_ids` contains more than one distinct id.
        """
        image_ids = encodings.get("image_ids", [])
        ann_ids = encodings.get("ann_ids")
        tokens = encodings.get("tokens")

        # Explicit raise instead of `assert`: assertions are removed when Python runs with `-O`.
        if not isinstance(ann_ids, list):
            raise ValueError(f"ann_ids must be list but is {type(ann_ids)}")
        if len(set(image_ids)) > 1:
            raise ValueError("HFLmTokenClassifier accepts for inference only one image.")

        tensors_on_device = []
        for key in ("input_ids", "attention_mask", "token_type_ids"):
            value = encodings.get(key)
            if not isinstance(value, torch.Tensor):
                raise ValueError(f"{key} must be torch.Tensor but is {type(value)}")
            tensors_on_device.append(value.to(self.device))
        input_ids, attention_mask, token_type_ids = tensors_on_device

        if not isinstance(tokens, list):
            raise ValueError(f"tokens must be list but is {type(tokens)}")

        return ann_ids, image_ids, input_ids, attention_mask, token_type_ids, tokens

    def clone(self) -> HFLmTokenClassifierBase:
        """Returns a new instance of the same class, created from the stored paths, categories and settings."""
        return self.__class__(
            self.path_config,
            self.path_weights,
            self.categories.categories_semantics,
            self.categories.categories_bio,
            self.categories.get_categories(),
            self.device,
            self.use_xlm_tokenizer,
        )

    @staticmethod
    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
        """
        Returns the name of the model

        Args:
            path_weights: Path to model weights. Only its last two path components enter the name.
            architecture: Architecture name

        Returns:
            `Transformers_<architecture>_` followed by the last two path components joined with `_`.
        """
        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])

    @staticmethod
    def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
        """
        A refinement for adding the tokenizer class name to the model configs.

        Args:
            model_class_name: The model name, e.g. `model.__class__.__name__`
            use_xlm_tokenizer: Whether to use a `XLM` tokenizer.

        Returns:
            The name of the tokenizer class.
        """
        tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
        return tokenizer.__class__.__name__

    @staticmethod
    def image_to_raw_features_mapping() -> str:
        """Returns the mapping function to convert images into raw features."""
        return "image_to_raw_lm_features"

    @staticmethod
    def image_to_features_mapping() -> str:
        """Returns the mapping function to convert images into features."""
        return "image_to_lm_features"
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
class HFLmTokenClassifier(HFLmTokenClassifierBase):
    """
    A wrapper class for `transformers.XLMRobertaForTokenClassification` and similar models to use within a pipeline
    component. Check <https://huggingface.co/docs/transformers/model_doc/xlm-roberta> for documentation of the
    model itself.
    Note that this model is equipped with a head that is only useful for classifying the tokens. For sequence
    classification and other things please use another model of the family.

    Example:
        ```python
        # setting up compulsory ocr service
        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
        tess = TesseractOcrDetector(tesseract_config_path)
        ocr_service = TextExtractionService(tess)

        # hf tokenizer and token classifier
        tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
        roberta = HFLmTokenClassifier("path/to/config.json", "path/to/model.bin",
                                      categories={1: "first_name", 2: "surname", 3: "street"})

        # token classification service
        roberta_service = LMTokenClassifierService(tokenizer, roberta)

        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, roberta_service])

        path = "path/to/some/form"
        df = pipe.analyze(path=path)

        for dp in df:
            ...
        ```
    """

    def __init__(
        self,
        path_config_json: PathLikeOrStr,
        path_weights: PathLikeOrStr,
        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
        categories_bio: Optional[Sequence[TypeOrStr]] = None,
        categories: Optional[Mapping[int, TypeOrStr]] = None,
        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
        use_xlm_tokenizer: bool = True,
    ):
        """
        Args:
            path_config_json: path to `.json` config file
            path_weights: path to model artifact
            categories_semantics: Sequence of category names for the NER semantics, i.e. the entities themselves.
            categories_bio: Sequence of category names for the `NER` tags (i.e. BIO).
            categories: If you have a pre-trained model you can pass a complete dict of NER categories
            device: The device (cpu,"cuda"), where to place the model.
            use_xlm_tokenizer: Do not change this value unless you pre-trained a bert-like model with a different
                               Tokenizer.
        """
        super().__init__(
            path_config_json=path_config_json,
            path_weights=path_weights,
            categories_semantics=categories_semantics,
            categories_bio=categories_bio,
            categories=categories,
            device=device,
            use_xlm_tokenizer=use_xlm_tokenizer,
        )
        self.name = self.get_name(path_weights, "bert-like-token-classification")
        self.model_id = self.get_model_id()
        self.model = self.get_wrapped_model(path_config_json, path_weights)
        self.model.to(self.device)
        # Record the matching tokenizer class on the model config.
        tokenizer_class_name = self.get_tokenizer_class_name(self.model.__class__.__name__, self.use_xlm_tokenizer)
        self.model.config.tokenizer_class = tokenizer_class_name

    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
        """
        Launch inference on bert-like models for token classification. Pass the following arguments

        Args:
            encodings: ann_ids: For each sequence the uuids belonging to the token positions
                       input_ids: Token converted to ids, taken from the tokenizer
                       attention_mask: The associated attention masks from padded sequences, taken from the
                                       tokenizer
                       token_type_ids: Torch tensor of token type ids, taken from the tokenizer
                       tokens: List of original tokens, taken from the tokenizer

        Returns:
            A list of `TokenClassResult`s
        """

        validated = self._validate_encodings(**encodings)
        ann_ids, _, input_ids, attention_mask, token_type_ids, tokens = validated
        raw_results = predict_token_classes_from_lm(
            uuids=ann_ids,
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            tokens=tokens,
            model=self.model,
        )
        return self._map_category_names(raw_results)

    @staticmethod
    def get_wrapped_model(
        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
    ) -> XLMRobertaForTokenClassification:
        """
        Get the inner (wrapped) model.

        Args:
            path_config_json: path to .json config file
            path_weights: path to model artifact

        Returns:
            `nn.Module`
        """
        config_path = os.fspath(path_config_json)
        weights_path = os.fspath(path_weights)
        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=config_path)
        return XLMRobertaForTokenClassification.from_pretrained(
            pretrained_model_name_or_path=weights_path, config=config
        )

    def clear_model(self) -> None:
        """Drops the reference to the wrapped model by setting `self.model` to `None`."""
        self.model = None
|
|
372
|
+
|
|
373
|
+
|
|
69
374
|
class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
|
|
70
375
|
"""
|
|
71
376
|
Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
|
|
@@ -208,10 +513,11 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
|
|
|
208
513
|
use_xlm_tokenizer: bool = True,
|
|
209
514
|
):
|
|
210
515
|
super().__init__(path_config_json, path_weights, categories, device)
|
|
211
|
-
self.name = self.get_name(path_weights, "bert-like")
|
|
516
|
+
self.name = self.get_name(path_weights, "bert-like-sequence-classification")
|
|
212
517
|
self.model_id = self.get_model_id()
|
|
213
518
|
self.model = self.get_wrapped_model(path_config_json, path_weights)
|
|
214
519
|
self.model.to(self.device)
|
|
520
|
+
self.use_xlm_tokenizer = use_xlm_tokenizer
|
|
215
521
|
self.model.config.tokenizer_class = self.get_tokenizer_class_name(
|
|
216
522
|
self.model.__class__.__name__, use_xlm_tokenizer
|
|
217
523
|
)
|
|
@@ -219,7 +525,7 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
|
|
|
219
525
|
def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
|
|
220
526
|
input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)
|
|
221
527
|
|
|
222
|
-
result =
|
|
528
|
+
result = predict_sequence_classes_from_lm(
|
|
223
529
|
input_ids,
|
|
224
530
|
attention_mask,
|
|
225
531
|
token_type_ids,
|
|
@@ -262,3 +568,122 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
|
|
|
262
568
|
|
|
263
569
|
def clear_model(self) -> None:
|
|
264
570
|
self.model = None
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
class HFLmLanguageDetector(LanguageDetector):
    """
    Language detector using HuggingFace's `XLMRobertaForSequenceClassification`.

    This class wraps a multilingual sequence classification model (XLMRobertaForSequenceClassification)
    for language detection tasks. Input text is tokenized and truncated/padded to a maximum length of 512 tokens.
    The prediction returns a `DetectionResult` containing the detected language code and its confidence score.
    """

    def __init__(
        self,
        path_config_json: PathLikeOrStr,
        path_weights: PathLikeOrStr,
        categories: Mapping[int, TypeOrStr],
        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
        use_xlm_tokenizer: bool = True,
    ):
        """
        Args:
            path_config_json: path to `.json` config file
            path_weights: path to model artifact
            categories: A dict mapping category ids to languages. `predict` looks up the model class index
                        shifted by one.
            device: The device (cpu,"cuda"), where to place the model.
            use_xlm_tokenizer: Stored on the instance and passed on by `clone`. The tokenizer itself is always
                               loaded from `xlm-roberta-base`.
        """
        super().__init__()
        self.path_config = Path(path_config_json)
        self.path_weights = Path(path_weights)
        self.categories = ModelCategories(init_categories=categories)
        self.device = get_torch_device(device)
        self.use_xlm_tokenizer = use_xlm_tokenizer
        self.model = self.get_wrapped_model(path_config_json, path_weights)
        self.model.to(self.device)
        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
        self.name = self.get_name(path_weights, "bert-like-language-detection")
        self.model_id = self.get_model_id()

    def predict(self, text_string: str) -> DetectionResult:
        """
        Predict the language of the input sequence.

        Args:
            text_string: The input text sequence to classify.

        Returns:
            DetectionResult: The detected language and its confidence score.
        """
        encoding = self.tokenizer(
            text_string,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        input_ids = encoding["input_ids"].to(self.device)
        attention_mask = encoding["attention_mask"].to(self.device)
        token_type_ids = encoding.get("token_type_ids")
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(self.device)
        else:
            # The tokenizer did not return token type ids: fall back to all zeros.
            token_type_ids = torch.zeros_like(input_ids)

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
            probs = torch.softmax(outputs.logits, dim=-1)
            score, class_id_tensor = torch.max(probs, dim=-1)
            # The categories are looked up with the model class index shifted by one.
            class_id = int(class_id_tensor.item() + 1)
            lang = self.categories.categories[class_id]

        return DetectionResult(class_name=lang, score=float(score.item()))

    def clear_model(self) -> None:
        """Drops the reference to the wrapped model by setting `self.model` to `None`."""
        self.model = None

    @classmethod
    def get_requirements(cls) -> list[Requirement]:
        """Returns the pytorch and the transformers requirement."""
        return [get_pytorch_requirement(), get_transformers_requirement()]

    @staticmethod
    def get_wrapped_model(
        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
    ) -> XLMRobertaForSequenceClassification:
        """
        Get the inner (wrapped) model.

        Args:
            path_config_json: path to .json config file
            path_weights: path to model artifact

        Returns:
            `XLMRobertaForSequenceClassification`
        """
        # Convert path-like objects to plain strings, as `HFLmTokenClassifier.get_wrapped_model` does.
        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
        return XLMRobertaForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=os.fspath(path_weights), config=config
        )

    def clone(self) -> HFLmLanguageDetector:
        """Returns a new instance of the same class, created from the stored paths, categories and settings."""
        return self.__class__(
            self.path_config, self.path_weights, self.categories.get_categories(), self.device, self.use_xlm_tokenizer
        )

    @staticmethod
    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
        """
        Returns the name of the model

        Args:
            path_weights: Path to model weights
            architecture: Architecture name

        Returns:
            str: Model name
        """
        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
if TYPE_CHECKING:
    # Aliases that only exist for static type checkers; they are never evaluated at runtime.
    # Each alias currently names a single model class.
    LmTokenModels: TypeAlias = HFLmTokenClassifier
    LmSequenceModels: TypeAlias = HFLmSequenceClassifier
|
|
@@ -806,17 +806,17 @@ def image_to_raw_lm_features(
|
|
|
806
806
|
raw_features["image_id"] = page.image_id
|
|
807
807
|
raw_features["width"] = page.width
|
|
808
808
|
raw_features["height"] = page.height
|
|
809
|
-
raw_features["ann_ids"] = text_
|
|
810
|
-
raw_features["words"] = text_
|
|
809
|
+
raw_features["ann_ids"] = text_.ann_ids
|
|
810
|
+
raw_features["words"] = text_.words
|
|
811
811
|
# We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
|
|
812
812
|
# raw_features_to_layoutlm_features
|
|
813
|
-
raw_features["bbox"] = [_CLS_BOX] * len(text_
|
|
813
|
+
raw_features["bbox"] = [_CLS_BOX] * len(text_.words)
|
|
814
814
|
raw_features["dataset_type"] = dataset_type
|
|
815
815
|
|
|
816
|
-
if use_token_tag and text_
|
|
817
|
-
raw_features["labels"] = text_
|
|
818
|
-
elif text_
|
|
819
|
-
raw_features["labels"] = text_
|
|
816
|
+
if use_token_tag and text_.token_tags:
|
|
817
|
+
raw_features["labels"] = text_.token_tags
|
|
818
|
+
elif text_.token_classes:
|
|
819
|
+
raw_features["labels"] = text_.token_classes
|
|
820
820
|
elif page.document_type is not None:
|
|
821
821
|
document_type_id = page.image_orig.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id - 1
|
|
822
822
|
raw_features["labels"] = [document_type_id]
|
deepdoctection/pipe/language.py
CHANGED
|
@@ -21,7 +21,7 @@ Module for language detection pipeline component
|
|
|
21
21
|
from typing import Optional, Sequence
|
|
22
22
|
|
|
23
23
|
from ..datapoint.image import Image, MetaAnnotation
|
|
24
|
-
from ..datapoint.view import
|
|
24
|
+
from ..datapoint.view import IMAGE_DEFAULTS, Page
|
|
25
25
|
from ..extern.base import LanguageDetector, ObjectDetector
|
|
26
26
|
from ..utils.error import ImageError
|
|
27
27
|
from ..utils.settings import PageType, TypeOrStr, get_type
|
|
@@ -75,11 +75,11 @@ class LanguageDetectionService(PipelineComponent):
|
|
|
75
75
|
|
|
76
76
|
self.predictor = language_detector
|
|
77
77
|
self.text_detector = text_detector
|
|
78
|
-
self.text_container = get_type(text_container) if text_container is not None else
|
|
78
|
+
self.text_container = get_type(text_container) if text_container is not None else IMAGE_DEFAULTS.TEXT_CONTAINER
|
|
79
79
|
self.floating_text_block_categories = (
|
|
80
80
|
tuple(get_type(text_block) for text_block in floating_text_block_categories)
|
|
81
81
|
if (floating_text_block_categories is not None)
|
|
82
|
-
else
|
|
82
|
+
else IMAGE_DEFAULTS.FLOATING_TEXT_BLOCK_CATEGORIES
|
|
83
83
|
)
|
|
84
84
|
|
|
85
85
|
super().__init__(self._get_name(self.predictor.name))
|
|
@@ -109,7 +109,7 @@ class LanguageDetectionService(PipelineComponent):
|
|
|
109
109
|
text = " ".join((result.text for result in detect_result_list if result.text is not None))
|
|
110
110
|
predict_result = self.predictor.predict(text)
|
|
111
111
|
self.dp_manager.set_summary_annotation(
|
|
112
|
-
PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.
|
|
112
|
+
PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.class_name, predict_result.score
|
|
113
113
|
)
|
|
114
114
|
|
|
115
115
|
def clone(self) -> PipelineComponent:
|
deepdoctection/pipe/lm.py
CHANGED
|
@@ -20,6 +20,7 @@ Module for token classification pipeline
|
|
|
20
20
|
"""
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
|
+
import inspect
|
|
23
24
|
from copy import copy
|
|
24
25
|
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
|
|
25
26
|
|
|
@@ -32,6 +33,7 @@ from .registry import pipeline_component_registry
|
|
|
32
33
|
|
|
33
34
|
if TYPE_CHECKING:
|
|
34
35
|
from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels
|
|
36
|
+
from ..extern.hflm import LmSequenceModels, LmTokenModels
|
|
35
37
|
|
|
36
38
|
|
|
37
39
|
@pipeline_component_registry.register("LMTokenClassifierService")
|
|
@@ -70,7 +72,7 @@ class LMTokenClassifierService(PipelineComponent):
|
|
|
70
72
|
def __init__(
|
|
71
73
|
self,
|
|
72
74
|
tokenizer: Any,
|
|
73
|
-
language_model: LayoutTokenModels,
|
|
75
|
+
language_model: Union[LayoutTokenModels, LmTokenModels],
|
|
74
76
|
padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
|
|
75
77
|
truncation: bool = True,
|
|
76
78
|
return_overflowing_tokens: bool = False,
|
|
@@ -124,7 +126,7 @@ class LMTokenClassifierService(PipelineComponent):
|
|
|
124
126
|
might not get sent to the model because they are categorized as not
|
|
125
127
|
eligible token (e.g. empty string). If set to `True` it will assign all
|
|
126
128
|
words without token the `BioTag.outside` token.
|
|
127
|
-
segment_positions: Using bounding boxes of
|
|
129
|
+
segment_positions: Using bounding boxes of segments instead of words improves model accuracy
|
|
128
130
|
significantly for models that have been trained on segments rather than words.
|
|
129
131
|
Choose a single or a sequence of layout segments to use their bounding boxes. Note,
|
|
130
132
|
that the layout segments need to have a child-relationship with words. If a word
|
|
@@ -271,6 +273,8 @@ class LMTokenClassifierService(PipelineComponent):
|
|
|
271
273
|
f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
|
|
272
274
|
f"in this framework"
|
|
273
275
|
)
|
|
276
|
+
func_params = inspect.signature(self.mapping_to_lm_input_func).parameters
|
|
277
|
+
self.required_kwargs = {k: v for k, v in self.required_kwargs.items() if k in func_params}
|
|
274
278
|
|
|
275
279
|
@staticmethod
|
|
276
280
|
def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
|
|
@@ -318,7 +322,7 @@ class LMSequenceClassifierService(PipelineComponent):
|
|
|
318
322
|
def __init__(
|
|
319
323
|
self,
|
|
320
324
|
tokenizer: Any,
|
|
321
|
-
language_model: LayoutSequenceModels,
|
|
325
|
+
language_model: Union[LayoutSequenceModels, LmSequenceModels],
|
|
322
326
|
padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
|
|
323
327
|
truncation: bool = True,
|
|
324
328
|
return_overflowing_tokens: bool = False,
|
|
@@ -18,6 +18,7 @@ from types import ModuleType
|
|
|
18
18
|
from typing import Any, Union, no_type_check
|
|
19
19
|
|
|
20
20
|
import importlib_metadata
|
|
21
|
+
import numpy as np
|
|
21
22
|
from packaging import version
|
|
22
23
|
|
|
23
24
|
from .error import DependencyError
|
|
@@ -249,6 +250,39 @@ def get_distance_requirement() -> Requirement:
|
|
|
249
250
|
return "distance", distance_available(), _DISTANCE_ERR_MSG
|
|
250
251
|
|
|
251
252
|
|
|
253
|
+
_NUMPY_V1_ERR_MSG = "numpy v1 must be installed."
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def numpy_v1_available() -> bool:
    """
    Check if the installed NumPy version is version 1.

    The major version is the part of `np.__version__` before the first dot.

    Returns:
        True if the installed NumPy version is 1, otherwise False
    """
    # `np.__version__` is a string, so the major version is compared as a string.
    major_version = np.__version__.split(".", maxsplit=1)[0]
    return major_version == "1"
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def get_numpy_v1_requirement() -> Requirement:
    """
    Build the requirement entry for numpy version 1.

    Returns:
        A tuple of three elements:
        - the requirement name `"numpy v1"`,
        - the result of `numpy_v1_available()`,
        - the error message to show if numpy version 1 is not available.
    """
    is_available = numpy_v1_available()
    return "numpy v1", is_available, _NUMPY_V1_ERR_MSG
|
|
284
|
+
|
|
285
|
+
|
|
252
286
|
# Transformers
|
|
253
287
|
_TRANSFORMERS_AVAILABLE = importlib.util.find_spec("transformers") is not None
|
|
254
288
|
_TRANSFORMERS_ERR_MSG = f"transformers must be installed. {_GENERIC_ERR_MSG}"
|
deepdoctection/utils/settings.py
CHANGED
|
@@ -108,6 +108,7 @@ class DocumentType(ObjectTypes):
|
|
|
108
108
|
GOVERNMENT_TENDERS = "government_tenders"
|
|
109
109
|
MANUALS = "manuals"
|
|
110
110
|
PATENTS = "patents"
|
|
111
|
+
BANK_STATEMENT = "bank_statement"
|
|
111
112
|
|
|
112
113
|
|
|
113
114
|
@object_types_registry.register("LayoutType")
|
|
@@ -296,6 +297,7 @@ class Languages(ObjectTypes):
|
|
|
296
297
|
BOSNIAN = "bos"
|
|
297
298
|
NORWEGIAN_NOVOSIBIRSK = "nno"
|
|
298
299
|
URDU = "urd"
|
|
300
|
+
SWAHILI = "swa"
|
|
299
301
|
NOT_DEFINED = "nn"
|
|
300
302
|
|
|
301
303
|
|
deepdoctection/utils/types.py
CHANGED
|
@@ -70,7 +70,6 @@ AnnotationDict: TypeAlias = dict[str, Any]
|
|
|
70
70
|
ImageDict: TypeAlias = dict[str, Any]
|
|
71
71
|
|
|
72
72
|
# We use these types for output types of the Page object
|
|
73
|
-
Text_: TypeAlias = dict[str, Any]
|
|
74
73
|
HTML: TypeAlias = str
|
|
75
74
|
csv: TypeAlias = list[list[str]]
|
|
76
75
|
Chunks: TypeAlias = list[tuple[str, str, int, str, str, str, str]]
|