deepdoctection: 0.30-py3-none-any.whl → 0.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of deepdoctection might be problematic.
- deepdoctection/__init__.py +38 -29
- deepdoctection/analyzer/dd.py +36 -29
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +4 -3
- deepdoctection/dataflow/custom_serialize.py +14 -5
- deepdoctection/dataflow/parallel_map.py +12 -11
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/datapoint/annotation.py +35 -13
- deepdoctection/datapoint/box.py +3 -5
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +79 -36
- deepdoctection/datapoint/view.py +152 -49
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +6 -3
- deepdoctection/datasets/base.py +86 -11
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +4 -4
- deepdoctection/datasets/instances/doclaynet.py +3 -2
- deepdoctection/datasets/instances/fintabnet.py +2 -1
- deepdoctection/datasets/instances/funsd.py +2 -1
- deepdoctection/datasets/instances/iiitar13k.py +5 -2
- deepdoctection/datasets/instances/layouttest.py +4 -8
- deepdoctection/datasets/instances/publaynet.py +2 -2
- deepdoctection/datasets/instances/pubtables1m.py +6 -3
- deepdoctection/datasets/instances/pubtabnet.py +2 -1
- deepdoctection/datasets/instances/rvlcdip.py +2 -1
- deepdoctection/datasets/instances/xfund.py +2 -1
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +1 -1
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/cocometric.py +2 -1
- deepdoctection/eval/eval.py +19 -15
- deepdoctection/eval/tedsmetric.py +14 -11
- deepdoctection/eval/tp_eval_callback.py +14 -7
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +182 -90
- deepdoctection/extern/deskew.py +36 -9
- deepdoctection/extern/doctrocr.py +265 -83
- deepdoctection/extern/fastlang.py +49 -9
- deepdoctection/extern/hfdetr.py +106 -55
- deepdoctection/extern/hflayoutlm.py +441 -122
- deepdoctection/extern/hflm.py +225 -0
- deepdoctection/extern/model.py +56 -47
- deepdoctection/extern/pdftext.py +10 -5
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +27 -18
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +6 -2
- deepdoctection/extern/tp/tfutils.py +43 -9
- deepdoctection/extern/tp/tpcompat.py +14 -11
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +54 -30
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/d2struct.py +9 -7
- deepdoctection/mapper/hfstruct.py +7 -2
- deepdoctection/mapper/laylmstruct.py +164 -21
- deepdoctection/mapper/maputils.py +16 -3
- deepdoctection/mapper/misc.py +6 -3
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/common.py +23 -13
- deepdoctection/pipe/concurrency.py +2 -1
- deepdoctection/pipe/doctectionpipe.py +2 -2
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +6 -3
- deepdoctection/pipe/lm.py +34 -66
- deepdoctection/pipe/order.py +142 -35
- deepdoctection/pipe/refine.py +26 -24
- deepdoctection/pipe/segment.py +21 -16
- deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -9
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +36 -28
- deepdoctection/train/hf_detr_train.py +26 -17
- deepdoctection/train/hf_layoutlm_train.py +133 -111
- deepdoctection/train/tp_frcnn_train.py +21 -19
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +2 -2
- deepdoctection/utils/env_info.py +41 -84
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +4 -15
- deepdoctection/utils/fs.py +7 -7
- deepdoctection/utils/logger.py +1 -0
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +5 -4
- deepdoctection/utils/settings.py +6 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +48 -5
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
- deepdoctection-0.32.dist-info/RECORD +146 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
- deepdoctection-0.30.dist-info/RECORD +0 -143
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,7 @@
 """
 HF Layoutlm model for diverse downstream tasks.
 """
+from __future__ import annotations
 
 from abc import ABC
 from collections import defaultdict
@@ -26,14 +27,10 @@ from pathlib import Path
 from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union
 
 import numpy as np
+from lazy_imports import try_import
 
 from ..utils.detection_types import JsonDict, Requirement
-from ..utils.file_utils import (
-    get_pytorch_requirement,
-    get_transformers_requirement,
-    pytorch_available,
-    transformers_available,
-)
+from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
 from ..utils.settings import (
     BioTag,
     ObjectTypes,
@@ -44,39 +41,85 @@ from ..utils.settings import (
     token_class_with_tag_to_token_class_and_tag,
 )
 from .base import LMSequenceClassifier, LMTokenClassifier, SequenceClassResult, TokenClassResult
-from .pt.ptutils import set_torch_auto_device
+from .pt.ptutils import get_torch_device
 
-if pytorch_available():
+with try_import() as pt_import_guard:
     import torch
     import torch.nn.functional as F
-    from torch import Tensor  # pylint: disable=W0611
 
-if transformers_available():
+with try_import() as tr_import_guard:
     from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # type: ignore
     from transformers import (
         LayoutLMForSequenceClassification,
         LayoutLMForTokenClassification,
+        LayoutLMTokenizerFast,
         LayoutLMv2Config,
         LayoutLMv2ForSequenceClassification,
         LayoutLMv2ForTokenClassification,
         LayoutLMv3Config,
         LayoutLMv3ForSequenceClassification,
         LayoutLMv3ForTokenClassification,
+        LiltForSequenceClassification,
+        LiltForTokenClassification,
         PretrainedConfig,
+        RobertaTokenizerFast,
+        XLMRobertaTokenizerFast,
     )
 
 
+def get_tokenizer_from_model_class(model_class: str, use_xlm_tokenizer: bool) -> Any:
+    """
+    We do not use the tokenizer for a particular model that the transformer library provides. Thie mapping therefore
+    returns the tokenizer that should be used for a particular model.
+
+    :param model_class: The model as stated in the transformer library.
+    :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
+    :return: Tokenizer instance to use.
+    """
+    return {
+        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("LiltForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LiltForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("LiltForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LiltForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("XLMRobertaForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained(
+            "FacebookAI/xlm-roberta-base"
+        ),
+    }[(model_class, use_xlm_tokenizer)]
+
+
 def predict_token_classes(
     uuids: List[List[str]],
-    input_ids: "torch.Tensor",
-    attention_mask: "torch.Tensor",
-    token_type_ids: "torch.Tensor",
-    boxes: "torch.Tensor",
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    boxes: torch.Tensor,
     tokens: List[List[str]],
-    model: Union[
-        LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification, LayoutLMv3ForTokenClassification
-    ],
-    images: Optional["Tensor"] = None,
+    model: Union[LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification, LayoutLMv3ForTokenClassification],
+    images: Optional[torch.Tensor] = None,
 ) -> List[TokenClassResult]:
     """
     :param uuids: A list of uuids that correspond to a word that induces the resulting token
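Since a LayoutXLM checkpoint still reports the LayoutLMv2 architecture, the new `use_xlm_tokenizer` flag is the only thing that switches the tokenizer family. A minimal sketch of the lookup above, assuming deepdoctection 0.32 and transformers are installed and the tokenizer files can be downloaded:

```python
# Sketch only - exercises get_tokenizer_from_model_class as defined in the hunk above.
from deepdoctection.extern.hflayoutlm import get_tokenizer_from_model_class

tok_default = get_tokenizer_from_model_class("LayoutLMv2ForTokenClassification", False)
tok_xlm = get_tokenizer_from_model_class("LayoutLMv2ForTokenClassification", True)
print(tok_default.__class__.__name__)  # LayoutLMTokenizerFast
print(tok_xlm.__class__.__name__)      # XLMRobertaTokenizerFast
```

Note that the dict literal is evaluated eagerly, so each call instantiates every tokenizer in the table before indexing it.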
@@ -129,26 +172,28 @@ def predict_token_classes(
 
 
 def predict_sequence_classes(
-    input_ids: "torch.Tensor",
-    attention_mask: "torch.Tensor",
-    token_type_ids: "torch.Tensor",
-    boxes: "torch.Tensor",
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    boxes: torch.Tensor,
     model: Union[
-        LayoutLMForSequenceClassification,
-        LayoutLMv2ForSequenceClassification,
-        LayoutLMv3ForSequenceClassification,
+        LayoutLMForSequenceClassification,
+        LayoutLMv2ForSequenceClassification,
+        LayoutLMv3ForSequenceClassification,
+        LiltForSequenceClassification,
     ],
-    images: Optional["Tensor"] = None,
+    images: Optional[torch.Tensor] = None,
 ) -> SequenceClassResult:
     """
     :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
     :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
     :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
     :param boxes: Torch tensor of bounding boxes of type 'xyxy'
-    :param model: layoutlm model for
+    :param model: layoutlm model for sequence classification
     :param images: A list of torch image tensors or None
     :return: SequenceClassResult
     """
+
     if images is None:
         outputs = model(input_ids=input_ids, bbox=boxes, attention_mask=attention_mask, token_type_ids=token_type_ids)
     elif isinstance(model, LayoutLMv2ForSequenceClassification):
@@ -177,7 +222,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
     Abstract base class for wrapping LayoutLM models for token classification into the deepdoctection framework.
     """
 
-    model: Union[LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification, LayoutLMv3ForTokenClassification]
+    model: Union[LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification]
 
     def __init__(
         self,
@@ -186,7 +231,8 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -198,9 +244,10 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
                                consistent with detectors use only values>0. Conversion will be done internally.
         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
         :param device: The device (cpu,"cuda"), where to place the model.
+        :param use_xlm_tokenizer: True if one uses the LayoutXLM or a lilt model built with a xlm language model, e.g.
+                                  info-xlm or roberta-xlm. (LayoutXLM cannot be distinguished from LayoutLMv2).
         """
 
-        self.name = "_".join(Path(path_weights).parts[-3:])
         if categories is None:
             if categories_semantics is None:
                 raise ValueError("If categories is None then categories_semantics cannot be None")
@@ -219,11 +266,9 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             self.categories = self._categories_orig_to_categories(
                 self.categories_semantics, self.categories_bio  # type: ignore
             )
-        if device is not None:
-            self.device = device
-        else:
-            self.device = set_torch_auto_device()
+        self.device = get_torch_device(device)
         self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)
 
     @classmethod
     def get_requirements(cls) -> List[Requirement]:
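The four-line device branch above collapses into a single call to `get_torch_device`, whose signature now also admits a `torch.device`. A hedged sketch of the accepted inputs; that `None` still auto-selects CUDA when available is an assumption carried over from the `set_torch_auto_device` fallback it replaces:

```python
# Sketch only - assumes deepdoctection 0.32 with PyTorch installed.
import torch

from deepdoctection.extern.pt.ptutils import get_torch_device

dev_auto = get_torch_device(None)                # assumed: "cuda" if available, else "cpu"
dev_str = get_torch_device("cpu")                # string literals work as in 0.30
dev_obj = get_torch_device(torch.device("cpu"))  # torch.device instances are new in 0.32
print(dev_auto, dev_str, dev_obj)
```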
@@ -257,9 +302,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
 
     def _validate_encodings(
         self, **encodings: Any
-    ) -> Tuple[
-        List[List[str]], List[str], "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", List[List[str]]
-    ]:
+    ) -> Tuple[List[List[str]], List[str], torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, List[List[str]]]:
         image_ids = encodings.get("image_ids", [])
         ann_ids = encodings.get("ann_ids")
         input_ids = encodings.get("input_ids")
@@ -292,7 +335,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
 
         return ann_ids, image_ids, input_ids, attention_mask, token_type_ids, boxes, tokens
 
-    def clone(self) -> "HFLayoutLmTokenClassifierBase":
+    def clone(self) -> HFLayoutLmTokenClassifierBase:
         return self.__class__(
             self.path_config,
             self.path_weights,
@@ -302,6 +345,29 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             self.device,
         )
 
+    @staticmethod
+    def get_name(path_weights: str, architecture: str) -> str:
+        """Returns the name of the model"""
+        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+    def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+        """A refinement for adding the tokenizer class name to the model configs.
+
+        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+        """
+        tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_layoutlm_features"
+
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_layoutlm_features"
+
 
 class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
     """
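`get_name` replaces the old `self.name = "_".join(Path(path_weights).parts[-3:])` and prefixes the architecture while keeping only the last two path components. The method body is pure string handling, so it can be checked in isolation; the weight path below is made up:

```python
from pathlib import Path

def get_name(path_weights: str, architecture: str) -> str:
    # body copied from the staticmethod in the hunk above
    return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])

print(get_name("weights/layoutlm-base-uncased/pytorch_model.bin", "LayoutLM"))
# Transformers_LayoutLM_layoutlm-base-uncased_pytorch_model.bin
```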
@@ -344,7 +410,8 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -356,14 +423,17 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
                                consistent with detectors use only values>0. Conversion will be done internally.
         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
         :param device: The device (cpu,"cuda"), where to place the model.
+        :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
+                                  Tokenizer.
         """
-        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
-        self.model = LayoutLMForTokenClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
+        self.name = self.get_name(path_weights, "LayoutLM")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
         )
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments
 
@@ -388,6 +458,18 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
 
         return self._map_category_names(results)
 
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LayoutLMForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+
 
 class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
     """
@@ -432,7 +514,8 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -444,14 +527,17 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
                                consistent with detectors use only values>0. Conversion will be done internally.
         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
         :param device: The device (cpu,"cuda"), where to place the model.
+        :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
+                                  default value.
         """
-        config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-        self.model = LayoutLMv2ForTokenClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
+        self.name = self.get_name(path_weights, "LayoutLMv2")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
         )
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments
 
@@ -489,6 +575,20 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         """
         return {"image_width": 224, "image_height": 224}
 
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LayoutLMv2ForTokenClassification.from_pretrained(
+            pretrained_model_name_or_path=path_weights, config=config
+        )
+
 
 class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
     """
@@ -533,7 +633,8 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -545,14 +646,17 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
                                consistent with detectors use only values>0. Conversion will be done internally.
         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
         :param device: The device (cpu,"cuda"), where to place the model.
+        :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLMv3 model with a different
+                                  tokenizer.
         """
-        config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-        self.model = LayoutLMv3ForTokenClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
+        self.name = self.get_name(path_weights, "LayoutLMv3")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
        )
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments
 
@@ -592,77 +696,54 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
             "pixel_std": np.array(IMAGENET_DEFAULT_STD, dtype=np.float32),
         }
 
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LayoutLMv3ForTokenClassification.from_pretrained(
+            pretrained_model_name_or_path=path_weights, config=config
+        )
+
 
 class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     """
     Abstract base class for wrapping LayoutLM models for sequence classification into the deepdoctection framework.
     """
 
-    model: Union[LayoutLMForSequenceClassification, LayoutLMv2ForSequenceClassification, LayoutLMv3ForSequenceClassification]
+    model: Union[LayoutLMForSequenceClassification, LayoutLMv2ForSequenceClassification]
 
     def __init__(
         self,
         path_config_json: str,
         path_weights: str,
         categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
-        self.name = "_".join(Path(path_weights).parts[-3:])
         self.path_config = path_config_json
         self.path_weights = path_weights
         self.categories = copy(categories)  # type: ignore
 
-        if device is not None:
-            self.device = device
-        else:
-            self.device = set_torch_auto_device()
+        self.device = get_torch_device(device)
         self.model.to(self.device)
-
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
-        input_ids = encodings.get("input_ids")
-        attention_mask = encodings.get("attention_mask")
-        token_type_ids = encodings.get("token_type_ids")
-        boxes = encodings.get("bbox")
-
-        if isinstance(input_ids, torch.Tensor):
-            input_ids = input_ids.to(self.device)
-        else:
-            raise ValueError(f"input_ids must be list but is {type(input_ids)}")
-        if isinstance(attention_mask, torch.Tensor):
-            attention_mask = attention_mask.to(self.device)
-        else:
-            raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
-        if isinstance(token_type_ids, torch.Tensor):
-            token_type_ids = token_type_ids.to(self.device)
-        else:
-            raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
-        if isinstance(boxes, torch.Tensor):
-            boxes = boxes.to(self.device)
-        else:
-            raise ValueError(f"boxes must be list but is {type(boxes)}")
-
-        result = predict_sequence_classes(
-            input_ids,
-            attention_mask,
-            token_type_ids,
-            boxes,
-            self.model,
-        )
-
-        result.class_id += 1
-        result.class_name = self.categories[str(result.class_id)]
-        return result
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)
 
     @classmethod
     def get_requirements(cls) -> List[Requirement]:
         return [get_pytorch_requirement(), get_transformers_requirement()]
 
-    def clone(self) -> "HFLayoutLmSequenceClassifierBase":
+    def clone(self) -> HFLayoutLmSequenceClassifierBase:
         return self.__class__(self.path_config, self.path_weights, self.categories, self.device)
 
     def _validate_encodings(
-        self, **encodings: Union[List[List[str]], "torch.Tensor"]
-    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        self, **encodings: Union[List[List[str]], torch.Tensor]
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         input_ids = encodings.get("input_ids")
         attention_mask = encodings.get("attention_mask")
         token_type_ids = encodings.get("token_type_ids")
@@ -691,6 +772,29 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
             boxes = boxes.to(self.device)
         return input_ids, attention_mask, token_type_ids, boxes
 
+    @staticmethod
+    def get_name(path_weights: str, architecture: str) -> str:
+        """Returns the name of the model"""
+        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+    def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+        """A refinement for adding the tokenizer class name to the model configs.
+
+        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+        """
+        tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_layoutlm_features"
+
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_layoutlm_features"
+
 
 class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
     """
@@ -728,15 +832,15 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         path_config_json: str,
         path_weights: str,
         categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
-        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
-        self.model = LayoutLMForSequenceClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
-        )
-        super().__init__(path_config_json, path_weights, categories, device)
+        self.name = self.get_name(path_weights, "LayoutLM")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
 
         result = predict_sequence_classes(
@@ -751,6 +855,20 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         result.class_name = self.categories[str(result.class_id)]
         return result
 
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LayoutLMForSequenceClassification.from_pretrained(
+            pretrained_model_name_or_path=path_weights, config=config
+        )
+
 
 class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
     """
@@ -788,15 +906,15 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         path_config_json: str,
         path_weights: str,
         categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
-        config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-        self.model = LayoutLMv2ForSequenceClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
-        )
-        super().__init__(path_config_json, path_weights, categories, device)
+        self.name = self.get_name(path_weights, "LayoutLMv2")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
         images = encodings.get("image")
         if isinstance(images, torch.Tensor):
@@ -818,6 +936,20 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         """
         return {"image_width": 224, "image_height": 224}
 
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LayoutLMv2ForSequenceClassification.from_pretrained(
+            pretrained_model_name_or_path=path_weights, config=config
+        )
+
 
 class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
     """
@@ -855,15 +987,15 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         path_config_json: str,
         path_weights: str,
         categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
-        config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-        self.model = LayoutLMv3ForSequenceClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
-        )
-        super().__init__(path_config_json, path_weights, categories, device)
+        self.name = self.get_name(path_weights, "LayoutLMv3")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
         images = encodings.get("pixel_values")
         if isinstance(images, torch.Tensor):
@@ -890,3 +1022,190 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
             "pixel_mean": np.array(IMAGENET_DEFAULT_MEAN, dtype=np.float32),
             "pixel_std": np.array(IMAGENET_DEFAULT_STD, dtype=np.float32),
         }
+
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LayoutLMv3ForSequenceClassification.from_pretrained(
+            pretrained_model_name_or_path=path_weights, config=config
+        )
+
+
+class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
+    """
+    A wrapper class for `transformers.LiltForTokenClassification` to use within a pipeline component.
+    Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+    Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
+    classification and other things please use another model of the family.
+
+    **Example**
+
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+
+        # hf tokenizer and token classifier
+        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+        lilt = HFLiltTokenClassifier("path/to/config.json","path/to/model.bin",
+                                     categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                  'E-header', 'E-question', 'I-answer', 'I-header',
+                                                  'I-question', 'O', 'S-answer', 'S-header',
+                                                  'S-question'])
+
+        # token classification service
+        lilt_service = LMTokenClassifierService(tokenizer,lilt)
+
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
+
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+
+        for dp in df:
+            ...
+    """
+
+    def __init__(
+        self,
+        path_config_json: str,
+        path_weights: str,
+        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+        categories_bio: Optional[Sequence[TypeOrStr]] = None,
+        categories: Optional[Mapping[str, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        """
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
+                                     entities self. To be consistent with detectors use only values >0. Conversion will
+                                     be done internally.
+        :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
+                               consistent with detectors use only values>0. Conversion will be done internally.
+        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
+        :param device: The device (cpu,"cuda"), where to place the model.
+        """
+        self.name = self.get_name(path_weights, "LiLT")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
+
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
+        """
+        Launch inference on LayoutLm for token classification. Pass the following arguments
+
+        `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
+
+        `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+
+        `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
+
+        `boxes:` Torch tensor of bounding boxes of type 'xyxy'
+
+        `tokens:` List of original tokens taken from `LayoutLMTokenizer`
+
+        :return: A list of TokenClassResults
+        """
+
+        ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
+
+        results = predict_token_classes(
+            ann_ids, input_ids, attention_mask, token_type_ids, boxes, tokens, self.model, None
+        )
+
+        return self._map_category_names(results)
+
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LiltForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+
+
+class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
+    """
+    A wrapper class for `transformers.LiLTForSequenceClassification` to use within a pipeline component.
+    Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+    Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
+    classification and other things please use another model of the family.
+
+    **Example**
+
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+
+        # hf tokenizer and sequence classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        lilt = HFLiltSequenceClassifier("path/to/config.json",
+                                        "path/to/model.bin",
+                                        categories=["handwritten", "presentation", "resume"])
+
+        # sequence classification service
+        lilt_service = LMSequenceClassifierService(tokenizer,lilt)
+
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
+
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+
+        for dp in df:
+            ...
+    """
+
+    def __init__(
+        self,
+        path_config_json: str,
+        path_weights: str,
+        categories: Mapping[str, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        self.name = self.get_name(path_weights, "LiLT")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
+
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
+        input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
+
+        result = predict_sequence_classes(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            boxes,
+            self.model,
+        )
+
+        result.class_id += 1
+        result.class_name = self.categories[str(result.class_id)]
+        return result
+
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LiltForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)