deepdoctection-0.31-py3-none-any.whl → deepdoctection-0.33-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +16 -29
- deepdoctection/analyzer/dd.py +70 -59
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +41 -56
- deepdoctection/datapoint/box.py +9 -8
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +56 -44
- deepdoctection/datapoint/view.py +245 -150
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +35 -26
- deepdoctection/datasets/base.py +14 -12
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +24 -26
- deepdoctection/datasets/instances/doclaynet.py +51 -51
- deepdoctection/datasets/instances/fintabnet.py +46 -46
- deepdoctection/datasets/instances/funsd.py +25 -24
- deepdoctection/datasets/instances/iiitar13k.py +13 -10
- deepdoctection/datasets/instances/layouttest.py +4 -3
- deepdoctection/datasets/instances/publaynet.py +5 -5
- deepdoctection/datasets/instances/pubtables1m.py +24 -21
- deepdoctection/datasets/instances/pubtabnet.py +32 -30
- deepdoctection/datasets/instances/rvlcdip.py +30 -30
- deepdoctection/datasets/instances/xfund.py +26 -26
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +15 -13
- deepdoctection/eval/eval.py +41 -37
- deepdoctection/eval/tedsmetric.py +30 -23
- deepdoctection/eval/tp_eval_callback.py +16 -19
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +85 -113
- deepdoctection/extern/deskew.py +14 -11
- deepdoctection/extern/doctrocr.py +141 -130
- deepdoctection/extern/fastlang.py +27 -18
- deepdoctection/extern/hfdetr.py +71 -62
- deepdoctection/extern/hflayoutlm.py +504 -211
- deepdoctection/extern/hflm.py +230 -0
- deepdoctection/extern/model.py +488 -302
- deepdoctection/extern/pdftext.py +23 -19
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +29 -19
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +18 -18
- deepdoctection/extern/tp/tfutils.py +57 -9
- deepdoctection/extern/tp/tpcompat.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +45 -53
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/cats.py +27 -29
- deepdoctection/mapper/cocostruct.py +10 -10
- deepdoctection/mapper/d2struct.py +27 -26
- deepdoctection/mapper/hfstruct.py +13 -8
- deepdoctection/mapper/laylmstruct.py +178 -37
- deepdoctection/mapper/maputils.py +12 -11
- deepdoctection/mapper/match.py +2 -2
- deepdoctection/mapper/misc.py +11 -9
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +5 -5
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +5 -5
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +12 -14
- deepdoctection/pipe/base.py +52 -106
- deepdoctection/pipe/common.py +72 -59
- deepdoctection/pipe/concurrency.py +16 -11
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +20 -16
- deepdoctection/pipe/lm.py +75 -105
- deepdoctection/pipe/order.py +194 -89
- deepdoctection/pipe/refine.py +111 -124
- deepdoctection/pipe/segment.py +156 -161
- deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +48 -41
- deepdoctection/train/hf_detr_train.py +41 -30
- deepdoctection/train/hf_layoutlm_train.py +153 -135
- deepdoctection/train/tp_frcnn_train.py +32 -31
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +87 -125
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +22 -18
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +16 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +11 -11
- deepdoctection/utils/settings.py +185 -181
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +74 -72
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
- deepdoctection-0.33.dist-info/RECORD +146 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.31.dist-info/RECORD +0 -144
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
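Two entries in the list above amount to an import-path migration: `deepdoctection/utils/detection_types.py` (68 lines) is deleted and `deepdoctection/utils/types.py` (104 lines) is added, and the `lm.py` diff below drops `from ..utils.detection_types import JsonDict` accordingly. A minimal compatibility sketch for code that must run against both versions; that `JsonDict` kept its name in `utils.types` is an assumption inferred from the removed import, not verified here:

```python
# Hedged compatibility shim: `utils.detection_types` existed in 0.31, `utils.types`
# in 0.33. Whether every alias (here: JsonDict) survived the move unchanged is an
# assumption based on the removed import in pipe/lm.py.
try:
    from deepdoctection.utils.types import JsonDict  # 0.33 layout
except ImportError:
    from deepdoctection.utils.detection_types import JsonDict  # 0.31 layout
```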
deepdoctection/pipe/lm.py
CHANGED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# File:
+# File: lm.py
 
 # Copyright 2021 Dr. Janis Meyer. All rights reserved.
 #
@@ -18,60 +18,23 @@
 """
 Module for token classification pipeline
 """
+from __future__ import annotations
 
 from copy import copy
-from typing import Any,
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
 
 from ..datapoint.image import Image
-from ..
-from ..mapper.laylmstruct import image_to_layoutlm_features
-from ..utils.detection_types import JsonDict
-from ..utils.file_utils import transformers_available
+from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
-from .base import
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
-if
-    from
-
-    _ARCHITECTURES_TO_TOKENIZER = {
-        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-    }
-
-
-def get_tokenizer_from_architecture(architecture_name: str, use_xlm_tokenizer: bool) -> Any:
-    """
-    We do not use the tokenizer for a particular model that the transformer library provides. Thie mapping therefore
-    returns the tokenizer that should be used for a particular model.
-
-    :param architecture_name: The model as stated in the transformer library.
-    :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-    :return: Tokenizer instance to use.
-    """
-    return _ARCHITECTURES_TO_TOKENIZER[(architecture_name, use_xlm_tokenizer)]
+if TYPE_CHECKING:
+    from ..extern.hflayoutlm import HfLayoutSequenceModels, HfLayoutTokenModels
 
 
 @pipeline_component_registry.register("LMTokenClassifierService")
-class LMTokenClassifierService(LanguageModelPipelineComponent):
+class LMTokenClassifierService(PipelineComponent):
     """
     Pipeline component for token classification
 
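The removed block above hard-coded a `(architecture, use_xlm_tokenizer) → tokenizer` table and instantiated every tokenizer eagerly at import time; 0.33 instead derives the expected tokenizer from the model config (see the rewritten `_init_sanity_checks` further down). A minimal sketch of the new check, assuming `model.config.tokenizer_class` is populated for the checkpoints deepdoctection wraps; `model` and `tokenizer` stand for any Hugging Face model/tokenizer pair:

```python
# Sketch of the 0.33-style check: compare the tokenizer class name recorded in
# the model config against the tokenizer actually passed in. Nothing is
# instantiated at import time, unlike the removed _ARCHITECTURES_TO_TOKENIZER table.
def check_tokenizer_matches(model, tokenizer) -> None:
    expected = model.config.tokenizer_class  # assumption: set on the wrapped checkpoints
    actual = tokenizer.__class__.__name__
    if expected != actual:
        raise TypeError(f"You want to use {actual} but you should use {expected} in this framework")
```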
@@ -103,7 +66,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model:
+        language_model: HfLayoutTokenModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -147,14 +110,16 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         self.segment_positions = segment_positions
         self.sliding_window_stride = sliding_window_stride
         if self.use_other_as_default_category:
-            categories_name_as_key = {val: key for key, val in self.language_model.categories.items()}
+            categories_name_as_key = {val: key for key, val in self.language_model.categories.categories.items()}
             self.default_key: ObjectTypes
-            if BioTag.
-                self.default_key = BioTag.
+            if BioTag.OUTSIDE in categories_name_as_key:
+                self.default_key = BioTag.OUTSIDE
             else:
-                self.default_key = TokenClasses.
+                self.default_key = TokenClasses.OTHER
             self.other_name_as_key = {self.default_key: categories_name_as_key[self.default_key]}
-
+        self.tokenizer = tokenizer
+        self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), self.language_model.model_id)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -164,7 +129,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             "segment_positions": self.segment_positions,
             "sliding_window_stride": self.sliding_window_stride,
         }
-        self.required_kwargs.update(self.language_model.
+        self.required_kwargs.update(self.language_model.default_kwargs_for_image_to_features_mapping())
         self._init_sanity_checks()
 
     def serve(self, dp: Image) -> None:
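Two `__init__` changes are visible here: `language_model.categories` is now a container whose raw mapping sits one level deeper (`categories.categories`), and the keyword defaults for the feature mapping now come from the model itself via `default_kwargs_for_image_to_features_mapping()`. A sketch of the default-key resolution under the new layout; `language_model` stands for any `HfLayoutTokenModels` instance:

```python
from deepdoctection.utils.settings import BioTag, TokenClasses

def resolve_default_key(language_model):
    # 0.33: the id -> name mapping moved from `categories` to `categories.categories`
    name_as_key = {val: key for key, val in language_model.categories.categories.items()}
    # prefer the BIO outside tag if the model was trained with one, else the OTHER class
    default = BioTag.OUTSIDE if BioTag.OUTSIDE in name_as_key else TokenClasses.OTHER
    return default, name_as_key[default]
```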
@@ -182,7 +147,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             and not token.token.startswith("##")
         ]
 
-        words_populated:
+        words_populated: list[str] = []
         for token in lm_output:
             if token.uuid not in words_populated:
                 if token.class_name == token.semantic_name:
@@ -190,35 +155,37 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             else:
                 token_class_name_id = None
             self.dp_manager.set_category_annotation(
-                token.semantic_name, token_class_name_id, WordType.
+                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid
             )
-            self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.
+            self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.TAG, token.uuid)
             self.dp_manager.set_category_annotation(
-                token.class_name, token.class_id, WordType.
+                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid
             )
             words_populated.append(token.uuid)
 
         if self.use_other_as_default_category:
-            word_anns = dp.get_annotation(LayoutType.
+            word_anns = dp.get_annotation(LayoutType.WORD)
             for word in word_anns:
-                if WordType.
+                if WordType.TOKEN_CLASS not in word.sub_categories:
                     self.dp_manager.set_category_annotation(
-                        TokenClasses.
+                        TokenClasses.OTHER,
                         self.other_name_as_key[self.default_key],
-                        WordType.
+                        WordType.TOKEN_CLASS,
                         word.annotation_id,
                     )
-                if WordType.
-                    self.dp_manager.set_category_annotation(BioTag.
-                if WordType.
+                if WordType.TAG not in word.sub_categories:
+                    self.dp_manager.set_category_annotation(BioTag.OUTSIDE, None, WordType.TAG, word.annotation_id)
+                if WordType.TOKEN_TAG not in word.sub_categories:
                     self.dp_manager.set_category_annotation(
                         self.default_key,
                         self.other_name_as_key[self.default_key],
-                        WordType.
+                        WordType.TOKEN_TAG,
                         word.annotation_id,
                     )
 
-    def clone(self) ->
+    def clone(self) -> LMTokenClassifierService:
+        # ToDo: replace copying of tokenizer with a proper clone method. Otherwise we cannot run the evaluation with
+        # multiple threads
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
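The `serve` rewrite also documents the settings overhaul (`utils/settings.py +185 -181` in the list above): enum members are now spelled UPPER_CASE. The removed lines are truncated in this rendering right after `WordType.`, but `PageType.document_type` surviving on a removed line further down supports the lowercase-to-uppercase reading. A sketch of reading the new sub-categories back from a datapoint; that the `get_sub_category` accessor is unchanged in 0.33 is an assumption:

```python
from deepdoctection.datapoint.image import Image
from deepdoctection.utils.settings import LayoutType, WordType

def token_classes(dp: Image):
    """Collect token-class sub-categories, 0.33 spelling (0.31 used
    LayoutType.word / WordType.token_class instead)."""
    return [
        word.get_sub_category(WordType.TOKEN_CLASS)  # assumption: accessor kept in 0.33
        for word in dp.get_annotation(LayoutType.WORD)
        if WordType.TOKEN_CLASS in word.sub_categories
    ]
```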
@@ -230,36 +197,38 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             self.sliding_window_stride,
         )
 
-    def get_meta_annotation(self) ->
-        return
-
-
-
-
-            ("summaries", []),
-        ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(
+            image_annotations=(),
+            sub_categories={LayoutType.WORD: {WordType.TOKEN_CLASS, WordType.TAG, WordType.TOKEN_TAG}},
+            relationships={},
+            summaries=(),
         )
 
     def _get_name(self) -> str:
         return f"lm_token_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-
-
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
             raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
 
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
+    def clear_predictor(self) -> None:
+        self.language_model.clear_model()
 
 @pipeline_component_registry.register("LMSequenceClassifierService")
-class LMSequenceClassifierService(LanguageModelPipelineComponent):
+class LMSequenceClassifierService(PipelineComponent):
     """
     Pipeline component for sequence classification
 
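`get_meta_annotation` now returns a typed `MetaAnnotation` (imported from `.base` at the top of the diff) instead of the old list of key/value pairs. For a custom component the contract would look like the following, modeled directly on the hunk above; field types beyond what the diff shows are assumptions:

```python
from deepdoctection.pipe.base import MetaAnnotation
from deepdoctection.utils.settings import LayoutType, WordType

# Mirrors LMTokenClassifierService.get_meta_annotation: the component announces
# which sub-categories it will attach to WORD annotations; empty containers
# elsewhere. Tuple-vs-dict defaults are taken from the diff, not the class definition.
meta = MetaAnnotation(
    image_annotations=(),
    sub_categories={LayoutType.WORD: {WordType.TOKEN_CLASS, WordType.TAG, WordType.TOKEN_TAG}},
    relationships={},
    summaries=(),
)
```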
@@ -291,7 +260,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model:
+        language_model: HfLayoutSequenceModels,
        padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -315,7 +284,9 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
-
+        self.tokenizer = tokenizer
+        self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), self.language_model.model_id)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -323,7 +294,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             "return_overflowing_tokens": self.return_overflowing_tokens,
             "return_tensors": "pt",
         }
-        self.required_kwargs.update(self.language_model.
+        self.required_kwargs.update(self.language_model.default_kwargs_for_image_to_features_mapping())
         self._init_sanity_checks()
 
     def serve(self, dp: Image) -> None:
@@ -332,10 +303,10 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             return
         lm_output = self.language_model.predict(**lm_input)
         self.dp_manager.set_summary_annotation(
-            PageType.
+            PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
         )
 
-    def clone(self) ->
+    def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
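The sequence classifier now writes its prediction as a summary annotation keyed by `PageType.DOCUMENT_TYPE`, including the score as a fifth argument. Reading it back might look like the following; that `Image.summary` exposes the stored sub-category through `get_sub_category` is an assumption based on the 0.31 datapoint API, not something this diff shows:

```python
from deepdoctection.datapoint.image import Image
from deepdoctection.utils.settings import PageType

def predicted_document_class(dp: Image):
    # assumption: the summary written by serve() is retrievable this way in 0.33
    cat = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
    return cat.category_name, cat.score
```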
@@ -344,29 +315,28 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             self.return_overflowing_tokens,
         )
 
-    def get_meta_annotation(self) ->
-        return
-            ("image_annotations", []),
-            ("sub_categories", {}),
-            ("relationships", {}),
-            ("summaries", [PageType.document_type]),
-        ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(
+            image_annotations=(), sub_categories={}, relationships={}, summaries=(PageType.DOCUMENT_TYPE,)
         )
 
     def _get_name(self) -> str:
         return f"lm_sequence_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-
-
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
             raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
+    def clear_predictor(self) -> None:
+        self.language_model.clear_model()
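Both services gain the same pair of methods: `clear_predictor`, which releases model state via `clear_model()`, and `image_to_features_func`, whose docstring ("Replacing eval functions") names the intent: the model reports its feature mapper as a string, and the service resolves it through an explicit whitelist rather than `eval`. The pattern in isolation, as a generic framework-free sketch:

```python
from typing import Callable, Dict

# Only names present in the registry resolve, so an arbitrary string can never
# execute code the way eval(mapping_str) could.
def make_resolver(registry: Dict[str, Callable]) -> Callable[[str], Callable]:
    def resolve(name: str) -> Callable:
        try:
            return registry[name]
        except KeyError:
            raise KeyError(f"unknown mapping {name!r}; allowed: {sorted(registry)}") from None
    return resolve

# Usage, mirroring lm.py (the two mapper names are real, taken from the diff):
# resolve = make_resolver({"image_to_layoutlm_features": image_to_layoutlm_features,
#                          "image_to_lm_features": image_to_lm_features})
```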
|