deepdoctection 0.31-py3-none-any.whl → 0.33-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/lm.py CHANGED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# File: tokenclass.py
+# File: lm.py
 
 # Copyright 2021 Dr. Janis Meyer. All rights reserved.
 #
@@ -18,60 +18,23 @@
 """
 Module for token classification pipeline
 """
+from __future__ import annotations
 
 from copy import copy
-from typing import Any, List, Literal, Optional, Sequence, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
 
 from ..datapoint.image import Image
-from ..extern.hflayoutlm import HFLayoutLmSequenceClassifierBase, HFLayoutLmTokenClassifierBase
-from ..mapper.laylmstruct import image_to_layoutlm_features
-from ..utils.detection_types import JsonDict
-from ..utils.file_utils import transformers_available
+from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
-from .base import LanguageModelPipelineComponent
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
-if transformers_available():
-    from transformers import LayoutLMTokenizerFast, RobertaTokenizerFast, XLMRobertaTokenizerFast
-
-    _ARCHITECTURES_TO_TOKENIZER = {
-        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-    }
-
-
-def get_tokenizer_from_architecture(architecture_name: str, use_xlm_tokenizer: bool) -> Any:
-    """
-    We do not use the tokenizer for a particular model that the transformer library provides. Thie mapping therefore
-    returns the tokenizer that should be used for a particular model.
-
-    :param architecture_name: The model as stated in the transformer library.
-    :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-    :return: Tokenizer instance to use.
-    """
-    return _ARCHITECTURES_TO_TOKENIZER[(architecture_name, use_xlm_tokenizer)]
+if TYPE_CHECKING:
+    from ..extern.hflayoutlm import HfLayoutSequenceModels, HfLayoutTokenModels
 
 
 @pipeline_component_registry.register("LMTokenClassifierService")
-class LMTokenClassifierService(LanguageModelPipelineComponent):
+class LMTokenClassifierService(PipelineComponent):
     """
     Pipeline component for token classification
 
@@ -103,7 +66,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
     def __init__(
        self,
        tokenizer: Any,
-        language_model: HFLayoutLmTokenClassifierBase,
+        language_model: HfLayoutTokenModels,
        padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
        truncation: bool = True,
        return_overflowing_tokens: bool = False,
@@ -147,14 +110,16 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         self.segment_positions = segment_positions
         self.sliding_window_stride = sliding_window_stride
         if self.use_other_as_default_category:
-            categories_name_as_key = {val: key for key, val in self.language_model.categories.items()}
+            categories_name_as_key = {val: key for key, val in self.language_model.categories.categories.items()}
             self.default_key: ObjectTypes
-            if BioTag.outside in categories_name_as_key:
-                self.default_key = BioTag.outside
+            if BioTag.OUTSIDE in categories_name_as_key:
+                self.default_key = BioTag.OUTSIDE
             else:
-                self.default_key = TokenClasses.other
+                self.default_key = TokenClasses.OTHER
             self.other_name_as_key = {self.default_key: categories_name_as_key[self.default_key]}
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        self.tokenizer = tokenizer
+        self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), self.language_model.model_id)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -164,7 +129,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             "segment_positions": self.segment_positions,
             "sliding_window_stride": self.sliding_window_stride,
         }
-        self.required_kwargs.update(self.language_model.default_kwargs_for_input_mapping())
+        self.required_kwargs.update(self.language_model.default_kwargs_for_image_to_features_mapping())
         self._init_sanity_checks()
 
     def serve(self, dp: Image) -> None:
@@ -182,7 +147,7 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             and not token.token.startswith("##")
         ]
 
-        words_populated: List[str] = []
+        words_populated: list[str] = []
         for token in lm_output:
             if token.uuid not in words_populated:
                 if token.class_name == token.semantic_name:
@@ -190,35 +155,37 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
                 else:
                     token_class_name_id = None
                 self.dp_manager.set_category_annotation(
-                    token.semantic_name, token_class_name_id, WordType.token_class, token.uuid
+                    token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid
                 )
-                self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.tag, token.uuid)
+                self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.TAG, token.uuid)
                 self.dp_manager.set_category_annotation(
-                    token.class_name, token.class_id, WordType.token_tag, token.uuid
+                    token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid
                 )
                 words_populated.append(token.uuid)
 
         if self.use_other_as_default_category:
-            word_anns = dp.get_annotation(LayoutType.word)
+            word_anns = dp.get_annotation(LayoutType.WORD)
            for word in word_anns:
-                if WordType.token_class not in word.sub_categories:
+                if WordType.TOKEN_CLASS not in word.sub_categories:
                    self.dp_manager.set_category_annotation(
-                        TokenClasses.other,
+                        TokenClasses.OTHER,
                        self.other_name_as_key[self.default_key],
-                        WordType.token_class,
+                        WordType.TOKEN_CLASS,
                        word.annotation_id,
                    )
-                if WordType.tag not in word.sub_categories:
-                    self.dp_manager.set_category_annotation(BioTag.outside, None, WordType.tag, word.annotation_id)
-                if WordType.token_tag not in word.sub_categories:
+                if WordType.TAG not in word.sub_categories:
+                    self.dp_manager.set_category_annotation(BioTag.OUTSIDE, None, WordType.TAG, word.annotation_id)
+                if WordType.TOKEN_TAG not in word.sub_categories:
                    self.dp_manager.set_category_annotation(
                        self.default_key,
                        self.other_name_as_key[self.default_key],
-                        WordType.token_tag,
+                        WordType.TOKEN_TAG,
                        word.annotation_id,
                    )
 
-    def clone(self) -> "LMTokenClassifierService":
+    def clone(self) -> LMTokenClassifierService:
+        # ToDo: replace copying of tokenizer with a proper clone method. Otherwise we cannot run the evaluation with
+        # multiple threads
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -230,36 +197,38 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
             self.sliding_window_stride,
         )
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict(
-            [
-                ("image_annotations", []),
-                ("sub_categories", {LayoutType.word: {WordType.token_class, WordType.tag, WordType.token_tag}}),
-                ("relationships", {}),
-                ("summaries", []),
-            ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(
+            image_annotations=(),
+            sub_categories={LayoutType.WORD: {WordType.TOKEN_CLASS, WordType.TAG, WordType.TOKEN_TAG}},
+            relationships={},
+            summaries=(),
         )
 
     def _get_name(self) -> str:
         return f"lm_token_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        use_xlm_tokenizer = False
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
            raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                f"in this framework"
            )
 
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
+    def clear_predictor(self) -> None:
+        self.language_model.clear_model()
+
 
 @pipeline_component_registry.register("LMSequenceClassifierService")
-class LMSequenceClassifierService(LanguageModelPipelineComponent):
+class LMSequenceClassifierService(PipelineComponent):
     """
     Pipeline component for sequence classification
 
@@ -291,7 +260,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
     def __init__(
        self,
        tokenizer: Any,
-        language_model: HFLayoutLmSequenceClassifierBase,
+        language_model: HfLayoutSequenceModels,
        padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
        truncation: bool = True,
        return_overflowing_tokens: bool = False,
@@ -315,7 +284,9 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        self.tokenizer = tokenizer
+        self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), self.language_model.model_id)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -323,7 +294,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             "return_overflowing_tokens": self.return_overflowing_tokens,
             "return_tensors": "pt",
         }
-        self.required_kwargs.update(self.language_model.default_kwargs_for_input_mapping())
+        self.required_kwargs.update(self.language_model.default_kwargs_for_image_to_features_mapping())
         self._init_sanity_checks()
 
     def serve(self, dp: Image) -> None:
@@ -332,10 +303,10 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             return
         lm_output = self.language_model.predict(**lm_input)
         self.dp_manager.set_summary_annotation(
-            PageType.document_type, lm_output.class_name, lm_output.class_id, None, lm_output.score
+            PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
         )
 
-    def clone(self) -> "LMSequenceClassifierService":
+    def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -344,29 +315,28 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             self.return_overflowing_tokens,
         )
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict(
-            [
-                ("image_annotations", []),
-                ("sub_categories", {}),
-                ("relationships", {}),
-                ("summaries", [PageType.document_type]),
-            ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(
+            image_annotations=(), sub_categories={}, relationships={}, summaries=(PageType.DOCUMENT_TYPE,)
         )
 
     def _get_name(self) -> str:
         return f"lm_sequence_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        use_xlm_tokenizer = False
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
            raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                f"in this framework"
            )
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
+    def clear_predictor(self) -> None:
+        self.language_model.clear_model()
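
Taken together, the lm.py changes shift tokenizer handling from the library to the model: the import-time _ARCHITECTURES_TO_TOKENIZER table (which instantiated tokenizers as a side effect of importing the module) and its get_tokenizer_from_architecture helper are removed. In 0.33 each service instead validates the supplied tokenizer by comparing its class name with model.config.tokenizer_class, and resolves its feature-mapping function from the model via image_to_features_mapping(), which must name either "image_to_layoutlm_features" or "image_to_lm_features". A minimal sketch of constructing the service against the 0.33 signatures shown above; the checkpoint paths and category arguments are placeholders, not taken from this diff, and the exact HFLayoutLmTokenClassifier constructor should be checked against deepdoctection.extern.hflayoutlm:

    # Sketch only: wiring LMTokenClassifierService against the 0.33 API.
    from transformers import LayoutLMTokenizerFast

    from deepdoctection.extern.hflayoutlm import HFLayoutLmTokenClassifier
    from deepdoctection.pipe.lm import LMTokenClassifierService

    model = HFLayoutLmTokenClassifier(
        path_config_json="/models/layoutlm/config.json",    # placeholder path
        path_weights="/models/layoutlm/pytorch_model.bin",  # placeholder path
        categories_semantics=["question", "answer"],        # placeholder token classes
        categories_bio=["B", "I", "O"],                     # placeholder BIO tags
    )

    # The tokenizer's class name must now equal model.config.tokenizer_class;
    # 0.33 raises a TypeError otherwise instead of consulting the removed
    # hardcoded architecture-to-tokenizer mapping.
    tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")

    service = LMTokenClassifierService(tokenizer, model)

    # The feature mapper is chosen by the model, not by the caller:
    # model.image_to_features_mapping() returns the mapping function's name
    # ("image_to_layoutlm_features" or "image_to_lm_features"), which
    # image_to_features_func() turns back into a callable.

The string-keyed lookup in image_to_features_func (docstring: "Replacing eval functions") keeps the set of admissible mappers explicit rather than resolving names with eval. Two related changes follow the same direction: get_meta_annotation now returns a typed MetaAnnotation instead of a raw dict, and the new clear_predictor gives pipelines a uniform hook to release model weights.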