deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions as released to one of the supported registries; it is provided for informational purposes only.


Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,7 @@
  """
  HF Layoutlm model for diverse downstream tasks.
  """
+ from __future__ import annotations

  from abc import ABC
  from collections import defaultdict
@@ -26,14 +27,10 @@ from pathlib import Path
  from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union

  import numpy as np
+ from lazy_imports import try_import

  from ..utils.detection_types import JsonDict, Requirement
- from ..utils.file_utils import (
-     get_pytorch_requirement,
-     get_transformers_requirement,
-     pytorch_available,
-     transformers_available,
- )
+ from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
  from ..utils.settings import (
      BioTag,
      ObjectTypes,
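
Note on the import refactor: the `try_import` guard (from the `lazy_imports` package) replaces the old `pytorch_available()` / `transformers_available()` feature flags used below. Imports are still attempted at module load, but a missing dependency no longer has to be branched around; failures surface later, e.g. through `get_requirements()`. A minimal illustrative stand-in for the guard, not the library's actual implementation:

    from contextlib import contextmanager

    @contextmanager
    def try_import_sketch():
        # swallow ImportError so that importing the module never fails;
        # the missing dependency is reported later, e.g. via get_requirements()
        try:
            yield
        except ImportError:
            pass

    with try_import_sketch():
        import torch  # only usable if PyTorch is actually installed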
@@ -44,39 +41,85 @@ from ..utils.settings import (
      token_class_with_tag_to_token_class_and_tag,
  )
  from .base import LMSequenceClassifier, LMTokenClassifier, SequenceClassResult, TokenClassResult
- from .pt.ptutils import set_torch_auto_device
+ from .pt.ptutils import get_torch_device

- if pytorch_available():
+ with try_import() as pt_import_guard:
      import torch
      import torch.nn.functional as F
-     from torch import Tensor  # pylint: disable=W0611

- if transformers_available():
+ with try_import() as tr_import_guard:
      from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # type: ignore
      from transformers import (
          LayoutLMForSequenceClassification,
          LayoutLMForTokenClassification,
+         LayoutLMTokenizerFast,
          LayoutLMv2Config,
          LayoutLMv2ForSequenceClassification,
          LayoutLMv2ForTokenClassification,
          LayoutLMv3Config,
          LayoutLMv3ForSequenceClassification,
          LayoutLMv3ForTokenClassification,
+         LiltForSequenceClassification,
+         LiltForTokenClassification,
          PretrainedConfig,
+         RobertaTokenizerFast,
+         XLMRobertaTokenizerFast,
      )


+ def get_tokenizer_from_model_class(model_class: str, use_xlm_tokenizer: bool) -> Any:
+     """
+     We do not use the tokenizer for a particular model that the transformers library provides. This mapping
+     therefore returns the tokenizer that should be used for a particular model.
+
+     :param model_class: The model class as named in the transformers library.
+     :param use_xlm_tokenizer: True if one uses LayoutXLM. (The model cannot be distinguished from LayoutLMv2.)
+     :return: Tokenizer instance to use.
+     """
+     return {
+         ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+             "microsoft/layoutlm-base-uncased"
+         ),
+         ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+             "microsoft/layoutlm-base-uncased"
+         ),
+         ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+             "microsoft/layoutlm-base-uncased"
+         ),
+         ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+             "microsoft/layoutlm-base-uncased"
+         ),
+         ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+         ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+         ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+             "roberta-base", add_prefix_space=True
+         ),
+         ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+             "roberta-base", add_prefix_space=True
+         ),
+         ("LiltForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+         ("LiltForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+             "roberta-base", add_prefix_space=True
+         ),
+         ("LiltForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+         ("LiltForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+             "roberta-base", add_prefix_space=True
+         ),
+         ("XLMRobertaForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained(
+             "FacebookAI/xlm-roberta-base"
+         ),
+     }[(model_class, use_xlm_tokenizer)]
+
+
  def predict_token_classes(
      uuids: List[List[str]],
-     input_ids: "Tensor",
-     attention_mask: "Tensor",
-     token_type_ids: "Tensor",
-     boxes: "Tensor",
+     input_ids: torch.Tensor,
+     attention_mask: torch.Tensor,
+     token_type_ids: torch.Tensor,
+     boxes: torch.Tensor,
      tokens: List[List[str]],
-     model: Union[
-         "LayoutLMForTokenClassification", "LayoutLMv2ForTokenClassification", "LayoutLMv3ForTokenClassification"
-     ],
-     images: Optional["Tensor"] = None,
+     model: Union[LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification, LayoutLMv3ForTokenClassification],
+     images: Optional[torch.Tensor] = None,
  ) -> List[TokenClassResult]:
      """
      :param uuids: A list of uuids that correspond to a word that induces the resulting token
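
The mapping above is keyed by the pair `(model class name, use_xlm_tokenizer)`, so resolving a tokenizer is a plain dictionary lookup:

    # resolves to an XLMRobertaTokenizerFast loaded from "xlm-roberta-base"
    tokenizer = get_tokenizer_from_model_class("LiltForTokenClassification", use_xlm_tokenizer=True)

    # resolves to a RobertaTokenizerFast loaded from "roberta-base" with add_prefix_space=True
    tokenizer = get_tokenizer_from_model_class("LiltForTokenClassification", use_xlm_tokenizer=False)

Note that the dictionary literal is evaluated eagerly, so every call instantiates (and, on first use, downloads) all tokenizers in the table before the lookup selects one.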
@@ -129,26 +172,28 @@ def predict_token_classes(


  def predict_sequence_classes(
-     input_ids: "Tensor",
-     attention_mask: "Tensor",
-     token_type_ids: "Tensor",
-     boxes: "Tensor",
+     input_ids: torch.Tensor,
+     attention_mask: torch.Tensor,
+     token_type_ids: torch.Tensor,
+     boxes: torch.Tensor,
      model: Union[
-         "LayoutLMForSequenceClassification",
-         "LayoutLMv2ForSequenceClassification",
-         "LayoutLMv3ForSequenceClassification",
+         LayoutLMForSequenceClassification,
+         LayoutLMv2ForSequenceClassification,
+         LayoutLMv3ForSequenceClassification,
+         LiltForSequenceClassification,
      ],
-     images: Optional["Tensor"] = None,
+     images: Optional[torch.Tensor] = None,
  ) -> SequenceClassResult:
      """
      :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
      :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
      :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
      :param boxes: Torch tensor of bounding boxes of type 'xyxy'
-     :param model: layoutlm model for token classification
+     :param model: layoutlm model for sequence classification
      :param images: A list of torch image tensors or None
      :return: SequenceClassResult
      """
+
      if images is None:
          outputs = model(input_ids=input_ids, bbox=boxes, attention_mask=attention_mask, token_type_ids=token_type_ids)
      elif isinstance(model, LayoutLMv2ForSequenceClassification):
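
`predict_sequence_classes` dispatches on the model type because the vision-enabled variants expect the image tensor under different keyword names. A condensed sketch of the branching (the first branch is taken verbatim from the diff; the v2 and v3 kwarg names, `image` and `pixel_values`, follow the transformers API):

    if images is None:
        outputs = model(input_ids=input_ids, bbox=boxes, attention_mask=attention_mask, token_type_ids=token_type_ids)
    elif isinstance(model, LayoutLMv2ForSequenceClassification):
        # LayoutLMv2/LayoutXLM take the visual input as `image`
        outputs = model(input_ids=input_ids, bbox=boxes, image=images,
                        attention_mask=attention_mask, token_type_ids=token_type_ids)
    else:
        # LayoutLMv3 takes the visual input as `pixel_values`
        outputs = model(input_ids=input_ids, bbox=boxes, pixel_values=images,
                        attention_mask=attention_mask, token_type_ids=token_type_ids)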
@@ -177,7 +222,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
      Abstract base class for wrapping LayoutLM models for token classification into the deepdoctection framework.
      """

-     model: Union["LayoutLMForTokenClassification", "LayoutLMv2ForTokenClassification"]
+     model: Union[LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification]

      def __init__(
          self,
@@ -186,7 +231,8 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
          categories_semantics: Optional[Sequence[TypeOrStr]] = None,
          categories_bio: Optional[Sequence[TypeOrStr]] = None,
          categories: Optional[Mapping[str, TypeOrStr]] = None,
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
          """
          :param path_config_json: path to .json config file
@@ -198,9 +244,10 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
                                 consistent with detectors use only values>0. Conversion will be done internally.
          :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
          :param device: The device (cpu,"cuda"), where to place the model.
+         :param use_xlm_tokenizer: True if one uses LayoutXLM or a LiLT model built with an XLM language model, e.g.
+                                   info-xlm or roberta-xlm. (LayoutXLM cannot be distinguished from LayoutLMv2.)
          """

-         self.name = "_".join(Path(path_weights).parts[-3:])
          if categories is None:
              if categories_semantics is None:
                  raise ValueError("If categories is None then categories_semantics cannot be None")
@@ -219,11 +266,9 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
              self.categories = self._categories_orig_to_categories(
                  self.categories_semantics, self.categories_bio  # type: ignore
              )
-         if device is not None:
-             self.device = device
-         else:
-             self.device = set_torch_auto_device()
+         self.device = get_torch_device(device)
          self.model.to(self.device)
+         self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)

      @classmethod
      def get_requirements(cls) -> List[Requirement]:
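
`get_torch_device` folds the removed four-line branch into one helper and, per the new type hint, also accepts an already-constructed `torch.device`. An illustrative sketch of the assumed behavior (the actual helper lives in deepdoctection/extern/pt/ptutils.py):

    import torch

    def get_torch_device_sketch(device=None):
        # normalize None / "cpu" / "cuda" / torch.device into a torch.device,
        # preferring CUDA when nothing is specified
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        return torch.device(device)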
@@ -257,9 +302,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):

      def _validate_encodings(
          self, **encodings: Any
-     ) -> Tuple[
-         List[List[str]], List[str], "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", List[List[str]]
-     ]:
+     ) -> Tuple[List[List[str]], List[str], torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, List[List[str]]]:
          image_ids = encodings.get("image_ids", [])
          ann_ids = encodings.get("ann_ids")
          input_ids = encodings.get("input_ids")
@@ -292,7 +335,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):

          return ann_ids, image_ids, input_ids, attention_mask, token_type_ids, boxes, tokens

-     def clone(self) -> "HFLayoutLmTokenClassifierBase":
+     def clone(self) -> HFLayoutLmTokenClassifierBase:
          return self.__class__(
              self.path_config,
              self.path_weights,
@@ -302,6 +345,29 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
              self.device,
          )

+     @staticmethod
+     def get_name(path_weights: str, architecture: str) -> str:
+         """Returns the name of the model"""
+         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+     def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+         """A refinement for adding the tokenizer class name to the model configs.
+
+         :param use_xlm_tokenizer: Whether to use an XLM tokenizer.
+         """
+         tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+         return tokenizer.__class__.__name__
+
+     @staticmethod
+     def image_to_raw_features_mapping() -> str:
+         """Returns the mapping function to convert images into raw features."""
+         return "image_to_raw_layoutlm_features"
+
+     @staticmethod
+     def image_to_features_mapping() -> str:
+         """Returns the mapping function to convert images into features."""
+         return "image_to_layoutlm_features"
+

  class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
      """
@@ -344,7 +410,8 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
          categories_semantics: Optional[Sequence[TypeOrStr]] = None,
          categories_bio: Optional[Sequence[TypeOrStr]] = None,
          categories: Optional[Mapping[str, TypeOrStr]] = None,
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
          """
          :param path_config_json: path to .json config file
@@ -356,14 +423,17 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
                                 consistent with detectors use only values>0. Conversion will be done internally.
          :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
          :param device: The device (cpu,"cuda"), where to place the model.
+         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
+                                   tokenizer.
          """
-         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMForTokenClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLM")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(
+             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+         )

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
          """
          Launch inference on LayoutLm for token classification. Pass the following arguments

@@ -388,6 +458,18 @@

          return self._map_category_names(results)

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+

  class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
      """
@@ -432,7 +514,8 @@
          categories_semantics: Optional[Sequence[TypeOrStr]] = None,
          categories_bio: Optional[Sequence[TypeOrStr]] = None,
          categories: Optional[Mapping[str, TypeOrStr]] = None,
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
          """
          :param path_config_json: path to .json config file
@@ -444,14 +527,17 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
                                 consistent with detectors use only values>0. Conversion will be done internally.
          :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
          :param device: The device (cpu,"cuda"), where to place the model.
+         :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
+                                   default value.
          """
-         config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMv2ForTokenClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLMv2")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(
+             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+         )

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
          """
          Launch inference on LayoutLm for token classification. Pass the following arguments

@@ -489,6 +575,20 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
          """
          return {"image_width": 224, "image_height": 224}

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMv2ForTokenClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+

  class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
      """
@@ -533,7 +633,8 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
          categories_semantics: Optional[Sequence[TypeOrStr]] = None,
          categories_bio: Optional[Sequence[TypeOrStr]] = None,
          categories: Optional[Mapping[str, TypeOrStr]] = None,
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
          """
          :param path_config_json: path to .json config file
@@ -545,14 +646,17 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
                                 consistent with detectors use only values>0. Conversion will be done internally.
          :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
          :param device: The device (cpu,"cuda"), where to place the model.
+         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLMv3 model with a different
+                                   tokenizer.
          """
-         config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMv3ForTokenClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLMv3")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(
+             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+         )

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
          """
          Launch inference on LayoutLm for token classification. Pass the following arguments

@@ -592,77 +696,54 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
              "pixel_std": np.array(IMAGENET_DEFAULT_STD, dtype=np.float32),
          }

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMv3ForTokenClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+

  class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
      """
      Abstract base class for wrapping LayoutLM models for sequence classification into the deepdoctection framework.
      """

-     model: Union["LayoutLMForSequenceClassification", "LayoutLMv2ForSequenceClassification"]
+     model: Union[LayoutLMForSequenceClassification, LayoutLMv2ForSequenceClassification]

      def __init__(
          self,
          path_config_json: str,
          path_weights: str,
          categories: Mapping[str, TypeOrStr],
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
-         self.name = "_".join(Path(path_weights).parts[-3:])
          self.path_config = path_config_json
          self.path_weights = path_weights
          self.categories = copy(categories)  # type: ignore

-         if device is not None:
-             self.device = device
-         else:
-             self.device = set_torch_auto_device()
+         self.device = get_torch_device(device)
          self.model.to(self.device)
-
-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
-         input_ids = encodings.get("input_ids")
-         attention_mask = encodings.get("attention_mask")
-         token_type_ids = encodings.get("token_type_ids")
-         boxes = encodings.get("bbox")
-
-         if isinstance(input_ids, torch.Tensor):
-             input_ids = input_ids.to(self.device)
-         else:
-             raise ValueError(f"input_ids must be list but is {type(input_ids)}")
-         if isinstance(attention_mask, torch.Tensor):
-             attention_mask = attention_mask.to(self.device)
-         else:
-             raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
-         if isinstance(token_type_ids, torch.Tensor):
-             token_type_ids = token_type_ids.to(self.device)
-         else:
-             raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
-         if isinstance(boxes, torch.Tensor):
-             boxes = boxes.to(self.device)
-         else:
-             raise ValueError(f"boxes must be list but is {type(boxes)}")
-
-         result = predict_sequence_classes(
-             input_ids,
-             attention_mask,
-             token_type_ids,
-             boxes,
-             self.model,
-         )
-
-         result.class_id += 1
-         result.class_name = self.categories[str(result.class_id)]
-         return result
+         self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)

      @classmethod
      def get_requirements(cls) -> List[Requirement]:
          return [get_pytorch_requirement(), get_transformers_requirement()]

-     def clone(self) -> "HFLayoutLmSequenceClassifierBase":
+     def clone(self) -> HFLayoutLmSequenceClassifierBase:
          return self.__class__(self.path_config, self.path_weights, self.categories, self.device)

      def _validate_encodings(
-         self, **encodings: Union[List[List[str]], "torch.Tensor"]
-     ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+         self, **encodings: Union[List[List[str]], torch.Tensor]
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
          input_ids = encodings.get("input_ids")
          attention_mask = encodings.get("attention_mask")
          token_type_ids = encodings.get("token_type_ids")
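
The tensor checks that previously lived in `predict` are now concentrated in `_validate_encodings`, so `predict` shrinks to validate, forward, post-process. The per-kwarg idiom, shown for a single tensor (the 0.30 error messages read "must be list", although the check actually requires a tensor):

    input_ids = encodings.get("input_ids")
    if isinstance(input_ids, torch.Tensor):
        input_ids = input_ids.to(self.device)  # move onto the classifier's device
    else:
        raise ValueError(f"input_ids must be a torch.Tensor but is {type(input_ids)}")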
@@ -691,6 +772,29 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
              boxes = boxes.to(self.device)
          return input_ids, attention_mask, token_type_ids, boxes

+     @staticmethod
+     def get_name(path_weights: str, architecture: str) -> str:
+         """Returns the name of the model"""
+         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+     def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+         """A refinement for adding the tokenizer class name to the model configs.
+
+         :param use_xlm_tokenizer: Whether to use an XLM tokenizer.
+         """
+         tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+         return tokenizer.__class__.__name__
+
+     @staticmethod
+     def image_to_raw_features_mapping() -> str:
+         """Returns the mapping function to convert images into raw features."""
+         return "image_to_raw_layoutlm_features"
+
+     @staticmethod
+     def image_to_features_mapping() -> str:
+         """Returns the mapping function to convert images into features."""
+         return "image_to_layoutlm_features"
+

  class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
      """
@@ -728,15 +832,15 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
          path_config_json: str,
          path_weights: str,
          categories: Mapping[str, TypeOrStr],
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
-         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMForSequenceClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLM")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
          input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)

          result = predict_sequence_classes(
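
The `class_id += 1` after prediction bridges two conventions: transformers heads emit 0-based label ids, while deepdoctection category dicts are keyed by 1-based string indices. Worked through with the document classes from the example docstrings further below:

    categories = {"1": "handwritten", "2": "presentation", "3": "resume"}
    class_id = 0                            # argmax over the model's logits
    class_id += 1                           # shift to deepdoctection's 1-based ids
    class_name = categories[str(class_id)]  # -> "handwritten"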
@@ -751,6 +855,20 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
          result.class_name = self.categories[str(result.class_id)]
          return result

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMForSequenceClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+

  class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
      """
@@ -788,15 +906,15 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
          path_config_json: str,
          path_weights: str,
          categories: Mapping[str, TypeOrStr],
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
-         config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMv2ForSequenceClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLMv2")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
          input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
          images = encodings.get("image")
          if isinstance(images, torch.Tensor):
@@ -818,6 +936,20 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
          """
          return {"image_width": 224, "image_height": 224}

+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMv2ForSequenceClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+

  class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
      """
@@ -855,15 +987,15 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
          path_config_json: str,
          path_weights: str,
          categories: Mapping[str, TypeOrStr],
-         device: Optional[Literal["cpu", "cuda"]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
      ):
-         config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
-         self.model = LayoutLMv3ForSequenceClassification.from_pretrained(
-             pretrained_model_name_or_path=path_weights, config=config
-         )
-         super().__init__(path_config_json, path_weights, categories, device)
+         self.name = self.get_name(path_weights, "LayoutLMv3")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)

-     def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
          input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
          images = encodings.get("pixel_values")
          if isinstance(images, torch.Tensor):
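
Note the asymmetry between the two `predict` bodies: v2 reads the visual input from `encodings["image"]`, v3 from `encodings["pixel_values"]`, mirroring the respective transformers signatures. The IMAGENET constants exposed through `default_kwargs` are timm's standard normalization values; a sketch of how such constants are typically applied to a resized page image (illustrative, not deepdoctection's mapper code):

    import numpy as np

    pixel_mean = np.array((0.485, 0.456, 0.406), dtype=np.float32)  # IMAGENET_DEFAULT_MEAN
    pixel_std = np.array((0.229, 0.224, 0.225), dtype=np.float32)   # IMAGENET_DEFAULT_STD

    def normalize_sketch(image: np.ndarray) -> np.ndarray:
        # image: HxWx3 RGB, uint8
        return (image.astype(np.float32) / 255.0 - pixel_mean) / pixel_std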
@@ -890,3 +1022,190 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
              "pixel_mean": np.array(IMAGENET_DEFAULT_MEAN, dtype=np.float32),
              "pixel_std": np.array(IMAGENET_DEFAULT_STD, dtype=np.float32),
          }
+
+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LayoutLMv3ForSequenceClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+
+
+ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
+     """
+     A wrapper class for `transformers.LiltForTokenClassification` to use within a pipeline component.
+     Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+     Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
+     classification and other things please use another model of the family.
+
+     **Example**
+
+         # setting up compulsory ocr service
+         tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+         tess = TesseractOcrDetector(tesseract_config_path)
+         ocr_service = TextExtractionService(tess)
+
+         # hf tokenizer and token classifier
+         tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+         lilt = HFLiltTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                      categories=['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                  'E-header', 'E-question', 'I-answer', 'I-header',
+                                                  'I-question', 'O', 'S-answer', 'S-header',
+                                                  'S-question'])
+
+         # token classification service
+         lilt_service = LMTokenClassifierService(tokenizer, lilt)
+
+         pipe = DoctectionPipe(pipeline_component_list=[ocr_service, lilt_service])
+
+         path = "path/to/some/form"
+         df = pipe.analyze(path=path)
+
+         for dp in df:
+             ...
+     """
+
+     def __init__(
+         self,
+         path_config_json: str,
+         path_weights: str,
+         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+         categories_bio: Optional[Sequence[TypeOrStr]] = None,
+         categories: Optional[Mapping[str, TypeOrStr]] = None,
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
+     ):
+         """
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e.
+                                      the entities themselves. To be consistent with detectors use only values >0.
+                                      Conversion will be done internally.
+         :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
+                                consistent with detectors use only values>0. Conversion will be done internally.
+         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
+         :param device: The device (cpu,"cuda"), where to place the model.
+         """
+         self.name = self.get_name(path_weights, "LiLT")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(
+             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+         )
+
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
+         """
+         Launch inference on LiLT for token classification. Pass the following arguments
+
+         `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
+
+         `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+
+         `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
+
+         `boxes:` Torch tensor of bounding boxes of type 'xyxy'
+
+         `tokens:` List of original tokens taken from `LayoutLMTokenizer`
+
+         :return: A list of TokenClassResults
+         """
+
+         ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
+
+         results = predict_token_classes(
+             ann_ids, input_ids, attention_mask, token_type_ids, boxes, tokens, self.model, None
+         )
+
+         return self._map_category_names(results)
+
+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LiltForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+
+
+ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
+     """
+     A wrapper class for `transformers.LiltForSequenceClassification` to use within a pipeline component.
+     Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+     Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
+     classification and other things please use another model of the family.
+
+     **Example**
+
+         # setting up compulsory ocr service
+         tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+         tess = TesseractOcrDetector(tesseract_config_path)
+         ocr_service = TextExtractionService(tess)
+
+         # hf tokenizer and sequence classifier
+         tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+         lilt = HFLiltSequenceClassifier("path/to/config.json",
+                                         "path/to/model.bin",
+                                         categories=["handwritten", "presentation", "resume"])
+
+         # sequence classification service
+         lilt_service = LMSequenceClassifierService(tokenizer, lilt)
+
+         pipe = DoctectionPipe(pipeline_component_list=[ocr_service, lilt_service])
+
+         path = "path/to/some/form"
+         df = pipe.analyze(path=path)
+
+         for dp in df:
+             ...
+     """
+
+     def __init__(
+         self,
+         path_config_json: str,
+         path_weights: str,
+         categories: Mapping[str, TypeOrStr],
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
+     ):
+         self.name = self.get_name(path_weights, "LiLT")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
+
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
+         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
+
+         result = predict_sequence_classes(
+             input_ids,
+             attention_mask,
+             token_type_ids,
+             boxes,
+             self.model,
+         )
+
+         result.class_id += 1
+         result.class_name = self.categories[str(result.class_id)]
+         return result
+
+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return LiltForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)