deepdoctection 0.31__py3-none-any.whl → 0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (91)
  1. deepdoctection/__init__.py +35 -28
  2. deepdoctection/analyzer/dd.py +30 -24
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/datapoint/annotation.py +2 -1
  5. deepdoctection/datapoint/box.py +2 -1
  6. deepdoctection/datapoint/image.py +13 -7
  7. deepdoctection/datapoint/view.py +95 -24
  8. deepdoctection/datasets/__init__.py +1 -4
  9. deepdoctection/datasets/adapter.py +5 -2
  10. deepdoctection/datasets/base.py +5 -3
  11. deepdoctection/datasets/info.py +2 -2
  12. deepdoctection/datasets/instances/doclaynet.py +3 -2
  13. deepdoctection/datasets/instances/fintabnet.py +2 -1
  14. deepdoctection/datasets/instances/funsd.py +2 -1
  15. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  16. deepdoctection/datasets/instances/layouttest.py +2 -1
  17. deepdoctection/datasets/instances/publaynet.py +2 -2
  18. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  19. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  20. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  21. deepdoctection/datasets/instances/xfund.py +2 -1
  22. deepdoctection/eval/__init__.py +1 -4
  23. deepdoctection/eval/cocometric.py +2 -1
  24. deepdoctection/eval/eval.py +17 -13
  25. deepdoctection/eval/tedsmetric.py +14 -11
  26. deepdoctection/eval/tp_eval_callback.py +9 -3
  27. deepdoctection/extern/__init__.py +2 -7
  28. deepdoctection/extern/d2detect.py +24 -32
  29. deepdoctection/extern/deskew.py +4 -2
  30. deepdoctection/extern/doctrocr.py +75 -81
  31. deepdoctection/extern/fastlang.py +4 -2
  32. deepdoctection/extern/hfdetr.py +22 -28
  33. deepdoctection/extern/hflayoutlm.py +335 -103
  34. deepdoctection/extern/hflm.py +225 -0
  35. deepdoctection/extern/model.py +56 -47
  36. deepdoctection/extern/pdftext.py +8 -4
  37. deepdoctection/extern/pt/__init__.py +1 -3
  38. deepdoctection/extern/pt/nms.py +6 -2
  39. deepdoctection/extern/pt/ptutils.py +27 -19
  40. deepdoctection/extern/texocr.py +4 -2
  41. deepdoctection/extern/tp/tfutils.py +43 -9
  42. deepdoctection/extern/tp/tpcompat.py +10 -7
  43. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  44. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  45. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  46. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  47. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  48. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  49. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  50. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  56. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  57. deepdoctection/extern/tp/tpfrcnn/preproc.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  60. deepdoctection/extern/tpdetect.py +5 -8
  61. deepdoctection/mapper/__init__.py +3 -8
  62. deepdoctection/mapper/d2struct.py +8 -6
  63. deepdoctection/mapper/hfstruct.py +6 -1
  64. deepdoctection/mapper/laylmstruct.py +163 -20
  65. deepdoctection/mapper/maputils.py +3 -1
  66. deepdoctection/mapper/misc.py +6 -3
  67. deepdoctection/mapper/tpstruct.py +2 -2
  68. deepdoctection/pipe/__init__.py +1 -1
  69. deepdoctection/pipe/common.py +11 -9
  70. deepdoctection/pipe/concurrency.py +2 -1
  71. deepdoctection/pipe/layout.py +3 -1
  72. deepdoctection/pipe/lm.py +32 -64
  73. deepdoctection/pipe/order.py +142 -35
  74. deepdoctection/pipe/refine.py +8 -14
  75. deepdoctection/pipe/{cell.py → sub_layout.py} +1 -1
  76. deepdoctection/train/__init__.py +6 -12
  77. deepdoctection/train/d2_frcnn_train.py +21 -16
  78. deepdoctection/train/hf_detr_train.py +18 -11
  79. deepdoctection/train/hf_layoutlm_train.py +118 -101
  80. deepdoctection/train/tp_frcnn_train.py +21 -19
  81. deepdoctection/utils/env_info.py +41 -117
  82. deepdoctection/utils/logger.py +1 -0
  83. deepdoctection/utils/mocks.py +93 -0
  84. deepdoctection/utils/settings.py +1 -0
  85. deepdoctection/utils/viz.py +4 -3
  86. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/METADATA +27 -18
  87. deepdoctection-0.32.dist-info/RECORD +146 -0
  88. deepdoctection-0.31.dist-info/RECORD +0 -144
  89. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  90. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/WHEEL +0 -0
  91. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,7 @@
 """
 HF Layoutlm model for diverse downstream tasks.
 """
+from __future__ import annotations
 
 from abc import ABC
 from collections import defaultdict
@@ -26,14 +27,10 @@ from pathlib import Path
 from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Tuple, Union
 
 import numpy as np
+from lazy_imports import try_import
 
 from ..utils.detection_types import JsonDict, Requirement
-from ..utils.file_utils import (
-    get_pytorch_requirement,
-    get_transformers_requirement,
-    pytorch_available,
-    transformers_available,
-)
+from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
 from ..utils.settings import (
     BioTag,
     ObjectTypes,
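
The two guards introduced in the next hunk (`pt_import_guard`, `tr_import_guard`) rely on the `lazy_imports` package imported above: `try_import` catches a failing import and records it on the guard object, so the module stays importable without torch or transformers. This replaces the removed `pytorch_available()`/`transformers_available()` feature checks. A minimal sketch of the pattern, assuming the guard's `check()` re-raises the deferred `ImportError` as documented for the lazy-imports package:

    from lazy_imports import try_import

    with try_import() as torch_import_guard:
        import torch  # a failed import is recorded on the guard instead of raising here

    def tensor_device() -> str:
        torch_import_guard.check()  # surfaces the deferred ImportError only now
        return str(torch.tensor([1.0]).device)
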
@@ -44,39 +41,85 @@ from ..utils.settings import (
     token_class_with_tag_to_token_class_and_tag,
 )
 from .base import LMSequenceClassifier, LMTokenClassifier, SequenceClassResult, TokenClassResult
-from .pt.ptutils import set_torch_auto_device
+from .pt.ptutils import get_torch_device
 
-if pytorch_available():
+with try_import() as pt_import_guard:
     import torch
     import torch.nn.functional as F
-    from torch import Tensor  # pylint: disable=W0611
 
-if transformers_available():
+with try_import() as tr_import_guard:
     from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # type: ignore
     from transformers import (
         LayoutLMForSequenceClassification,
         LayoutLMForTokenClassification,
+        LayoutLMTokenizerFast,
         LayoutLMv2Config,
         LayoutLMv2ForSequenceClassification,
         LayoutLMv2ForTokenClassification,
         LayoutLMv3Config,
         LayoutLMv3ForSequenceClassification,
         LayoutLMv3ForTokenClassification,
+        LiltForSequenceClassification,
+        LiltForTokenClassification,
         PretrainedConfig,
+        RobertaTokenizerFast,
+        XLMRobertaTokenizerFast,
     )
 
 
+def get_tokenizer_from_model_class(model_class: str, use_xlm_tokenizer: bool) -> Any:
+    """
+    We do not use the tokenizer for a particular model that the transformers library provides. This mapping therefore
+    returns the tokenizer that should be used for a particular model.
+
+    :param model_class: The model as stated in the transformers library.
+    :param use_xlm_tokenizer: True if one uses LayoutXLM. (The model cannot be distinguished from LayoutLMv2.)
+    :return: Tokenizer instance to use.
+    """
+    return {
+        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
+            "microsoft/layoutlm-base-uncased"
+        ),
+        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("LiltForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LiltForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("LiltForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
+        ("LiltForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
+            "roberta-base", add_prefix_space=True
+        ),
+        ("XLMRobertaForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained(
+            "FacebookAI/xlm-roberta-base"
+        ),
+    }[(model_class, use_xlm_tokenizer)]
+
+
 def predict_token_classes(
     uuids: List[List[str]],
-    input_ids: "Tensor",
-    attention_mask: "Tensor",
-    token_type_ids: "Tensor",
-    boxes: "Tensor",
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    boxes: torch.Tensor,
     tokens: List[List[str]],
-    model: Union[
-        "LayoutLMForTokenClassification", "LayoutLMv2ForTokenClassification", "LayoutLMv3ForTokenClassification"
-    ],
-    images: Optional["Tensor"] = None,
+    model: Union[LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification, LayoutLMv3ForTokenClassification],
+    images: Optional[torch.Tensor] = None,
 ) -> List[TokenClassResult]:
     """
     :param uuids: A list of uuids that correspond to a word that induces the resulting token
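
The new `get_tokenizer_from_model_class` pins a fixed fast tokenizer to each model class instead of trusting the checkpoint's own tokenizer files. Note that the dict literal above instantiates every entry (hub downloads included, cached after the first run) before the final lookup. Two lookups, read straight off the mapping:

    from deepdoctection.extern.hflayoutlm import get_tokenizer_from_model_class

    # LiLT on an English backbone resolves to RobertaTokenizerFast with add_prefix_space=True
    tokenizer = get_tokenizer_from_model_class("LiltForTokenClassification", use_xlm_tokenizer=False)

    # LayoutXLM is indistinguishable from LayoutLMv2, so the flag selects XLMRobertaTokenizerFast
    xlm_tokenizer = get_tokenizer_from_model_class("LayoutLMv2ForTokenClassification", use_xlm_tokenizer=True)
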
@@ -129,26 +172,28 @@ def predict_token_classes(
 
 
 def predict_sequence_classes(
-    input_ids: "Tensor",
-    attention_mask: "Tensor",
-    token_type_ids: "Tensor",
-    boxes: "Tensor",
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    boxes: torch.Tensor,
     model: Union[
-        "LayoutLMForSequenceClassification",
-        "LayoutLMv2ForSequenceClassification",
-        "LayoutLMv3ForSequenceClassification",
+        LayoutLMForSequenceClassification,
+        LayoutLMv2ForSequenceClassification,
+        LayoutLMv3ForSequenceClassification,
+        LiltForSequenceClassification,
     ],
-    images: Optional["Tensor"] = None,
+    images: Optional[torch.Tensor] = None,
 ) -> SequenceClassResult:
     """
     :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
     :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
     :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
     :param boxes: Torch tensor of bounding boxes of type 'xyxy'
-    :param model: layoutlm model for token classification
+    :param model: layoutlm model for sequence classification
     :param images: A list of torch image tensors or None
     :return: SequenceClassResult
     """
+
     if images is None:
         outputs = model(input_ids=input_ids, bbox=boxes, attention_mask=attention_mask, token_type_ids=token_type_ids)
     elif isinstance(model, LayoutLMv2ForSequenceClassification):
@@ -177,7 +222,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
     Abstract base class for wrapping LayoutLM models for token classification into the deepdoctection framework.
     """
 
-    model: Union["LayoutLMForTokenClassification", "LayoutLMv2ForTokenClassification"]
+    model: Union[LayoutLMForTokenClassification, LayoutLMv2ForTokenClassification]
 
     def __init__(
         self,
@@ -186,7 +231,8 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -198,6 +244,8 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
                                  consistent with detectors use only values>0. Conversion will be done internally.
         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
         :param device: The device (cpu,"cuda"), where to place the model.
+        :param use_xlm_tokenizer: True if one uses LayoutXLM or a LiLT model built with an XLM language model, e.g.
+                                  info-xlm or roberta-xlm. (LayoutXLM cannot be distinguished from LayoutLMv2.)
         """
 
         if categories is None:
@@ -218,11 +266,9 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             self.categories = self._categories_orig_to_categories(
                 self.categories_semantics, self.categories_bio  # type: ignore
             )
-        if device is not None:
-            self.device = device
-        else:
-            self.device = set_torch_auto_device()
+        self.device = get_torch_device(device)
         self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)
 
     @classmethod
     def get_requirements(cls) -> List[Requirement]:
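
`get_torch_device` replaces the old `set_torch_auto_device` and accepts an explicit `torch.device` as well as the `"cpu"`/`"cuda"` literals. Its body lives in `extern/pt/ptutils.py` (also changed in this release but not shown here), so the following is only a plausible sketch of the resolution logic, not the shipped implementation:

    import torch
    from typing import Literal, Optional, Union

    def get_torch_device(device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None) -> torch.device:
        """Hypothetical resolver: honor an explicit choice, otherwise prefer CUDA when available."""
        if device is not None:
            return torch.device(device)  # normalizes "cpu"/"cuda" strings to a torch.device
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
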
@@ -256,9 +302,7 @@
 
     def _validate_encodings(
         self, **encodings: Any
-    ) -> Tuple[
-        List[List[str]], List[str], "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", List[List[str]]
-    ]:
+    ) -> Tuple[List[List[str]], List[str], torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, List[List[str]]]:
         image_ids = encodings.get("image_ids", [])
         ann_ids = encodings.get("ann_ids")
         input_ids = encodings.get("input_ids")
@@ -291,7 +335,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
 
         return ann_ids, image_ids, input_ids, attention_mask, token_type_ids, boxes, tokens
 
-    def clone(self) -> "HFLayoutLmTokenClassifierBase":
+    def clone(self) -> HFLayoutLmTokenClassifierBase:
         return self.__class__(
             self.path_config,
             self.path_weights,
@@ -306,6 +350,24 @@
         """Returns the name of the model"""
         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
 
+    def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+        """A refinement for adding the tokenizer class name to the model configs.
+
+        :param use_xlm_tokenizer: Whether to use an XLM tokenizer.
+        """
+        tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_layoutlm_features"
+
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_layoutlm_features"
+
 
 class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
     """
@@ -348,7 +410,8 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -360,13 +423,17 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
                                  consistent with detectors use only values>0. Conversion will be done internally.
         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
         :param device: The device (cpu,"cuda"), where to place the model.
+        :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
+                                  tokenizer.
         """
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments
 
@@ -447,7 +514,8 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -459,13 +527,17 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
                                  consistent with detectors use only values>0. Conversion will be done internally.
         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
         :param device: The device (cpu,"cuda"), where to place the model.
+        :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
+                                  default value.
         """
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments
 
@@ -561,7 +633,8 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
         categories_semantics: Optional[Sequence[TypeOrStr]] = None,
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[str, TypeOrStr]] = None,
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -573,13 +646,17 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
                                  consistent with detectors use only values>0. Conversion will be done internally.
         :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
         :param device: The device (cpu,"cuda"), where to place the model.
+        :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLMv3 model with a different
+                                  tokenizer.
         """
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> List[TokenClassResult]:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments
 
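With the same signature change repeated across all wrappers, `device` now also accepts a `torch.device` instance. A short construction sketch (paths and category dict are placeholders):

    import torch

    from deepdoctection.extern.hflayoutlm import HFLayoutLmTokenClassifier

    layoutlm = HFLayoutLmTokenClassifier(
        "path/to/config.json",  # placeholder paths
        "path/to/model.bin",
        categories={"1": "B-answer", "2": "O"},  # illustrative category dict
        device=torch.device("cuda:0"),  # previously only the literals "cpu"/"cuda" were accepted
    )
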
@@ -639,70 +716,34 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     Abstract base class for wrapping LayoutLM models for sequence classification into the deepdoctection framework.
     """
 
-    model: Union["LayoutLMForSequenceClassification", "LayoutLMv2ForSequenceClassification"]
+    model: Union[LayoutLMForSequenceClassification, LayoutLMv2ForSequenceClassification]
 
     def __init__(
         self,
         path_config_json: str,
         path_weights: str,
         categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         self.path_config = path_config_json
         self.path_weights = path_weights
         self.categories = copy(categories)  # type: ignore
 
-        if device is not None:
-            self.device = device
-        else:
-            self.device = set_torch_auto_device()
+        self.device = get_torch_device(device)
         self.model.to(self.device)
-
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
-        input_ids = encodings.get("input_ids")
-        attention_mask = encodings.get("attention_mask")
-        token_type_ids = encodings.get("token_type_ids")
-        boxes = encodings.get("bbox")
-
-        if isinstance(input_ids, torch.Tensor):
-            input_ids = input_ids.to(self.device)
-        else:
-            raise ValueError(f"input_ids must be list but is {type(input_ids)}")
-        if isinstance(attention_mask, torch.Tensor):
-            attention_mask = attention_mask.to(self.device)
-        else:
-            raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
-        if isinstance(token_type_ids, torch.Tensor):
-            token_type_ids = token_type_ids.to(self.device)
-        else:
-            raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
-        if isinstance(boxes, torch.Tensor):
-            boxes = boxes.to(self.device)
-        else:
-            raise ValueError(f"boxes must be list but is {type(boxes)}")
-
-        result = predict_sequence_classes(
-            input_ids,
-            attention_mask,
-            token_type_ids,
-            boxes,
-            self.model,
-        )
-
-        result.class_id += 1
-        result.class_name = self.categories[str(result.class_id)]
-        return result
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)
 
     @classmethod
     def get_requirements(cls) -> List[Requirement]:
         return [get_pytorch_requirement(), get_transformers_requirement()]
 
-    def clone(self) -> "HFLayoutLmSequenceClassifierBase":
+    def clone(self) -> HFLayoutLmSequenceClassifierBase:
         return self.__class__(self.path_config, self.path_weights, self.categories, self.device)
 
     def _validate_encodings(
-        self, **encodings: Union[List[List[str]], "torch.Tensor"]
-    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        self, **encodings: Union[List[List[str]], torch.Tensor]
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         input_ids = encodings.get("input_ids")
         attention_mask = encodings.get("attention_mask")
         token_type_ids = encodings.get("token_type_ids")
@@ -736,6 +777,24 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
         """Returns the name of the model"""
         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
 
+    def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+        """A refinement for adding the tokenizer class name to the model configs.
+
+        :param use_xlm_tokenizer: Whether to use an XLM tokenizer.
+        """
+        tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_layoutlm_features"
+
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_layoutlm_features"
+
 
 class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
     """
@@ -773,17 +832,15 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         path_config_json: str,
         path_weights: str,
         categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
-        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
-        self.model = LayoutLMForSequenceClassification.from_pretrained(
-            pretrained_model_name_or_path=path_weights, config=config
-        )
-        super().__init__(path_config_json, path_weights, categories, device)
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
 
         result = predict_sequence_classes(
@@ -849,14 +906,15 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         path_config_json: str,
         path_weights: str,
         categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
         images = encodings.get("image")
         if isinstance(images, torch.Tensor):
@@ -929,14 +987,15 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         path_config_json: str,
         path_weights: str,
         categories: Mapping[str, TypeOrStr],
-        device: Optional[Literal["cpu", "cuda"]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
 
-    def predict(self, **encodings: Union[List[List[str]], "torch.Tensor"]) -> SequenceClassResult:
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
         images = encodings.get("pixel_values")
         if isinstance(images, torch.Tensor):
@@ -977,3 +1036,176 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         return LayoutLMv3ForSequenceClassification.from_pretrained(
             pretrained_model_name_or_path=path_weights, config=config
         )
+
+
+class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
+    """
+    A wrapper class for `transformers.LiltForTokenClassification` to use within a pipeline component.
+    Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+    Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
+    classification and other things please use another model of the family.
+
+    **Example**
+
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+
+        # hf tokenizer and token classifier
+        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+        lilt = HFLiltTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                     categories=['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                 'E-header', 'E-question', 'I-answer', 'I-header',
+                                                 'I-question', 'O', 'S-answer', 'S-header',
+                                                 'S-question'])
+
+        # token classification service
+        lilt_service = LMTokenClassifierService(tokenizer, lilt)
+
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, lilt_service])
+
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+
+        for dp in df:
+            ...
+    """
+
+    def __init__(
+        self,
+        path_config_json: str,
+        path_weights: str,
+        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+        categories_bio: Optional[Sequence[TypeOrStr]] = None,
+        categories: Optional[Mapping[str, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        """
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
+                                     entities themselves. To be consistent with detectors use only values >0.
+                                     Conversion will be done internally.
+        :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
+                               consistent with detectors use only values>0. Conversion will be done internally.
+        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
+        :param device: The device (cpu,"cuda"), where to place the model.
+        """
+        self.name = self.get_name(path_weights, "LiLT")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
+
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> List[TokenClassResult]:
+        """
+        Launch inference on LiLT for token classification. Pass the following arguments
+
+        `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
+
+        `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+
+        `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
+
+        `boxes:` Torch tensor of bounding boxes of type 'xyxy'
+
+        `tokens:` List of original tokens taken from `LayoutLMTokenizer`
+
+        :return: A list of TokenClassResults
+        """
+
+        ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
+
+        results = predict_token_classes(
+            ann_ids, input_ids, attention_mask, token_type_ids, boxes, tokens, self.model, None
+        )
+
+        return self._map_category_names(results)
+
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LiltForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
+
+
+class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
+    """
+    A wrapper class for `transformers.LiltForSequenceClassification` to use within a pipeline component.
+    Check <https://huggingface.co/docs/transformers/model_doc/lilt> for documentation of the model itself.
+    Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
+    classification and other things please use another model of the family.
+
+    **Example**
+
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+
+        # hf tokenizer and sequence classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        lilt = HFLiltSequenceClassifier("path/to/config.json",
+                                        "path/to/model.bin",
+                                        categories=["handwritten", "presentation", "resume"])
+
+        # sequence classification service
+        lilt_service = LMSequenceClassifierService(tokenizer, lilt)
+
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, lilt_service])
+
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+
+        for dp in df:
+            ...
+    """
+
+    def __init__(
+        self,
+        path_config_json: str,
+        path_weights: str,
+        categories: Mapping[str, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        self.name = self.get_name(path_weights, "LiLT")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
+
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
+        input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
+
+        result = predict_sequence_classes(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            boxes,
+            self.model,
+        )
+
+        result.class_id += 1
+        result.class_name = self.categories[str(result.class_id)]
+        return result
+
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return LiltForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
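
The docstring examples above cover LiLT checkpoints on an English `roberta-base` backbone. For a LiLT model built on an XLM backbone, passing `use_xlm_tokenizer=True` selects the `XLMRobertaTokenizerFast` branch of the mapping and records it in `model.config.tokenizer_class`. A sketch with the same placeholder paths as the docstring examples:

    from transformers import XLMRobertaTokenizerFast

    from deepdoctection.extern.hflayoutlm import HFLiltTokenClassifier

    tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
    lilt = HFLiltTokenClassifier(
        "path/to/config.json",  # placeholder paths
        "path/to/model.bin",
        categories={"1": "B-answer", "2": "B-question", "3": "O"},  # illustrative
        use_xlm_tokenizer=True,
    )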