deepdoctection 0.31-py3-none-any.whl → 0.33-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection has been flagged as potentially problematic.

Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
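
Two changes recur through nearly every file above: the module deepdoctection/utils/detection_types.py (entry 128) is removed in favor of the new deepdoctection/utils/types.py (entry 122), and category ids are handled as int rather than str throughout. A minimal sketch of what this means for downstream code; the example category values are invented:

# deepdoctection <= 0.31 (module removed in this release):
#   from deepdoctection.utils.detection_types import JsonDict
# deepdoctection >= 0.33:
from deepdoctection.utils.types import JsonDict

# Category ids are plain ints now, so int-keyed mappings replace str-keyed ones.
categories: dict[int, str] = {1: "text", 2: "title"}  # hypothetical mapping
record: JsonDict = {"category_id": 1, "category_name": categories[1]}
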
--- a/deepdoctection/mapper/d2struct.py
+++ b/deepdoctection/mapper/d2struct.py
@@ -19,26 +19,28 @@
 Module for mapping annotations into standard Detectron2 dataset dict. Also providing some tools for W&B mapping and
 visualising
 """
-
+from __future__ import annotations

 import os.path
-from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union
+from typing import Mapping, Optional, Sequence, Union

 import numpy as np
-import torch
+from lazy_imports import try_import

-from ..datapoint.annotation import ImageAnnotation
+from ..datapoint.annotation import DEFAULT_CATEGORY_ID, ImageAnnotation
 from ..datapoint.image import Image
 from ..extern.pt.nms import batched_nms
 from ..mapper.maputils import curry
-from ..utils.detection_types import JsonDict
-from ..utils.file_utils import detectron2_available, wandb_available
-from ..utils.settings import ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DefaultType, ObjectTypes, TypeOrStr, get_type
+from ..utils.types import Detectron2Dict
+
+with try_import() as pt_import_guard:
+    import torch

-if detectron2_available():
+with try_import() as d2_import_guard:
     from detectron2.structures import BoxMode

-if wandb_available():
+with try_import() as wb_import_guard:
     from wandb import Classes
     from wandb import Image as Wbimage

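Note: the import pattern above, which recurs in every touched module, replaces deepdoctection's own availability checks (detectron2_available(), wandb_available()) with the lazy-imports package. try_import is a context manager that records an ImportError instead of raising it, so the module stays importable without its optional dependencies. A self-contained sketch of the pattern; the check() call follows the lazy-imports README, so treat the exact guard API as an assumption:

from lazy_imports import try_import

# Any ImportError raised inside the block is swallowed and stored on the guard.
with try_import() as d2_import_guard:
    from detectron2.structures import BoxMode

def absolute_box_mode():
    # Re-raises the stored ImportError (with an install hint) only when the
    # optional dependency is actually used.
    d2_import_guard.check()
    return BoxMode.XYXY_ABS
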
@@ -47,8 +49,8 @@ if wandb_available():
 def image_to_d2_frcnn_training(
     dp: Image,
     add_mask: bool = False,
-    category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
-) -> Optional[JsonDict]:
+    category_names: Optional[Union[TypeOrStr, Sequence[TypeOrStr]]] = None,
+) -> Optional[Detectron2Dict]:
     """
     Maps an image to a standard dataset dict as described in
     <https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html>. It further checks if the image is physically
@@ -64,7 +66,7 @@ def image_to_d2_frcnn_training(
     if not os.path.isfile(dp.location) and dp.image is None:
         return None

-    output: JsonDict = {"file_name": str(dp.location)}
+    output: Detectron2Dict = {"file_name": str(dp.location)}

     if dp.image is not None:
         output["image"] = dp.image.astype("float32")
@@ -85,10 +87,10 @@
             box = box.transform(dp.width, dp.height, absolute_coords=True)

         # Detectron2 does not fully support BoxMode.XYXY_REL
-        mapped_ann: Dict[str, Union[str, int, List[float]]] = {
+        mapped_ann: dict[str, Union[str, int, list[float]]] = {
             "bbox_mode": BoxMode.XYXY_ABS,
             "bbox": box.to_list(mode="xyxy"),
-            "category_id": int(ann.category_id) - 1,
+            "category_id": ann.category_id - 1,
         }
         annotations.append(mapped_ann)

@@ -147,23 +149,23 @@ def pt_nms_image_annotations(

 def _get_category_attributes(
     ann: ImageAnnotation, cat_to_sub_cat: Optional[Mapping[ObjectTypes, ObjectTypes]] = None
-) -> Tuple[str, str, Optional[float]]:
+) -> tuple[ObjectTypes, int, Optional[float]]:
     if cat_to_sub_cat:
         sub_cat_key = cat_to_sub_cat.get(get_type(ann.category_name))
         if sub_cat_key in ann.sub_categories:
             sub_cat = ann.get_sub_category(sub_cat_key)
-            return sub_cat.category_name, sub_cat.category_id, sub_cat.score
-        return "", "", 0.0
-    return ann.category_name, ann.category_id, ann.score
+            return get_type(sub_cat.category_name), sub_cat.category_id, sub_cat.score
+        return DefaultType.DEFAULT_TYPE, DEFAULT_CATEGORY_ID, 0.0
+    return get_type(ann.category_name), ann.category_id, ann.score


 @curry
 def to_wandb_image(
     dp: Image,
-    categories: Mapping[str, TypeOrStr],
-    sub_categories: Optional[Mapping[str, TypeOrStr]] = None,
+    categories: Mapping[int, TypeOrStr],
+    sub_categories: Optional[Mapping[int, TypeOrStr]] = None,
     cat_to_sub_cat: Optional[Mapping[ObjectTypes, ObjectTypes]] = None,
-) -> Tuple[str, "Wbimage"]:
+) -> tuple[str, Wbimage]:
     """
     Converting a deepdoctection image into a wandb image

@@ -183,11 +185,10 @@ def to_wandb_image(
     anns = dp.get_annotation(category_names=list(categories.values()))

     if sub_categories:
-        class_labels = {int(key): val for key, val in sub_categories.items()}
-        class_set = Classes([{"name": val, "id": int(key)} for key, val in sub_categories.items()])
+        class_labels = dict(sub_categories.items())
+        class_set = Classes([{"name": val, "id": key} for key, val in sub_categories.items()])
     else:
-        class_labels = {int(key): val for key, val in categories.items()}
-        class_set = Classes([{"name": val, "id": int(key)} for key, val in categories.items()])
+        class_set = Classes([{"name": val, "id": key} for key, val in categories.items()])

     for ann in anns:
         bounding_box = ann.get_bounding_box(dp.image_id)
@@ -198,7 +199,7 @@
         box = {
             "position": {"middle": bounding_box.center, "width": bounding_box.width, "height": bounding_box.height},
             "domain": "pixel",
-            "class_id": int(category_id),
+            "class_id": category_id,
             "box_caption": category_name,
         }
         if score:
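
Note: to_wandb_image now takes categories keyed by int, so the int(key) casts disappear and ids are passed straight through to W&B. A short sketch of building a wandb.Classes set from such a mapping; the labels are invented:

from wandb import Classes

categories = {1: "table", 2: "figure"}  # int-keyed, as the new signature expects

# Mirrors the updated call in to_wandb_image: ids are used as-is, no int() casts.
class_set = Classes([{"name": val, "id": key} for key, val in categories.items()])
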
--- a/deepdoctection/mapper/hfstruct.py
+++ b/deepdoctection/mapper/hfstruct.py
@@ -19,26 +19,31 @@
 Module for mapping annotations into standard Huggingface Detr input structure for training
 """

+from __future__ import annotations
+
 import os
 from dataclasses import dataclass, field
-from typing import Dict, List, Literal, Optional, Sequence, Union
+from typing import Literal, Optional, Sequence, Union

 import numpy as np
-from transformers import BatchFeature, DetrFeatureExtractor
+from lazy_imports import try_import

 from ..datapoint.image import Image
 from ..mapper.maputils import curry
 from ..mapper.misc import get_load_image_func
-from ..utils.detection_types import JsonDict
-from ..utils.settings import ObjectTypes
+from ..utils.settings import TypeOrStr
 from ..utils.transform import PadTransform
+from ..utils.types import JsonDict
+
+with try_import() as tr_import_guard:
+    from transformers import BatchFeature, DetrFeatureExtractor


 @curry
 def image_to_hf_detr_training(
     dp: Image,
     add_mask: bool = False,
-    category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
+    category_names: Optional[Union[TypeOrStr, Sequence[Union[TypeOrStr]]]] = None,
 ) -> Optional[JsonDict]:
     """
     Maps an image to a detr input datapoint dict, that, after collating can be used for training.
@@ -71,11 +76,11 @@ image_to_hf_detr_training
     for ann in anns:
         box = ann.get_bounding_box(dp.image_id)

-        mapped_ann: Dict[str, Union[str, int, float, List[float]]] = {
+        mapped_ann: dict[str, Union[str, int, float, list[float]]] = {
             "id": "".join([c for c in ann.annotation_id if c.isdigit()])[:8],
             "image_id": "".join([c for c in dp.image_id if c.isdigit()])[:8],
             "bbox": box.to_list(mode="xywh"),
-            "category_id": int(ann.category_id) - 1,
+            "category_id": ann.category_id - 1,
             "area": box.area,
         }
         annotations.append(mapped_ann)
@@ -103,7 +108,7 @@ class DetrDataCollator:
     padder: Optional[PadTransform] = None
     return_tensors: Optional[Literal["pt"]] = field(default="pt")

-    def __call__(self, raw_features: List[JsonDict]) -> BatchFeature:
+    def __call__(self, raw_features: list[JsonDict]) -> BatchFeature:
         """
         Creating BatchFeature from a list of dict of raw features.

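Note: in both training mappers the label shift is now written ann.category_id - 1 without an int(...) cast, which works because 0.33 stores category ids as ints. deepdoctection keeps ids 1-based while Detectron2 and Hugging Face Detr expect 0-based contiguous labels; a toy illustration with invented values:

# deepdoctection category ids start at 1; detection frameworks want 0-based ids.
dd_categories = {1: "table", 2: "row", 3: "column"}  # hypothetical mapping

def to_framework_id(category_id: int) -> int:
    # The same shift as in image_to_d2_frcnn_training / image_to_hf_detr_training.
    return category_id - 1

assert [to_framework_id(i) for i in dd_categories] == [0, 1, 2]
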
--- a/deepdoctection/mapper/laylmstruct.py
+++ b/deepdoctection/mapper/laylmstruct.py
@@ -20,32 +20,30 @@ Module for mapping annotations from image to layout lm input structure. Heavily
 <https://github.com/NielsRogge/Transformers-Tutorials>
 """

+from __future__ import annotations
+
 import random
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Literal, NewType, Optional, Sequence, Union
+from typing import Any, Callable, Literal, NewType, Optional, Sequence, Union

 import numpy as np
 import numpy.typing as npt
+from lazy_imports import try_import

 from ..datapoint.annotation import ContainerAnnotation
 from ..datapoint.convert import box_to_point4, point4_to_box
 from ..datapoint.image import Image
-from ..utils.detection_types import JsonDict
-from ..utils.file_utils import pytorch_available, transformers_available
+from ..datapoint.view import Page
 from ..utils.settings import DatasetType, LayoutType, PageType, Relationships, WordType
 from ..utils.transform import ResizeTransform, normalize_image
+from ..utils.types import JsonDict
 from .maputils import curry

-if pytorch_available():
+with try_import() as import_guard:
     import torch

-if transformers_available():
-    from transformers import (  # pylint: disable=W0611
-        BatchEncoding,
-        PreTrainedTokenizerFast,
-        RobertaTokenizerFast,
-        XLMRobertaTokenizerFast,
-    )
+with try_import() as tr_import_guard:
+    from transformers import BatchEncoding, PreTrainedTokenizerFast  # pylint: disable=W0611

 __all__ = [
     "image_to_raw_layoutlm_features",
@@ -54,19 +52,24 @@ __all__ = [
     "image_to_layoutlm_features",
     "DataCollator",
     "LayoutLMFeatures",
+    "image_to_raw_lm_features",
+    "image_to_lm_features",
 ]

 RawLayoutLMFeatures = NewType("RawLayoutLMFeatures", JsonDict)
+RawLMFeatures = NewType("RawLMFeatures", JsonDict)
 LayoutLMFeatures = NewType("LayoutLMFeatures", JsonDict)
+LMFeatures = NewType("LMFeatures", JsonDict)
 InputDataClass = NewType("InputDataClass", JsonDict)

+
 """
 <https://github.com/huggingface/transformers/src/transformers/data/data_collator.py>
 A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
 of PyTorch/TensorFlow tensors or NumPy arrays.
 """

-DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]])  # type: ignore
+DataCollator = NewType("DataCollator", Callable[[list[InputDataClass]], dict[str, Any]])  # type: ignore

 _CLS_BOX = [0.0, 0.0, 1000.0, 1000.0]
 _SEP_BOX = [1000.0, 1000.0, 1000.0, 1000.0]
@@ -122,9 +125,9 @@ def image_to_raw_layoutlm_features(
     all_ann_ids = []
     all_words = []
     all_boxes = []
-    all_labels: List[int] = []
+    all_labels: list[int] = []

-    anns = dp.get_annotation_iter(category_names=LayoutType.word)
+    anns = dp.get_annotation_iter(category_names=LayoutType.WORD)

     word_id_to_segment_box = {}
     if segment_positions:
@@ -136,12 +139,12 @@
             if not bounding_box.absolute_coords:
                 bounding_box = bounding_box.transform(dp.width, dp.height, absolute_coords=True)
             word_id_to_segment_box.update(
-                {word_ann: bounding_box for word_ann in segm_ann.get_relationship(Relationships.child)}
+                {word_ann: bounding_box for word_ann in segm_ann.get_relationship(Relationships.CHILD)}
             )

     for ann in anns:
         all_ann_ids.append(ann.annotation_id)
-        char_cat = ann.get_sub_category(WordType.characters)
+        char_cat = ann.get_sub_category(WordType.CHARACTERS)
         if not isinstance(char_cat, ContainerAnnotation):
             raise TypeError(f"char_cat must be of type ContainerAnnotation but is of type {type(char_cat)}")
         word = char_cat.value
@@ -155,15 +158,15 @@
         all_boxes.append(word_id_to_segment_box.get(ann.annotation_id, box).to_list(mode="xyxy"))

         if (
-            WordType.token_tag in ann.sub_categories or WordType.token_class in ann.sub_categories
-        ) and dataset_type == DatasetType.token_classification:
+            WordType.TOKEN_TAG in ann.sub_categories or WordType.TOKEN_CLASS in ann.sub_categories
+        ) and dataset_type == DatasetType.TOKEN_CLASSIFICATION:
             if use_token_tag:
-                all_labels.append(int(ann.get_sub_category(WordType.token_tag).category_id) - 1)
+                all_labels.append(ann.get_sub_category(WordType.TOKEN_TAG).category_id - 1)
             else:
-                all_labels.append(int(ann.get_sub_category(WordType.token_class).category_id) - 1)
+                all_labels.append(ann.get_sub_category(WordType.TOKEN_CLASS).category_id - 1)

-    if dp.summary is not None and dataset_type == DatasetType.sequence_classification:
-        all_labels.append(int(dp.summary.get_sub_category(PageType.document_type).category_id) - 1)
+    if dataset_type == DatasetType.SEQUENCE_CLASSIFICATION:
+        all_labels.append(dp.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id - 1)

     boxes = np.asarray(all_boxes, dtype="float32")
     if boxes.ndim == 1:
@@ -208,7 +211,7 @@
     return raw_features


-def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
+def layoutlm_features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """
     Converting list of floats to pytorch tensors
     :param features: LayoutLMFeatures
@@ -216,7 +219,8 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """

     _image_key = "pixel_values" if "pixel_values" in features else "image"
-    features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
+    if "bbox" in features:
+        features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
     if "labels" in features:
         features["labels"] = torch.tensor(features["labels"], dtype=torch.long)
     if _image_key in features:
@@ -230,12 +234,12 @@


 def _tokenize_with_sliding_window(
-    raw_features: List[RawLayoutLMFeatures],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: list[Union[RawLayoutLMFeatures, RawLMFeatures]],
+    tokenizer: PreTrainedTokenizerFast,
     sliding_window_stride: int,
     max_batch_size: int,
     return_tensors: Optional[Literal["pt"]] = None,
-) -> Union[JsonDict, "BatchEncoding"]:
+) -> Union[JsonDict, BatchEncoding]:
     """
     Runs a tokenizer: If there are no overflowing tokens, the tokenizer output will be returned as it is.
     If there are overflowing tokens, sliding windows have to be built. As it is easier to prepare the sliding windows
@@ -381,7 +385,7 @@
             )
         )

-    slided_tokenized_inputs: Dict[str, Union[List[Union[str, int]], torch.Tensor]] = {}
+    slided_tokenized_inputs: dict[str, Union[list[Union[str, int]], torch.Tensor]] = {}
     if return_tensors == "pt":
         slided_tokenized_inputs["overflow_to_sample_mapping"] = torch.tensor(overflow_to_sample_mapping)
         slided_tokenized_inputs["input_ids"] = torch.tensor(all_input_ids)
@@ -398,8 +402,8 @@


 def raw_features_to_layoutlm_features(
-    raw_features: Union[RawLayoutLMFeatures, List[RawLayoutLMFeatures]],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: Union[RawLayoutLMFeatures, RawLMFeatures, list[Union[RawLayoutLMFeatures, RawLMFeatures]]],
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
@@ -407,6 +411,7 @@
     remove_columns_for_training: bool = False,
     sliding_window_stride: int = 0,
     max_batch_size: int = 0,
+    remove_bounding_boxes: bool = False,
 ) -> LayoutLMFeatures:
     """
     Mapping raw features to tokenized input sequences for LayoutLM models.
@@ -442,11 +447,11 @@
         raw_features = [raw_features]

     _has_token_labels = (
-        raw_features[0]["dataset_type"] == DatasetType.token_classification
+        raw_features[0]["dataset_type"] == DatasetType.TOKEN_CLASSIFICATION
         and raw_features[0].get("labels") is not None
     )
     _has_sequence_labels = (
-        raw_features[0]["dataset_type"] == DatasetType.sequence_classification
+        raw_features[0]["dataset_type"] == DatasetType.SEQUENCE_CLASSIFICATION
         and raw_features[0].get("labels") is not None
     )
     _has_labels = bool(_has_token_labels or _has_sequence_labels)
@@ -563,8 +568,11 @@
         input_dict.pop("ann_ids")
         input_dict.pop("tokens")

+    if remove_bounding_boxes:
+        input_dict.pop("bbox")
+
     if return_tensors == "pt":
-        return features_to_pt_tensors(LayoutLMFeatures(input_dict))
+        return layoutlm_features_to_pt_tensors(LayoutLMFeatures(input_dict))
     return LayoutLMFeatures(input_dict)


@@ -595,13 +603,14 @@ class LayoutLMDataCollator:
                                   with windows shifted `sliding_window_stride` to the right.
     """

-    tokenizer: "PreTrainedTokenizerFast"
+    tokenizer: PreTrainedTokenizerFast
     padding: Literal["max_length", "do_not_pad", "longest"] = field(default="max_length")
     truncation: bool = field(default=True)
     return_overflowing_tokens: bool = field(default=False)
     return_tensors: Optional[Literal["pt"]] = field(default=None)
     sliding_window_stride: int = field(default=0)
     max_batch_size: int = field(default=0)
+    remove_bounding_box_features: bool = field(default=False)

     def __post_init__(self) -> None:
         assert isinstance(self.tokenizer, PreTrainedTokenizerFast), "Tokenizer must be a fast tokenizer"
@@ -611,7 +620,7 @@
         if self.return_overflowing_tokens:
             assert self.truncation, self.truncation

-    def __call__(self, raw_features: Union[RawLayoutLMFeatures, List[RawLayoutLMFeatures]]) -> LayoutLMFeatures:
+    def __call__(self, raw_features: Union[RawLayoutLMFeatures, list[RawLayoutLMFeatures]]) -> LayoutLMFeatures:
         """
         Calling the DataCollator to form model inputs for training and inference. Takes a single raw
         :param raw_features: A dictionary with the following arguments: `image_id, width, height, ann_ids, words,
@@ -620,7 +629,7 @@
                              token_type_ids, attention_masks, boxes, labels`.
         """
         return raw_features_to_layoutlm_features(
-            raw_features,
+            raw_features,  # type: ignore
             self.tokenizer,
             self.padding,
             self.truncation,
@@ -629,13 +638,14 @@
             True,
             self.sliding_window_stride,
             self.max_batch_size,
+            self.remove_bounding_box_features,
         )


 @curry
 def image_to_layoutlm_features(
     dp: Image,
-    tokenizer: "PreTrainedTokenizerFast",
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
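
Note: LayoutLMDataCollator gains a remove_bounding_box_features field, threaded through to the new remove_bounding_boxes argument of raw_features_to_layoutlm_features, so that purely text-based models can drop the dummy boxes before batching. A construction sketch under that 0.33 signature; the tokenizer checkpoint is only an example:

from transformers import AutoTokenizer

from deepdoctection.mapper.laylmstruct import LayoutLMDataCollator

# Any fast tokenizer passes the collator's PreTrainedTokenizerFast assertion.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# remove_bounding_box_features=True pops the "bbox" entry before batching,
# matching the new remove_bounding_boxes branch shown above.
collator = LayoutLMDataCollator(tokenizer, return_tensors="pt", remove_bounding_box_features=True)
# batch = collator(raw_features)  # raw_features: RawLMFeatures or a list of them
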
@@ -724,3 +734,134 @@ def image_to_layoutlm_features(
         sliding_window_stride=sliding_window_stride,
     )
     return features
+
+
+@curry
+def image_to_raw_lm_features(
+    dp: Image,
+    dataset_type: Optional[Literal["sequence_classification", "token_classification"]] = None,
+    use_token_tag: bool = True,
+    text_container: Optional[LayoutType] = LayoutType.WORD,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[RawLMFeatures]:
+    """
+    Mapping a datapoint into an intermediate format for bert-like models. Features will be provided into a dict and
+    this mapping can be used for sequence or token classification as well as for inference. To generate input features
+    for the model please `use raw_features_to_layoutlm_features`.
+
+
+    :param dp: Image
+    :param dataset_type: Either SEQUENCE_CLASSIFICATION or TOKEN_CLASSIFICATION. When using a built-in dataset use
+    :param use_token_tag: Will only be used for dataset_type="token_classification". If use_token_tag=True, will use
+                          labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
+                          `WordType.token_class`.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: dictionary with the following arguments:
+             'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+    """
+
+    raw_features: RawLMFeatures = RawLMFeatures({})
+
+    page = Page.from_image(dp, text_container, floating_text_block_categories, include_residual_text_container)
+
+    text_ = page.text_
+
+    # pylint: disable=E1137  #3162
+    raw_features["image_id"] = page.image_id
+    raw_features["width"] = page.width
+    raw_features["height"] = page.height
+    raw_features["ann_ids"] = text_["ann_ids"]
+    raw_features["words"] = text_["words"]
+    # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
+    # raw_features_to_layoutlm_features
+    raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+    raw_features["dataset_type"] = dataset_type
+
+    if use_token_tag and text_["token_tags"]:
+        raw_features["labels"] = text_["token_tags"]
+    elif text_["token_classes"]:
+        raw_features["labels"] = text_["token_classes"]
+    elif page.document_type is not None:
+        document_type_id = page.image_orig.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id - 1
+        raw_features["labels"] = [document_type_id]
+
+    raw_features["dataset_type"] = dataset_type
+    # pylint: enable=E1137
+    return raw_features
+
+
+@curry
+def image_to_lm_features(
+    dp: Image,
+    tokenizer: PreTrainedTokenizerFast,
+    padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
+    truncation: bool = True,
+    return_overflowing_tokens: bool = False,
+    return_tensors: Optional[Literal["pt"]] = "pt",
+    sliding_window_stride: int = 0,
+    text_container: Optional[LayoutType] = LayoutType.WORD,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[LayoutLMFeatures]:
+    """
+    Mapping function to generate layoutlm features from `Image` to be used for inference in a pipeline component.
+    `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
+    with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
+    used internally in `LMTokenClassifierService`.
+
+        tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
+                                             categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+
+        layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
+
+    :param dp: Image datapoint
+    :param tokenizer: Tokenizer compatible with the language model
+    :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
+                    `do_not_pad`.
+    :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
+                       maximum acceptable input length for the model if that argument is not provided. This will
+                       truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                       sequences (or a batch of pairs) is provided.
+                       If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
+                       model maximum admissible input size).
+    :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                                      can be returned as an additional batch element. Not that in this case, the number
+                                      of input batch samples will be smaller than the output batch samples.
+    :param return_tensors: Output tensor features. Either 'pt' for PyTorch models or None, if features should be
+                           returned in list objects.
+    :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length a sliding
+                                  windows will be created with each window having max_length sequence input. When using
+                                  `sliding_window_stride=0` no strides will be created, otherwise it will create slides
+                                  with windows shifted `sliding_window_stride` to the right.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: A dict of lm features
+    """
+    raw_features = image_to_raw_lm_features(  # pylint: disable=E1102
+        dataset_type=None,
+        use_token_tag=True,
+        text_container=text_container,
+        floating_text_block_categories=floating_text_block_categories,
+        include_residual_text_container=include_residual_text_container,
+    )(dp)
+    if raw_features is None:
+        return None
+    features = raw_features_to_layoutlm_features(
+        raw_features,
+        tokenizer,
+        padding,
+        truncation,
+        return_overflowing_tokens,
+        return_tensors=return_tensors,
+        sliding_window_stride=sliding_window_stride,
+    )
+    return features
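
Note: image_to_raw_lm_features fills bbox with the constant _CLS_BOX precisely so the existing raw_features_to_layoutlm_features machinery can be reused for box-free, bert-like models. Like the other mappers it is curried: configure once, then apply per datapoint. A rough usage sketch; the checkpoint name is an example:

from transformers import AutoTokenizer

from deepdoctection.mapper.laylmstruct import image_to_lm_features

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# The mapper is curried: fix the configuration first, then feed Image datapoints,
# e.g. from inside an LMTokenClassifierService-style pipeline component.
mapper = image_to_lm_features(tokenizer=tokenizer, return_tensors="pt")
# features = mapper(dp)  # dp: deepdoctection Image -> tokenized features
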
--- a/deepdoctection/mapper/maputils.py
+++ b/deepdoctection/mapper/maputils.py
@@ -18,20 +18,22 @@
 """
 Utility functions related to mapping tasks
 """
+from __future__ import annotations
+
 import functools
 import itertools
 import traceback
 from types import TracebackType
-from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union
+from typing import Any, Callable, Mapping, Optional, Sequence, Union

 import numpy as np
 from tabulate import tabulate
 from termcolor import colored

-from ..utils.detection_types import DP, BaseExceptionType, S, T
 from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import ObjectTypes
+from ..utils.types import DP, BaseExceptionType, S, T

 __all__ = ["MappingContextManager", "DefaultMapper", "maybe_get_fake_score", "LabelSummarizer", "curry"]

@@ -43,7 +45,7 @@ class MappingContextManager:
     """

     def __init__(
-        self, dp_name: Optional[str] = None, filter_level: str = "image", **kwargs: Dict[str, Optional[str]]
+        self, dp_name: Optional[str] = None, filter_level: str = "image", **kwargs: dict[str, Optional[str]]
     ) -> None:
         """
         :param dp_name: A name for the datapoint to be mapped
@@ -55,7 +57,7 @@
         self.context_error = True
         self.kwargs = kwargs

-    def __enter__(self) -> "MappingContextManager":
+    def __enter__(self) -> MappingContextManager:
         """
         context enter
         """
@@ -79,6 +81,7 @@
             AssertionError,
             TypeError,
             FileNotFoundError,
+            AttributeError,
             BoundingBoxError,
             AnnotationError,
             ImageError,
@@ -190,7 +193,7 @@ class LabelSummarizer:

     """

-    def __init__(self, categories: Mapping[str, ObjectTypes]) -> None:
+    def __init__(self, categories: Mapping[int, ObjectTypes]) -> None:
         """
         :param categories: A dict of categories as given as in categories.get_categories().
         """
@@ -208,11 +211,11 @@
         np_item = np.asarray(item, dtype="int8")
         self.summary += np.histogram(np_item, bins=self.hist_bins)[0]

-    def get_summary(self) -> Dict[str, np.int32]:
+    def get_summary(self) -> dict[int, int]:
         """
         Get a dictionary with category ids and the number dumped
         """
-        return dict(list(zip(self.categories.keys(), self.summary.astype(np.int32))))
+        return dict(list(zip(self.categories.keys(), self.summary.tolist())))

     def print_summary_histogram(self, dd_logic: bool = True) -> None:
         """
@@ -221,11 +224,9 @@
         :param dd_logic: Follow dd category convention when printing histogram (last background bucket omitted).
         """
         if dd_logic:
-            data = list(itertools.chain(*[[self.categories[str(i)].value, v] for i, v in enumerate(self.summary, 1)]))
+            data = list(itertools.chain(*[[self.categories[i].value, v] for i, v in enumerate(self.summary, 1)]))
         else:
-            data = list(
-                itertools.chain(*[[self.categories[str(i + 1)].value, v] for i, v in enumerate(self.summary[:-1])])
-            )
+            data = list(itertools.chain(*[[self.categories[i + 1].value, v] for i, v in enumerate(self.summary[:-1])]))
         num_columns = min(6, len(data))
         total_img_anns = sum(data[1::2])
         data.extend([None] * ((num_columns - len(data) % num_columns) % num_columns))
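
Note: with Mapping[int, ObjectTypes] keys and a plain-int summary, LabelSummarizer no longer round-trips category ids through str. A small usage sketch under the 0.33 signature; the chosen layout types and the dump call are assumptions inferred from the methods visible above:

from deepdoctection.mapper.maputils import LabelSummarizer
from deepdoctection.utils.settings import LayoutType

categories = {1: LayoutType.TEXT, 2: LayoutType.TITLE}  # int keys, example values
summarizer = LabelSummarizer(categories)

summarizer.dump([1, 1, 2])  # three annotations: two text, one title
print(summarizer.get_summary())  # expected: {1: 2, 2: 1} with plain int counts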