deepdoctection 0.31-py3-none-any.whl → 0.33-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.
Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflm.py (new file)
@@ -0,0 +1,230 @@
+ # -*- coding: utf-8 -*-
+ # File: hfml.py
+
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ Wrapper for the Hugging Face Language Model for sequence and token classification
+ """
+ from __future__ import annotations
+
+ from abc import ABC
+ from pathlib import Path
+ from typing import Literal, Mapping, Optional, Union
+
+ from lazy_imports import try_import
+
+ from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
+ from ..utils.settings import TypeOrStr
+ from ..utils.types import JsonDict, PathLikeOrStr, Requirement
+ from .base import LMSequenceClassifier, ModelCategories, SequenceClassResult
+ from .hflayoutlm import get_tokenizer_from_model_class
+ from .pt.ptutils import get_torch_device
+
+ with try_import() as pt_import_guard:
+     import torch
+     import torch.nn.functional as F
+
+ with try_import() as tr_import_guard:
+     from transformers import PretrainedConfig, XLMRobertaForSequenceClassification
+
+
+ def predict_sequence_classes(
+     input_ids: torch.Tensor,
+     attention_mask: torch.Tensor,
+     token_type_ids: torch.Tensor,
+     model: Union[XLMRobertaForSequenceClassification],
+ ) -> SequenceClassResult:
+     """
+     :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
+     :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
+     :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
+     :param model: layoutlm model for sequence classification
+     :return: SequenceClassResult
+     """
+
+     outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+
+     score = torch.max(F.softmax(outputs.logits)).tolist()
+     sequence_class_predictions = outputs.logits.argmax(-1).squeeze().tolist()
+
+     return SequenceClassResult(class_id=sequence_class_predictions, score=float(score)) # type: ignore
+
+
+ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
+     """
+     Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
+     """
+
+     def __init__(
+         self,
+         path_config_json: PathLikeOrStr,
+         path_weights: PathLikeOrStr,
+         categories: Mapping[int, TypeOrStr],
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+     ):
+         self.path_config = Path(path_config_json)
+         self.path_weights = Path(path_weights)
+         self.categories = ModelCategories(init_categories=categories)
+
+         self.device = get_torch_device(device)
+
+     @classmethod
+     def get_requirements(cls) -> list[Requirement]:
+         return [get_pytorch_requirement(), get_transformers_requirement()]
+
+     def clone(self) -> HFLmSequenceClassifierBase:
+         return self.__class__(self.path_config, self.path_weights, self.categories.get_categories(), self.device)
+
+     def _validate_encodings(
+         self, **encodings: Union[list[list[str]], torch.Tensor]
+     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         input_ids = encodings.get("input_ids")
+         attention_mask = encodings.get("attention_mask")
+         token_type_ids = encodings.get("token_type_ids")
+
+         if isinstance(input_ids, torch.Tensor):
+             input_ids = input_ids.to(self.device)
+         else:
+             raise ValueError(f"input_ids must be list but is {type(input_ids)}")
+         if isinstance(attention_mask, torch.Tensor):
+             attention_mask = attention_mask.to(self.device)
+         else:
+             raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
+         if isinstance(token_type_ids, torch.Tensor):
+             token_type_ids = token_type_ids.to(self.device)
+         else:
+             raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
+
+         input_ids = input_ids.to(self.device)
+         attention_mask = attention_mask.to(self.device)
+         token_type_ids = token_type_ids.to(self.device)
+         return input_ids, attention_mask, token_type_ids
+
+     @staticmethod
+     def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
+         """Returns the name of the model"""
+         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+     @staticmethod
+     def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
+         """A refinement for adding the tokenizer class name to the model configs.
+
+         :param model_class_name: The model name, e.g. model.__class__.__name__
+         :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+         """
+         tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
+         return tokenizer.__class__.__name__
+
+     @staticmethod
+     def image_to_raw_features_mapping() -> str:
+         """Returns the mapping function to convert images into raw features."""
+         return "image_to_raw_lm_features"
+
+     @staticmethod
+     def image_to_features_mapping() -> str:
+         """Returns the mapping function to convert images into features."""
+         return "image_to_lm_features"
+
+
+ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
+     """
+     A wrapper class for `transformers.XLMRobertaForSequenceClassification` and similar models to use within a pipeline
+     component. Check <https://huggingface.co/docs/transformers/model_doc/xlm-roberta> for documentation of the
+     model itself.
+     Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
+     classification and other things please use another model of the family.
+
+     **Example**
+
+         # setting up compulsory ocr service
+         tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+         tess = TesseractOcrDetector(tesseract_config_path)
+         ocr_service = TextExtractionService(tess)
+
+         # hf tokenizer and token classifier
+         tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
+         roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
+                                          categories=["handwritten", "presentation", "resume"])
+
+         # token classification service
+         roberta_service = LMSequenceClassifierService(tokenizer,roberta)
+
+         pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
+
+         path = "path/to/some/form"
+         df = pipe.analyze(path=path)
+
+         for dp in df:
+             ...
+     """
+
+     def __init__(
+         self,
+         path_config_json: PathLikeOrStr,
+         path_weights: PathLikeOrStr,
+         categories: Mapping[int, TypeOrStr],
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = True,
+     ):
+         super().__init__(path_config_json, path_weights, categories, device)
+         self.name = self.get_name(path_weights, "bert-like")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         self.model.to(self.device)
+         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+             self.model.__class__.__name__, use_xlm_tokenizer
+         )
+
+     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
+         input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)
+
+         result = predict_sequence_classes(
+             input_ids,
+             attention_mask,
+             token_type_ids,
+             self.model,
+         )
+
+         result.class_id += 1
+         result.class_name = self.categories.categories[result.class_id]
+         return result
+
+     @staticmethod
+     def get_wrapped_model(
+         path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+     ) -> XLMRobertaForSequenceClassification:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return XLMRobertaForSequenceClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+
+     @staticmethod
+     def default_kwargs_for_image_to_features_mapping() -> JsonDict:
+         """
+         Add some default arguments that might be necessary when preparing a sample. Overwrite this method
+         for some custom setting.
+         """
+         return {}
+
+     def clear_model(self) -> None:
+         self.model = None
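
For orientation, the sketch below shows how the new HFLmSequenceClassifier added in this release could be exercised directly, outside a full DoctectionPipe. It is a minimal sketch, not part of the diff: the module path deepdoctection.extern.hflm is inferred from the file list above, the tokenizer checkpoint is the one named in the class docstring, and the config/weights paths and category mapping are hypothetical placeholders for a locally fine-tuned model.

    from transformers import XLMRobertaTokenizerFast

    from deepdoctection.extern.hflm import HFLmSequenceClassifier

    # Hypothetical local artifacts: a config.json and weights file for a fine-tuned
    # XLM-RoBERTa sequence classifier, plus the 1-based category mapping that
    # ModelCategories expects.
    classifier = HFLmSequenceClassifier(
        path_config_json="path/to/config.json",
        path_weights="path/to/model.bin",
        categories={1: "handwritten", 2: "presentation", 3: "resume"},
    )

    tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
    encodings = tokenizer(
        "Some text extracted from a scanned page",
        return_tensors="pt",
        return_token_type_ids=True,
        truncation=True,
    )

    # predict() checks that input_ids, attention_mask and token_type_ids are tensors,
    # moves them to the configured device and shifts the argmax class id by +1 so it
    # matches the 1-based category mapping.
    result = classifier.predict(**encodings)
    print(result.class_name, result.score)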