deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (124)
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +919 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +162 -108
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +205 -119
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +26 -17
  104. deepdoctection/utils/env_info.py +86 -37
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -71
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.1.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.1.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0

deepdoctection/extern/hflm.py

@@ -16,7 +16,7 @@
 # limitations under the License.
 
 """
-Wrapper for the Hugging Face Language Model for sequence and token classification
+Wrapper for the HF Language Model for sequence and token classification
 """
 from __future__ import annotations
 
@@ -48,11 +48,14 @@ def predict_sequence_classes(
     model: Union[XLMRobertaForSequenceClassification],
 ) -> SequenceClassResult:
     """
-    :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
-    :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
-    :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
-    :param model: layoutlm model for sequence classification
-    :return: SequenceClassResult
+    Args:
+        input_ids: Token converted to ids to be taken from `XLMRobertaTokenizer`
+        attention_mask: The associated attention masks from padded sequences taken from `XLMRobertaTokenizer`
+        token_type_ids: Torch tensor of token type ids taken from `XLMRobertaTokenizer`
+        model: `XLMRobertaForSequenceClassification` model for sequence classification
+
+    Returns:
+        `SequenceClassResult`
     """
 
     outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
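A minimal sketch of how the three tensors documented in this hunk are typically produced with a Hugging Face tokenizer and passed to `predict_sequence_classes`. Only the argument names and the return type come from the docstring above; the import path, the checkpoint name, and `num_labels` are assumptions for illustration.

```python
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizerFast

# Assumed import path: the function is defined in the module shown in this diff.
from deepdoctection.extern.hflm import predict_sequence_classes

# Hypothetical checkpoint; any XLM-R sequence classification checkpoint would do.
tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
model = XLMRobertaForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base", num_labels=3)

# Build the tensors named in the docstring; token_type_ids are requested explicitly
# because the function expects them as a separate argument.
encoding = tokenizer(
    "a short sample sequence",
    return_tensors="pt",
    padding=True,
    truncation=True,
    return_token_type_ids=True,
)

result = predict_sequence_classes(
    input_ids=encoding["input_ids"],
    attention_mask=encoding["attention_mask"],
    token_type_ids=encoding["token_type_ids"],
    model=model,
)
# result is a SequenceClassResult with the predicted class
```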
@@ -65,7 +68,7 @@ def predict_sequence_classes(
 
 class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     """
-    Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
+    Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
     """
 
     def __init__(
@@ -115,27 +118,51 @@ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
 
     @staticmethod
     def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
-        """Returns the name of the model"""
+        """
+        Returns the name of the model
+
+        Args:
+            path_weights: Path to model weights
+            architecture: Architecture name
+
+        Returns:
+            str: Model name
+        """
         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
 
     @staticmethod
     def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
-        """A refinement for adding the tokenizer class name to the model configs.
+        """
+        A refinement for adding the tokenizer class name to the model configs.
+
+        Args:
+            model_class_name: The model name, e.g. `model.__class__.__name__`
+            use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
 
-        :param model_class_name: The model name, e.g. model.__class__.__name__
-        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+        Returns:
+            str: Tokenizer class name
         """
         tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
         return tokenizer.__class__.__name__
 
     @staticmethod
     def image_to_raw_features_mapping() -> str:
-        """Returns the mapping function to convert images into raw features."""
+        """
+        Returns the mapping function to convert images into raw features.
+
+        Returns:
+            str: Name of the mapping function
+        """
         return "image_to_raw_lm_features"
 
     @staticmethod
     def image_to_features_mapping() -> str:
-        """Returns the mapping function to convert images into features."""
+        """
+        Returns the mapping function to convert images into features.
+
+        Returns:
+            str: Name of the mapping function
+        """
         return "image_to_lm_features"
 
 
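The naming convention implemented by `get_name` can be read directly from the `return` statement in the hunk above; a small sketch with a hypothetical weights path:

```python
from pathlib import Path

# Hypothetical values, for illustration only
path_weights = "xlm-roberta-base/model_final.pt"
architecture = "XLMRobertaForSequenceClassification"

# Same expression as in the hunk above: the architecture plus the last two path components
name = f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
# -> 'Transformers_XLMRobertaForSequenceClassification_xlm-roberta-base_model_final.pt'
```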
@@ -147,28 +174,29 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
     Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
     classification and other things please use another model of the family.
 
-    **Example**
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
 
-        # setting up compulsory ocr service
-        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-        tess = TesseractOcrDetector(tesseract_config_path)
-        ocr_service = TextExtractionService(tess)
+        # hf tokenizer and token classifier
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
+        roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
+                                         categories=["handwritten", "presentation", "resume"])
 
-        # hf tokenizer and token classifier
-        tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
-        roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
-                                         categories=["handwritten", "presentation", "resume"])
+        # token classification service
+        roberta_service = LMSequenceClassifierService(tokenizer,roberta)
 
-        # token classification service
-        roberta_service = LMSequenceClassifierService(tokenizer,roberta)
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
 
-        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
 
-        path = "path/to/some/form"
-        df = pipe.analyze(path=path)
-
-        for dp in df:
-            ...
+        for dp in df:
+            ...
+        ```
     """
 
     def __init__(
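For readability, the example added in this hunk written out as a self-contained script. The top-level `deepdoctection` re-exports and the `transformers` import are assumptions; the docstring itself shows the calls without imports, and the config/weights paths are placeholders taken over from it.

```python
import deepdoctection as dd  # top-level re-exports assumed
from transformers import XLMRobertaTokenizerFast

# compulsory OCR service
tesseract_config_path = dd.ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
tess = dd.TesseractOcrDetector(tesseract_config_path)
ocr_service = dd.TextExtractionService(tess)

# HF tokenizer and sequence classifier
tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
roberta = dd.HFLmSequenceClassifier(
    "path/to/config.json",
    "path/to/model.bin",
    categories=["handwritten", "presentation", "resume"],
)

# sequence classification service and pipeline
roberta_service = dd.LMSequenceClassifierService(tokenizer, roberta)
pipe = dd.DoctectionPipe(pipeline_component_list=[ocr_service, roberta_service])

df = pipe.analyze(path="path/to/some/form")
for dp in df:
    ...
```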
@@ -209,9 +237,12 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
         """
         Get the inner (wrapped) model.
 
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :return: 'nn.Module'
+        Args:
+            path_config_json: path to .json config file
+            path_weights: path to model artifact
+
+        Returns:
+            `XLMRobertaForSequenceClassification`
         """
         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
         return XLMRobertaForSequenceClassification.from_pretrained(
@@ -223,6 +254,9 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
         """
         Add some default arguments that might be necessary when preparing a sample. Overwrite this method
         for some custom setting.
+
+        Returns:
+            JsonDict: Dictionary with default arguments
         """
         return {}
 