deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/lm.py CHANGED
@@ -37,31 +37,34 @@ if TYPE_CHECKING:
 @pipeline_component_registry.register("LMTokenClassifierService")
 class LMTokenClassifierService(PipelineComponent):
     """
-    Pipeline component for token classification
+    Module for token classification pipeline.

-    **Example**
+    This module provides pipeline components for token and sequence classification using language models.

-        # setting up compulsory ocr service
-        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-        tess = TesseractOcrDetector(tesseract_config_path)
-        ocr_service = TextExtractionService(tess)
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)

-        # hf tokenizer and token classifier
-        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
-        layoutlm = HFLayoutLmTokenClassifier(categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
-                                                          'E-header', 'E-question', 'I-answer', 'I-header',
-                                                          'I-question', 'O', 'S-answer', 'S-header', 'S-question'])
+        # hf tokenizer and token classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmTokenClassifier(categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                          'E-header', 'E-question', 'I-answer', 'I-header',
+                                                          'I-question', 'O', 'S-answer', 'S-header', 'S-question'])

-        # token classification service
-        layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
+        # token classification service
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)

-        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, layoutlm_service])

-        path = "path/to/some/form"
-        df = pipe.analyze(path=path)
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)

-        for dp in df:
-            ...
+        for dp in df:
+            ...
+        ```
     """

     def __init__(
@@ -76,32 +79,65 @@ class LMTokenClassifierService(PipelineComponent):
         sliding_window_stride: int = 0,
     ) -> None:
         """
-        :param tokenizer: Token classifier, typing allows currently anything. This will be changed in the future
-        :param language_model: language model token classifier
-        :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
-                        `do_not_pad`.
-        :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
-                           maximum acceptable input length for the model if that argument is not provided. This will
-                           truncate token by token, removing a token from the longest sequence in the pair if a pair of
-                           sequences (or a batch of pairs) is provided.
-                           If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
-                           model maximum admissible input size).
-        :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
-                           can be returned as an additional batch element. Not that in this case, the number of input
-                           batch samples will be smaller than the output batch samples.
-        :param use_other_as_default_category: When predicting token classes, it might be possible that some words might
-                                              not get sent to the model because they are categorized as not eligible
-                                              token (e.g. empty string). If set to `True` it will assign all words
-                                              without token the `BioTag.outside` token.
-        :param segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly
-                                  for models that have been trained on segments rather than words.
-                                  Choose a single or a sequence of layout segments to use their bounding boxes. Note, that
-                                  the layout segments need to have a child-relationship with words. If a word does not
-                                  appear as child, it will use the word bounding box.
-        :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length, a sliding
-                                      windows will be created with each window having max_length sequence input. When using
-                                      `sliding_window_stride=0` no strides will be created, otherwise it will create slides
-                                      with windows shifted `sliding_window_stride` to the right.
+        Pipeline component for token classification.
+
+        Example:
+            ```python
+            # setting up compulsory ocr service
+            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+            tess = TesseractOcrDetector(tesseract_config_path)
+            ocr_service = TextExtractionService(tess)
+
+            # hf tokenizer and token classifier
+            tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+            layoutlm = HFLayoutLmTokenClassifier(categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                              'E-header', 'E-question', 'I-answer', 'I-header',
+                                                              'I-question', 'O', 'S-answer', 'S-header', 'S-question'])
+
+            # token classification service
+            layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+
+            pipe = DoctectionPipe(pipeline_component_list=[ocr_service, layoutlm_service])
+
+            path = "path/to/some/form"
+            df = pipe.analyze(path=path)
+
+            for dp in df:
+                ...
+            ```
+
+        Args:
+            tokenizer: `Token classifier`, typing allows currently anything. This will be changed in the future.
+            language_model: `language model token classifier`.
+            padding: A padding strategy to be passed to the `tokenizer`. Must be either `max_length`, `longest` or
+                     `do_not_pad`.
+            truncation: If `True` will truncate to a maximum length specified with the argument `max_length` or to
+                        the maximum acceptable input length for the model if that argument is not provided. This
+                        will truncate token by token, removing a token from the longest sequence in the pair if a
+                        pair of sequences (or a batch of pairs) is provided. If `False` then no truncation (i.e.,
+                        can output batch with sequence lengths greater than the model maximum admissible input
+                        size).
+            return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                                       can be returned as an additional batch element. Note that in this case, the
+                                       number of input batch samples will be smaller than the output batch samples.
+            use_other_as_default_category: When predicting token classes, it might be possible that some words
+                                           might not get sent to the model because they are categorized as not
+                                           eligible token (e.g. empty string). If set to `True` it will assign all
+                                           words without token the `BioTag.outside` token.
+            segment_positions: Using bounding boxes of segment instead of words improves model accuracy
+                               significantly for models that have been trained on segments rather than words.
+                               Choose a single or a sequence of layout segments to use their bounding boxes. Note,
+                               that the layout segments need to have a child-relationship with words. If a word
+                               does not appear as child, it will use the word bounding box.
+            sliding_window_stride: If the output of the `tokenizer` exceeds the `max_length` sequence length, a
+                                   sliding window will be created with each window having `max_length` sequence
+                                   input. When using `sliding_window_stride=0` no strides will be created,
+                                   otherwise it will create slides with windows shifted `sliding_window_stride` to
+                                   the right.
+
+        Note:
+            If `use_other_as_default_category` is set, words without eligible tokens will be assigned the
+            `BioTag.outside` token.
         """
         self.language_model = language_model
         self.padding = padding
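For orientation, the keyword arguments documented in the `Args` block above can be combined roughly as follows. This is an illustrative sketch, not part of the diff: the argument values and the category list are examples, and the import paths merely follow the module layout shown in the file list above.

```python
from transformers import LayoutLMTokenizerFast

from deepdoctection.extern.hflayoutlm import HFLayoutLmTokenClassifier
from deepdoctection.pipe.lm import LMTokenClassifierService

# Tokenizer and token classifier, as in the docstring example above.
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
layoutlm = HFLayoutLmTokenClassifier(categories=['B-answer', 'B-header', 'B-question', 'E-answer',
                                                 'E-header', 'E-question', 'I-answer', 'I-header',
                                                 'I-question', 'O', 'S-answer', 'S-header', 'S-question'])

# Illustrative values for the keyword arguments documented in the new Args block.
layoutlm_service = LMTokenClassifierService(
    tokenizer,
    layoutlm,
    padding="max_length",                # one of `max_length`, `longest`, `do_not_pad`
    truncation=True,                     # truncate to the model's maximum admissible input length
    return_overflowing_tokens=False,     # keep one output sample per input sample
    use_other_as_default_category=True,  # words never sent to the model get `BioTag.outside`
    sliding_window_stride=0,             # 0 disables sliding windows over long sequences
)
```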
@@ -134,6 +170,15 @@ class LMTokenClassifierService(PipelineComponent):
         self._init_sanity_checks()

     def serve(self, dp: Image) -> None:
+        """
+        Serve the token classification pipeline on a given `Image`.
+
+        Args:
+            dp: The `Image` to process.
+
+        Returns:
+            None
+        """
         lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
         if lm_input is None:
             return
@@ -231,30 +276,32 @@ class LMTokenClassifierService(PipelineComponent):
 @pipeline_component_registry.register("LMSequenceClassifierService")
 class LMSequenceClassifierService(PipelineComponent):
     """
-    Pipeline component for sequence classification
+    Pipeline component for sequence classification.

-    **Example**
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)

-        # setting up compulsory ocr service
-        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-        tess = TesseractOcrDetector(tesseract_config_path)
-        ocr_service = TextExtractionService(tess)
+        # hf tokenizer and token classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json", "path/to/model.bin",
+                                                categories=["handwritten", "presentation", "resume"])

-        # hf tokenizer and token classifier
-        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
-        layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json","path/to/model.bin",
-                                                categories=["handwritten", "presentation", "resume"])
+        # token classification service
+        layoutlm_service = LMSequenceClassifierService(tokenizer, layoutlm)

-        # token classification service
-        layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, layoutlm_service])

-        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)

-        path = "path/to/some/form"
-        df = pipe.analyze(path=path)
+        for dp in df:
+            ...
+        ```

-        for dp in df:
-            ...


     """
@@ -268,22 +315,26 @@ class LMSequenceClassifierService(PipelineComponent):
         use_other_as_default_category: bool = False,
     ) -> None:
         """
-        :param tokenizer: Tokenizer, typing allows currently anything. This will be changed in the future
-        :param language_model: language model sequence classifier
-        :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
-                        `do_not_pad`.
-        :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
-                           maximum acceptable input length for the model if that argument is not provided. This will
-                           truncate token by token, removing a token from the longest sequence in the pair if a pair of
-                           sequences (or a batch of pairs) is provided.
-                           If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
-                           model maximum admissible input size).
-        :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
-                           can be returned as an additional batch element. Not that in this case, the number of input
-                           batch samples will be smaller than the output batch samples.
-        :param use_other_as_default_category: When predicting document classes, it might be possible that some pages
-                                              do not get sent to the model because they are empty. If set to `True` it
-                                              will assign images with no features the category `TokenClasses.OTHER`.
+        Args:
+            tokenizer: `Tokenizer`, typing allows currently anything. This will be changed in the future.
+            language_model: `language model sequence classifier`.
+            padding: A padding strategy to be passed to the `tokenizer`. Must be either `max_length`, `longest` or
+                     `do_not_pad`.
+            truncation: If `True` will truncate to a maximum length specified with the argument `max_length` or to the
+                        maximum acceptable input length for the model if that argument is not provided. This will
+                        truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                        sequences (or a batch of pairs) is provided. If `False` then no truncation (i.e., can output
+                        batch with sequence lengths greater than the model maximum admissible input size).
+            return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                                       can be returned as an additional batch element. Note that in this case, the
+                                       number of input batch samples will be smaller than the output batch samples.
+            use_other_as_default_category: When predicting document classes, it might be possible that some pages do
+                                           not get sent to the model because they are empty. If set to `True` it will
+                                           assign images with no features the category `TokenClasses.OTHER`.
+
+        Note:
+            If `use_other_as_default_category` is set, images with no features will be assigned the `TokenClasses.OTHER`
+            category.

         """
         self.language_model = language_model
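Analogously, a hedged sketch of wiring the sequence classifier with the arguments documented above. The config and weight paths are the same placeholders used in the docstring example, and the argument values are illustrative rather than recommended settings.

```python
from transformers import LayoutLMTokenizerFast

from deepdoctection.extern.hflayoutlm import HFLayoutLmSequenceClassifier
from deepdoctection.pipe.lm import LMSequenceClassifierService

# Tokenizer and sequence classifier, following the docstring example (paths are placeholders).
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json", "path/to/model.bin",
                                        categories=["handwritten", "presentation", "resume"])

# Illustrative values for the documented keyword arguments.
sequence_service = LMSequenceClassifierService(
    tokenizer,
    layoutlm,
    padding="max_length",
    truncation=True,
    return_overflowing_tokens=False,
    use_other_as_default_category=True,  # empty pages fall back to `TokenClasses.OTHER`
)
```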
@@ -305,6 +356,15 @@ class LMSequenceClassifierService(PipelineComponent):
         self._init_sanity_checks()

     def serve(self, dp: Image) -> None:
+        """
+        Serve the sequence classification pipeline on a given `Image`.
+
+        Args:
+            dp: The `Image` to process.
+
+        Returns:
+            None
+        """
         lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
         lm_output = None
         if lm_input is None:
@@ -347,7 +407,15 @@ class LMSequenceClassifierService(PipelineComponent):

     @staticmethod
     def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
-        """Replacing eval functions"""
+        """
+        Get the function to map images to features for the language model.
+
+        Args:
+            mapping_str: The mapping function name as a string.
+
+        Returns:
+            A callable that maps an `Image` to features.
+        """
         return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
             mapping_str
         ]
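The method body above is a plain string-keyed dictionary lookup that replaces the earlier eval-style resolution hinted at by the removed docstring. Below is a self-contained sketch of the same dispatch pattern; the two mapping functions here are hypothetical stand-ins (the real `image_to_layoutlm_features` and `image_to_lm_features` are imported in `lm.py`), included only so the snippet runs on its own.

```python
from typing import Any, Callable, Dict, Optional

# Hypothetical stand-ins for the package's mapping factories: each binds keyword
# arguments and returns a callable that maps a datapoint to model features.
def image_to_layoutlm_features(**kwargs: Any) -> Callable[[Any], Optional[Any]]:
    return lambda dp: {"mapping": "layoutlm", "datapoint": dp}

def image_to_lm_features(**kwargs: Any) -> Callable[[Any], Optional[Any]]:
    return lambda dp: {"mapping": "lm", "datapoint": dp}

_MAPPINGS: Dict[str, Callable[..., Callable[[Any], Optional[Any]]]] = {
    "image_to_layoutlm_features": image_to_layoutlm_features,
    "image_to_lm_features": image_to_lm_features,
}

def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Any], Optional[Any]]]:
    # Explicit dict lookup instead of eval(): unknown names fail fast with a KeyError.
    return _MAPPINGS[mapping_str]

# Resolve the factory by name, bind its keyword arguments, then apply it to a datapoint,
# mirroring `self.mapping_to_lm_input_func(**self.required_kwargs)(dp)` in `serve`.
factory = image_to_features_func("image_to_lm_features")
to_features = factory()
print(to_features("dummy_datapoint"))
```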