deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
  120. deepdoctection/analyzer/_config.py +0 -146
  121. deepdoctection-0.42.0.dist-info/METADATA +0 -431
  122. deepdoctection-0.42.0.dist-info/RECORD +0 -148
  123. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,9 @@
13
13
  # Apache 2.0 License for more details.
14
14
 
15
15
  """
16
- Tree distance similarity metric taken from <https://github.com/ibm-aur-nlp/PubTabNet/blob/master/src/metric.py>
16
+ Tree distance similarity (TEDS) metric
17
+
18
+ Taken from <https://github.com/ibm-aur-nlp/PubTabNet/blob/master/src/metric.py>
17
19
  """
18
20
 
19
21
  import statistics
@@ -81,7 +83,8 @@ class TableTree(Tree):
81
83
 
82
84
  class CustomConfig(Config):
83
85
  """
84
- CustomConfig for calculating APTED tree edit distance. Check APTED docs for more information
86
+ `CustomConfig` for calculating `APTED` tree edit distance.
87
+ Check APTED docs for more information
85
88
  """
86
89
 
87
90
  @staticmethod
@@ -90,7 +93,7 @@ class CustomConfig(Config):
90
93
  return max(map(len, sequences))
91
94
 
92
95
  def normalized_distance(self, *sequences: Any) -> float:
93
- """Get distance from 0 to 1"""
96
+ """Get distance from `0` to `1`"""
94
97
  return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
95
98
 
96
99
  def rename(self, node1: Any, node2: Any) -> float:
@@ -104,7 +107,7 @@ class CustomConfig(Config):
104
107
 
105
108
 
106
109
  class TEDS:
107
- """Tree Edit Distance based Similarity"""
110
+ """Tree Edit Distance similarity"""
108
111
 
109
112
  def __init__(self, structure_only: bool = False):
110
113
  self.structure_only = structure_only
@@ -123,7 +126,7 @@ class TEDS:
123
126
  self.__tokens__ += list(node.tail)
124
127
 
125
128
  def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> Optional[TableTree]:
126
- """Converts HTML tree to the format required by apted"""
129
+ """Converts `HTML` tree to the format required by APTED"""
127
130
  global __tokens__ # pylint: disable = W0602
128
131
  if node.tag == "td":
129
132
  if self.structure_only:
@@ -151,8 +154,15 @@ class TEDS:
151
154
  return None
152
155
 
153
156
  def evaluate(self, inputs: tuple[str, str]) -> float:
154
- """Computes TEDS score between the prediction and the ground truth of a
157
+ """
158
+ Computes TEDS score between the prediction and the ground truth of a
155
159
  given sample
160
+
161
+ Args:
162
+ inputs: A tuple of ground truth and prediction in xml format
163
+
164
+ Returns:
165
+ A float value between 0.0 and 1.0, where 1.0 means perfect match
156
166
  """
157
167
 
158
168
  ground_truth, pred = inputs[0], inputs[1]
@@ -192,8 +202,13 @@ class TEDS:
192
202
  def teds_metric(gt_list: list[str], predict_list: list[str], structure_only: bool) -> tuple[float, int]:
193
203
  """
194
204
  Computes tree edit distance score (TEDS) between the prediction and the ground truth of a batch of samples. The
195
- approach to measure similarity of tables by means of their html representation has been adovacated in
196
- <https://arxiv.org/abs/1911.10683> .
205
+ approach to measure similarity of tables by means of their html representation has been advocated in
206
+ <https://arxiv.org/abs/1911.10683>
207
+
208
+ Args:
209
+ gt_list: A list of ground truth samples in `xml` format
210
+ predict_list: A list of predictions in `xml` format
211
+ structure_only: If `True`, only the structure of the table is considered, but no text
197
212
 
198
213
  """
199
214
  teds = TEDS(structure_only=structure_only)
@@ -218,7 +233,7 @@ def teds_metric(gt_list: list[str], predict_list: list[str], structure_only: boo
218
233
  @metric_registry.register("teds")
219
234
  class TedsMetric(MetricBase):
220
235
  """
221
- Metric induced by `teds`
236
+ Metric induced by `TEDS`
222
237
  """
223
238
 
224
239
  metric = teds_metric # type: ignore
@@ -16,7 +16,7 @@
16
16
  # limitations under the License.
17
17
 
18
18
  """
19
- Module for EvalCallback in Tensorpack
19
+ `EvalCallback` in Tensorpack
20
20
  """
21
21
 
22
22
  from __future__ import annotations
@@ -53,7 +53,7 @@ __all__ = ["EvalCallback"]
53
53
 
54
54
  class EvalCallback(Callback): # pylint: disable=R0903
55
55
  """
56
- A callback that runs evaluation once a while. It supports evaluation on any pipeline component.
56
+ A callback that runs evaluation once in a while. It supports evaluation on any pipeline component.
57
57
  """
58
58
 
59
59
  _chief_only = False
@@ -71,17 +71,18 @@ class EvalCallback(Callback): # pylint: disable=R0903
71
71
  **build_eval_kwargs: str,
72
72
  ) -> None:
73
73
  """
74
- :param dataset: dataset
75
- :param category_names: String or list of category names
76
- :param sub_categories: Dict of categories/sub-categories or categories/list of sub-categories. See also
74
+ Args:
75
+ dataset: dataset
76
+ category_names: String or list of category names
77
+ sub_categories: Dict of categories/sub-categories or categories/list of sub-categories. See also
77
78
  `eval.Evaluator`
78
- :param metric: metric
79
- :param pipeline_component: Pipeline component with a detector.
80
- :param in_names: Specify tensor input names.
81
- E.g. `extern.tp.tpfrcnn.GeneralizedRCNN.get_inference_tensor_names`
82
- :param out_names: Specify tensor output names.
83
- :param build_eval_kwargs: Pass the necessary arguments in order to build the dataflow, e.g. "split",
84
- "build_mode", "max_datapoints" etc.
79
+ metric: metric
80
+ pipeline_component: Pipeline component with a detector.
81
+ in_names: Specify tensor input names.
82
+ E.g. `extern.tp.tpfrcnn.GeneralizedRCNN.get_inference_tensor_names`
83
+ out_names: Specify tensor output names.
84
+ build_eval_kwargs: Pass the necessary arguments in order to build the dataflow, e.g. `split`,
85
+ `build_mode`, `max_datapoints` etc.
85
86
  """
86
87
  self.dataset_name = dataset.dataset_info.name
87
88
  self.build_eval_kwargs = build_eval_kwargs
@@ -16,7 +16,7 @@
16
16
  # limitations under the License.
17
17
 
18
18
  """
19
- Wrappers for models of external libraries as well as implementation of the Cascade-RCNN model of Tensorpack.
19
+ # Wrappers for models of external libraries
20
20
  """
21
21
 
22
22
  from .base import *