deepdoctection-0.42.0-py3-none-any.whl → deepdoctection-0.43-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.0.dist-info/METADATA +0 -431
- deepdoctection-0.42.0.dist-info/RECORD +0 -148
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,9 @@
|
|
|
13
13
|
# Apache 2.0 License for more details.
|
|
14
14
|
|
|
15
15
|
"""
|
|
16
|
-
Tree distance similarity
|
|
16
|
+
Tree distance similarity (TEDS) metric
|
|
17
|
+
|
|
18
|
+
Taken from <https://github.com/ibm-aur-nlp/PubTabNet/blob/master/src/metric.py>
|
|
17
19
|
"""
|
|
18
20
|
|
|
19
21
|
import statistics
|
|
@@ -81,7 +83,8 @@ class TableTree(Tree):
|
|
|
81
83
|
|
|
82
84
|
class CustomConfig(Config):
|
|
83
85
|
"""
|
|
84
|
-
CustomConfig for calculating APTED tree edit distance.
|
|
86
|
+
`CustomConfig` for calculating `APTED` tree edit distance.
|
|
87
|
+
Check APTED docs for more information
|
|
85
88
|
"""
|
|
86
89
|
|
|
87
90
|
@staticmethod
|
|
@@ -90,7 +93,7 @@ class CustomConfig(Config):
|
|
|
90
93
|
return max(map(len, sequences))
|
|
91
94
|
|
|
92
95
|
def normalized_distance(self, *sequences: Any) -> float:
|
|
93
|
-
"""Get distance from 0 to 1"""
|
|
96
|
+
"""Get distance from `0` to `1`"""
|
|
94
97
|
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
|
|
95
98
|
|
|
96
99
|
def rename(self, node1: Any, node2: Any) -> float:
|
|
@@ -104,7 +107,7 @@ class CustomConfig(Config):
|
|
|
104
107
|
|
|
105
108
|
|
|
106
109
|
class TEDS:
|
|
107
|
-
"""Tree Edit Distance
|
|
110
|
+
"""Tree Edit Distance similarity"""
|
|
108
111
|
|
|
109
112
|
def __init__(self, structure_only: bool = False):
|
|
110
113
|
self.structure_only = structure_only
|
|
@@ -123,7 +126,7 @@ class TEDS:
|
|
|
123
126
|
self.__tokens__ += list(node.tail)
|
|
124
127
|
|
|
125
128
|
def load_html_tree(self, node: TableTree, parent: Optional[TableTree] = None) -> Optional[TableTree]:
|
|
126
|
-
"""Converts HTML tree to the format required by
|
|
129
|
+
"""Converts `HTML` tree to the format required by APTED"""
|
|
127
130
|
global __tokens__ # pylint: disable = W0602
|
|
128
131
|
if node.tag == "td":
|
|
129
132
|
if self.structure_only:
|
|
@@ -151,8 +154,15 @@ class TEDS:
|
|
|
151
154
|
return None
|
|
152
155
|
|
|
153
156
|
def evaluate(self, inputs: tuple[str, str]) -> float:
|
|
154
|
-
"""
|
|
157
|
+
"""
|
|
158
|
+
Computes TEDS score between the prediction and the ground truth of a
|
|
155
159
|
given sample
|
|
160
|
+
|
|
161
|
+
Args:
|
|
162
|
+
inputs: A tuple of ground truth and prediction in xml format
|
|
163
|
+
|
|
164
|
+
Returns:
|
|
165
|
+
A float value between 0.0 and 1.0, where 1.0 means perfect match
|
|
156
166
|
"""
|
|
157
167
|
|
|
158
168
|
ground_truth, pred = inputs[0], inputs[1]
|
|
@@ -192,8 +202,13 @@ class TEDS:
|
|
|
192
202
|
def teds_metric(gt_list: list[str], predict_list: list[str], structure_only: bool) -> tuple[float, int]:
|
|
193
203
|
"""
|
|
194
204
|
Computes tree edit distance score (TEDS) between the prediction and the ground truth of a batch of samples. The
|
|
195
|
-
approach to measure similarity of tables by means of their html representation has been
|
|
196
|
-
<https://arxiv.org/abs/1911.10683>
|
|
205
|
+
approach to measure similarity of tables by means of their html representation has been advocated in
|
|
206
|
+
<https://arxiv.org/abs/1911.10683>
|
|
207
|
+
|
|
208
|
+
Args:
|
|
209
|
+
gt_list: A list of ground truth samples in `xml` format
|
|
210
|
+
predict_list: A list of predictions in `xml` format
|
|
211
|
+
structure_only: If `True`, only the structure of the table is considered, but no text
|
|
197
212
|
|
|
198
213
|
"""
|
|
199
214
|
teds = TEDS(structure_only=structure_only)
|
|
@@ -218,7 +233,7 @@ def teds_metric(gt_list: list[str], predict_list: list[str], structure_only: boo
|
|
|
218
233
|
@metric_registry.register("teds")
|
|
219
234
|
class TedsMetric(MetricBase):
|
|
220
235
|
"""
|
|
221
|
-
Metric induced by `
|
|
236
|
+
Metric induced by `TEDS`
|
|
222
237
|
"""
|
|
223
238
|
|
|
224
239
|
metric = teds_metric # type: ignore
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
# limitations under the License.
|
|
17
17
|
|
|
18
18
|
"""
|
|
19
|
-
|
|
19
|
+
`EvalCallback` in Tensorpack
|
|
20
20
|
"""
|
|
21
21
|
|
|
22
22
|
from __future__ import annotations
|
|
@@ -53,7 +53,7 @@ __all__ = ["EvalCallback"]
|
|
|
53
53
|
|
|
54
54
|
class EvalCallback(Callback): # pylint: disable=R0903
|
|
55
55
|
"""
|
|
56
|
-
A callback that runs evaluation once a while. It supports evaluation on any pipeline component.
|
|
56
|
+
A callback that runs evaluation once in a while. It supports evaluation on any pipeline component.
|
|
57
57
|
"""
|
|
58
58
|
|
|
59
59
|
_chief_only = False
|
|
@@ -71,17 +71,18 @@ class EvalCallback(Callback): # pylint: disable=R0903
|
|
|
71
71
|
**build_eval_kwargs: str,
|
|
72
72
|
) -> None:
|
|
73
73
|
"""
|
|
74
|
-
:
|
|
75
|
-
|
|
76
|
-
|
|
74
|
+
Args:
|
|
75
|
+
dataset: dataset
|
|
76
|
+
category_names: String or list of category names
|
|
77
|
+
sub_categories: Dict of categories/sub-categories or categories/list of sub-categories. See also
|
|
77
78
|
`eval.Evaluator`
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
79
|
+
metric: metric
|
|
80
|
+
pipeline_component: Pipeline component with a detector.
|
|
81
|
+
in_names: Specify tensor input names.
|
|
82
|
+
E.g. `extern.tp.tpfrcnn.GeneralizedRCNN.get_inference_tensor_names`
|
|
83
|
+
out_names: Specify tensor output names.
|
|
84
|
+
build_eval_kwargs: Pass the necessary arguments in order to build the dataflow, e.g. `split`,
|
|
85
|
+
`build_mode`, `max_datapoints` etc.
|
|
85
86
|
"""
|
|
86
87
|
self.dataset_name = dataset.dataset_info.name
|
|
87
88
|
self.build_eval_kwargs = build_eval_kwargs
|