deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +16 -29
- deepdoctection/analyzer/dd.py +70 -59
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +41 -56
- deepdoctection/datapoint/box.py +9 -8
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +56 -44
- deepdoctection/datapoint/view.py +245 -150
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +35 -26
- deepdoctection/datasets/base.py +14 -12
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +24 -26
- deepdoctection/datasets/instances/doclaynet.py +51 -51
- deepdoctection/datasets/instances/fintabnet.py +46 -46
- deepdoctection/datasets/instances/funsd.py +25 -24
- deepdoctection/datasets/instances/iiitar13k.py +13 -10
- deepdoctection/datasets/instances/layouttest.py +4 -3
- deepdoctection/datasets/instances/publaynet.py +5 -5
- deepdoctection/datasets/instances/pubtables1m.py +24 -21
- deepdoctection/datasets/instances/pubtabnet.py +32 -30
- deepdoctection/datasets/instances/rvlcdip.py +30 -30
- deepdoctection/datasets/instances/xfund.py +26 -26
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +15 -13
- deepdoctection/eval/eval.py +41 -37
- deepdoctection/eval/tedsmetric.py +30 -23
- deepdoctection/eval/tp_eval_callback.py +16 -19
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +85 -113
- deepdoctection/extern/deskew.py +14 -11
- deepdoctection/extern/doctrocr.py +141 -130
- deepdoctection/extern/fastlang.py +27 -18
- deepdoctection/extern/hfdetr.py +71 -62
- deepdoctection/extern/hflayoutlm.py +504 -211
- deepdoctection/extern/hflm.py +230 -0
- deepdoctection/extern/model.py +488 -302
- deepdoctection/extern/pdftext.py +23 -19
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +29 -19
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +18 -18
- deepdoctection/extern/tp/tfutils.py +57 -9
- deepdoctection/extern/tp/tpcompat.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +45 -53
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/cats.py +27 -29
- deepdoctection/mapper/cocostruct.py +10 -10
- deepdoctection/mapper/d2struct.py +27 -26
- deepdoctection/mapper/hfstruct.py +13 -8
- deepdoctection/mapper/laylmstruct.py +178 -37
- deepdoctection/mapper/maputils.py +12 -11
- deepdoctection/mapper/match.py +2 -2
- deepdoctection/mapper/misc.py +11 -9
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +5 -5
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +5 -5
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +12 -14
- deepdoctection/pipe/base.py +52 -106
- deepdoctection/pipe/common.py +72 -59
- deepdoctection/pipe/concurrency.py +16 -11
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +20 -16
- deepdoctection/pipe/lm.py +75 -105
- deepdoctection/pipe/order.py +194 -89
- deepdoctection/pipe/refine.py +111 -124
- deepdoctection/pipe/segment.py +156 -161
- deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +48 -41
- deepdoctection/train/hf_detr_train.py +41 -30
- deepdoctection/train/hf_layoutlm_train.py +153 -135
- deepdoctection/train/tp_frcnn_train.py +32 -31
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +87 -125
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +22 -18
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +16 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +11 -11
- deepdoctection/utils/settings.py +185 -181
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +74 -72
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
- deepdoctection-0.33.dist-info/RECORD +146 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.31.dist-info/RECORD +0 -144
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
|
@@ -30,9 +30,10 @@ Module for Fintabnet dataset. Place the dataset as follows
|
|
|
30
30
|
├── FinTabNet_1.0.0_table_train.jsonl
|
|
31
31
|
├── FinTabNet_1.0.0_table_val.jsonl
|
|
32
32
|
"""
|
|
33
|
+
from __future__ import annotations
|
|
33
34
|
|
|
34
35
|
from pathlib import Path
|
|
35
|
-
from typing import
|
|
36
|
+
from typing import Mapping, Sequence, Union
|
|
36
37
|
|
|
37
38
|
from ...dataflow import DataFlow, MapData, MultiProcessMapData
|
|
38
39
|
from ...dataflow.common import FlattenData
|
|
@@ -42,10 +43,10 @@ from ...mapper.cats import cat_to_sub_cat, filter_cat
|
|
|
42
43
|
from ...mapper.maputils import curry
|
|
43
44
|
from ...mapper.misc import image_ann_to_image, maybe_ann_to_sub_image
|
|
44
45
|
from ...mapper.pubstruct import pub_to_image
|
|
45
|
-
from ...utils.detection_types import JsonDict
|
|
46
46
|
from ...utils.file_utils import set_mp_spawn
|
|
47
47
|
from ...utils.logger import LoggingRecord, logger
|
|
48
48
|
from ...utils.settings import CellType, DatasetType, LayoutType, ObjectTypes, TableType
|
|
49
|
+
from ...utils.types import PubtabnetDict
|
|
49
50
|
from ...utils.utils import to_bool
|
|
50
51
|
from ..base import _BuiltInDataset
|
|
51
52
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
@@ -82,38 +83,38 @@ _URL = (
|
|
|
82
83
|
"fintabnet.tar.gz?_ga=2.17492593.994196051.1634564576-1173244232.1625045842"
|
|
83
84
|
)
|
|
84
85
|
_SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
|
|
85
|
-
_TYPE = DatasetType.
|
|
86
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
86
87
|
_LOCATION = "fintabnet"
|
|
87
88
|
_ANNOTATION_FILES: Mapping[str, str] = {
|
|
88
89
|
"train": "FinTabNet_1.0.0_table_train.jsonl",
|
|
89
90
|
"test": "FinTabNet_1.0.0_table_test.jsonl",
|
|
90
91
|
"val": "FinTabNet_1.0.0_table_val.jsonl",
|
|
91
92
|
}
|
|
92
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
93
|
+
_INIT_CATEGORIES = [LayoutType.TABLE, LayoutType.CELL, TableType.ITEM]
|
|
93
94
|
_SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]]
|
|
94
95
|
_SUB_CATEGORIES = {
|
|
95
|
-
LayoutType.
|
|
96
|
-
CellType.
|
|
97
|
-
CellType.
|
|
98
|
-
CellType.
|
|
99
|
-
CellType.
|
|
100
|
-
CellType.
|
|
101
|
-
CellType.
|
|
96
|
+
LayoutType.CELL: {
|
|
97
|
+
CellType.HEADER: [CellType.HEADER, CellType.BODY],
|
|
98
|
+
CellType.ROW_NUMBER: [],
|
|
99
|
+
CellType.COLUMN_NUMBER: [],
|
|
100
|
+
CellType.ROW_SPAN: [],
|
|
101
|
+
CellType.COLUMN_SPAN: [],
|
|
102
|
+
CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
|
|
102
103
|
},
|
|
103
|
-
TableType.
|
|
104
|
-
CellType.
|
|
105
|
-
CellType.
|
|
106
|
-
CellType.
|
|
107
|
-
CellType.
|
|
108
|
-
CellType.
|
|
109
|
-
CellType.
|
|
104
|
+
TableType.ITEM: {TableType.ITEM: [LayoutType.ROW, LayoutType.COLUMN]},
|
|
105
|
+
CellType.HEADER: {
|
|
106
|
+
CellType.ROW_NUMBER: [],
|
|
107
|
+
CellType.COLUMN_NUMBER: [],
|
|
108
|
+
CellType.ROW_SPAN: [],
|
|
109
|
+
CellType.COLUMN_SPAN: [],
|
|
110
|
+
CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
|
|
110
111
|
},
|
|
111
|
-
CellType.
|
|
112
|
-
CellType.
|
|
113
|
-
CellType.
|
|
114
|
-
CellType.
|
|
115
|
-
CellType.
|
|
116
|
-
CellType.
|
|
112
|
+
CellType.BODY: {
|
|
113
|
+
CellType.ROW_NUMBER: [],
|
|
114
|
+
CellType.COLUMN_NUMBER: [],
|
|
115
|
+
CellType.ROW_SPAN: [],
|
|
116
|
+
CellType.COLUMN_SPAN: [],
|
|
117
|
+
CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
|
|
117
118
|
},
|
|
118
119
|
}
|
|
119
120
|
|
|
@@ -133,7 +134,7 @@ class Fintabnet(_BuiltInDataset):
|
|
|
133
134
|
def _categories(self) -> DatasetCategories:
|
|
134
135
|
return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)
|
|
135
136
|
|
|
136
|
-
def _builder(self) ->
|
|
137
|
+
def _builder(self) -> FintabnetBuilder:
|
|
137
138
|
return FintabnetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
|
|
138
139
|
|
|
139
140
|
|
|
@@ -200,18 +201,17 @@ class FintabnetBuilder(DataFlowBaseBuilder):
|
|
|
200
201
|
|
|
201
202
|
# Map
|
|
202
203
|
@curry
|
|
203
|
-
def _map_filename(dp:
|
|
204
|
+
def _map_filename(dp: PubtabnetDict, workdir: Path) -> PubtabnetDict:
|
|
204
205
|
dp["filename"] = workdir / "pdf" / dp["filename"]
|
|
205
206
|
return dp
|
|
206
207
|
|
|
207
|
-
|
|
208
|
-
df = MapData(df, map_filename)
|
|
208
|
+
df = MapData(df, _map_filename(self.get_workdir()))
|
|
209
209
|
|
|
210
210
|
buffer_size = 200 if max_datapoints is None else min(max_datapoints, 200) - 1
|
|
211
211
|
|
|
212
212
|
pub_mapper = pub_to_image(
|
|
213
|
-
self.categories.get_categories(name_as_key=True, init=True),
|
|
214
|
-
load_image,
|
|
213
|
+
categories_name_as_key=self.categories.get_categories(name_as_key=True, init=True),
|
|
214
|
+
load_image=load_image,
|
|
215
215
|
fake_score=fake_score,
|
|
216
216
|
rows_and_cols=rows_and_cols,
|
|
217
217
|
dd_pipe_like=False,
|
|
@@ -232,39 +232,39 @@ class FintabnetBuilder(DataFlowBaseBuilder):
|
|
|
232
232
|
if build_mode == "table":
|
|
233
233
|
|
|
234
234
|
@curry
|
|
235
|
-
def _crop_and_add_image(dp: Image, category_names:
|
|
235
|
+
def _crop_and_add_image(dp: Image, category_names: list[str]) -> Image:
|
|
236
236
|
return image_ann_to_image(dp, category_names=category_names)
|
|
237
237
|
|
|
238
238
|
df = MapData(
|
|
239
239
|
df,
|
|
240
240
|
_crop_and_add_image( # pylint: disable=E1120
|
|
241
241
|
category_names=[
|
|
242
|
-
LayoutType.
|
|
243
|
-
LayoutType.
|
|
244
|
-
CellType.
|
|
245
|
-
CellType.
|
|
246
|
-
TableType.
|
|
247
|
-
LayoutType.
|
|
248
|
-
LayoutType.
|
|
242
|
+
LayoutType.TABLE,
|
|
243
|
+
LayoutType.CELL,
|
|
244
|
+
CellType.HEADER,
|
|
245
|
+
CellType.BODY,
|
|
246
|
+
TableType.ITEM,
|
|
247
|
+
LayoutType.ROW,
|
|
248
|
+
LayoutType.COLUMN,
|
|
249
249
|
]
|
|
250
250
|
),
|
|
251
251
|
)
|
|
252
252
|
df = MapData(
|
|
253
253
|
df,
|
|
254
254
|
maybe_ann_to_sub_image( # pylint: disable=E1120 # 259
|
|
255
|
-
category_names_sub_image=LayoutType.
|
|
255
|
+
category_names_sub_image=LayoutType.TABLE,
|
|
256
256
|
category_names=[
|
|
257
|
-
LayoutType.
|
|
258
|
-
CellType.
|
|
259
|
-
CellType.
|
|
260
|
-
TableType.
|
|
261
|
-
LayoutType.
|
|
262
|
-
LayoutType.
|
|
257
|
+
LayoutType.CELL,
|
|
258
|
+
CellType.HEADER,
|
|
259
|
+
CellType.BODY,
|
|
260
|
+
TableType.ITEM,
|
|
261
|
+
LayoutType.ROW,
|
|
262
|
+
LayoutType.COLUMN,
|
|
263
263
|
],
|
|
264
264
|
add_summary=True,
|
|
265
265
|
),
|
|
266
266
|
)
|
|
267
|
-
df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation_iter(category_names=LayoutType.
|
|
267
|
+
df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation_iter(category_names=LayoutType.TABLE)])
|
|
268
268
|
df = FlattenData(df)
|
|
269
269
|
df = MapData(df, lambda dp: dp[0])
|
|
270
270
|
|
|
@@ -32,6 +32,7 @@ Module for Funsd dataset. Install the dataset following the folder structure
|
|
|
32
32
|
│ ├── images
|
|
33
33
|
│ │ ├── ...
|
|
34
34
|
"""
|
|
35
|
+
from __future__ import annotations
|
|
35
36
|
|
|
36
37
|
import os
|
|
37
38
|
from typing import Dict, List, Mapping, Union
|
|
@@ -40,16 +41,16 @@ from ...dataflow import DataFlow, MapData, SerializerFiles
|
|
|
40
41
|
from ...datasets.info import DatasetInfo
|
|
41
42
|
from ...mapper.cats import cat_to_sub_cat, filter_cat
|
|
42
43
|
from ...mapper.xfundstruct import xfund_to_image
|
|
43
|
-
from ...utils.detection_types import JsonDict, Pathlike
|
|
44
44
|
from ...utils.fs import load_json
|
|
45
45
|
from ...utils.settings import BioTag, DatasetType, LayoutType, ObjectTypes, TokenClasses, TokenClassWithTag, WordType
|
|
46
|
+
from ...utils.types import FunsdDict, PathLikeOrStr
|
|
46
47
|
from ..base import _BuiltInDataset
|
|
47
48
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
48
49
|
from ..info import DatasetCategories
|
|
49
50
|
from ..registry import dataset_registry
|
|
50
51
|
|
|
51
52
|
|
|
52
|
-
def load_file(path_ann:
|
|
53
|
+
def load_file(path_ann: PathLikeOrStr) -> FunsdDict:
|
|
53
54
|
"""
|
|
54
55
|
Loading json file
|
|
55
56
|
|
|
@@ -79,28 +80,28 @@ _LICENSE = (
|
|
|
79
80
|
|
|
80
81
|
_URL = "https://guillaumejaume.github.io/FUNSD/download/"
|
|
81
82
|
_SPLITS: Mapping[str, str] = {"train": "training_data", "test": "testing_data"}
|
|
82
|
-
_TYPE = DatasetType.
|
|
83
|
+
_TYPE = DatasetType.TOKEN_CLASSIFICATION
|
|
83
84
|
_LOCATION = "funsd"
|
|
84
85
|
_ANNOTATION_FILES: Mapping[str, str] = {"train": "annotations", "test": "annotations"}
|
|
85
86
|
|
|
86
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
87
|
+
_INIT_CATEGORIES = [LayoutType.WORD, LayoutType.TEXT]
|
|
87
88
|
_SUB_CATEGORIES: Dict[ObjectTypes, Dict[ObjectTypes, List[ObjectTypes]]]
|
|
88
89
|
_SUB_CATEGORIES = {
|
|
89
|
-
LayoutType.
|
|
90
|
-
WordType.
|
|
91
|
-
WordType.
|
|
92
|
-
WordType.
|
|
93
|
-
TokenClassWithTag.
|
|
94
|
-
TokenClassWithTag.
|
|
95
|
-
TokenClassWithTag.
|
|
96
|
-
TokenClassWithTag.
|
|
97
|
-
TokenClassWithTag.
|
|
98
|
-
TokenClassWithTag.
|
|
99
|
-
BioTag.
|
|
90
|
+
LayoutType.WORD: {
|
|
91
|
+
WordType.TOKEN_CLASS: [TokenClasses.OTHER, TokenClasses.QUESTION, TokenClasses.ANSWER, TokenClasses.HEADER],
|
|
92
|
+
WordType.TAG: [BioTag.INSIDE, BioTag.OUTSIDE, BioTag.BEGIN],
|
|
93
|
+
WordType.TOKEN_TAG: [
|
|
94
|
+
TokenClassWithTag.B_ANSWER,
|
|
95
|
+
TokenClassWithTag.B_HEADER,
|
|
96
|
+
TokenClassWithTag.B_QUESTION,
|
|
97
|
+
TokenClassWithTag.I_ANSWER,
|
|
98
|
+
TokenClassWithTag.I_HEADER,
|
|
99
|
+
TokenClassWithTag.I_QUESTION,
|
|
100
|
+
BioTag.OUTSIDE,
|
|
100
101
|
],
|
|
101
102
|
},
|
|
102
|
-
LayoutType.
|
|
103
|
-
WordType.
|
|
103
|
+
LayoutType.TEXT: {
|
|
104
|
+
WordType.TOKEN_CLASS: [TokenClasses.OTHER, TokenClasses.QUESTION, TokenClasses.ANSWER, TokenClasses.HEADER]
|
|
104
105
|
},
|
|
105
106
|
}
|
|
106
107
|
|
|
@@ -120,7 +121,7 @@ class Funsd(_BuiltInDataset):
|
|
|
120
121
|
def _categories(self) -> DatasetCategories:
|
|
121
122
|
return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)
|
|
122
123
|
|
|
123
|
-
def _builder(self) ->
|
|
124
|
+
def _builder(self) -> FunsdBuilder:
|
|
124
125
|
return FunsdBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
|
|
125
126
|
|
|
126
127
|
|
|
@@ -158,14 +159,14 @@ class FunsdBuilder(DataFlowBaseBuilder):
|
|
|
158
159
|
# Map
|
|
159
160
|
categories_name_as_key = self.categories.get_categories(init=True, name_as_key=True)
|
|
160
161
|
category_names_mapping = {
|
|
161
|
-
"other": TokenClasses.
|
|
162
|
-
"question": TokenClasses.
|
|
163
|
-
"answer": TokenClasses.
|
|
164
|
-
"header": TokenClasses.
|
|
162
|
+
"other": TokenClasses.OTHER,
|
|
163
|
+
"question": TokenClasses.QUESTION,
|
|
164
|
+
"answer": TokenClasses.ANSWER,
|
|
165
|
+
"header": TokenClasses.HEADER,
|
|
165
166
|
}
|
|
166
167
|
ner_token_to_id_mapping = self.categories.get_sub_categories(
|
|
167
|
-
categories=LayoutType.
|
|
168
|
-
sub_categories={LayoutType.
|
|
168
|
+
categories=LayoutType.WORD,
|
|
169
|
+
sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
|
|
169
170
|
keys=False,
|
|
170
171
|
values_as_dict=True,
|
|
171
172
|
name_as_key=True,
|
|
@@ -35,25 +35,28 @@ Module for IIITar13K dataset. Install the dataset following the folder structure
|
|
|
35
35
|
│ ├── ...
|
|
36
36
|
|
|
37
37
|
"""
|
|
38
|
+
from __future__ import annotations
|
|
38
39
|
|
|
39
40
|
import os
|
|
40
41
|
from typing import Mapping, Union
|
|
41
42
|
|
|
43
|
+
from lazy_imports import try_import
|
|
44
|
+
|
|
42
45
|
from ...dataflow import DataFlow, MapData, SerializerFiles
|
|
43
46
|
from ...datasets.info import DatasetInfo
|
|
44
47
|
from ...mapper.maputils import curry
|
|
45
48
|
from ...mapper.misc import xml_to_dict
|
|
46
49
|
from ...mapper.pascalstruct import pascal_voc_dict_to_image
|
|
47
|
-
from ...utils.detection_types import JsonDict
|
|
48
50
|
from ...utils.file_utils import lxml_available
|
|
49
51
|
from ...utils.fs import get_package_path
|
|
50
52
|
from ...utils.settings import DatasetType, LayoutType
|
|
53
|
+
from ...utils.types import JsonDict
|
|
51
54
|
from ..base import _BuiltInDataset
|
|
52
55
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
53
56
|
from ..info import DatasetCategories
|
|
54
57
|
from ..registry import dataset_registry
|
|
55
58
|
|
|
56
|
-
|
|
59
|
+
with try_import() as import_guard:
|
|
57
60
|
from lxml import etree
|
|
58
61
|
|
|
59
62
|
_NAME = "iiitar13k"
|
|
@@ -73,7 +76,7 @@ _LICENSE = "NN"
|
|
|
73
76
|
_URL = "http://cvit.iiit.ac.in/usodi/iiitar13k.php"
|
|
74
77
|
|
|
75
78
|
_SPLITS: Mapping[str, str] = {"train": "training_images", "val": "validation_images", "test": "test_images"}
|
|
76
|
-
_TYPE = DatasetType.
|
|
79
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
77
80
|
_LOCATION = "iiitar13k"
|
|
78
81
|
_ANNOTATION_FILES: Mapping[str, str] = {
|
|
79
82
|
"train": "training_xml",
|
|
@@ -81,7 +84,7 @@ _ANNOTATION_FILES: Mapping[str, str] = {
|
|
|
81
84
|
"test": "test_xml",
|
|
82
85
|
}
|
|
83
86
|
|
|
84
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
87
|
+
_INIT_CATEGORIES = [LayoutType.TABLE, LayoutType.LOGO, LayoutType.FIGURE, LayoutType.SIGNATURE]
|
|
85
88
|
|
|
86
89
|
|
|
87
90
|
@dataset_registry.register("iiitar13k")
|
|
@@ -99,7 +102,7 @@ class IIITar13K(_BuiltInDataset):
|
|
|
99
102
|
def _categories(self) -> DatasetCategories:
|
|
100
103
|
return DatasetCategories(init_categories=_INIT_CATEGORIES)
|
|
101
104
|
|
|
102
|
-
def _builder(self) ->
|
|
105
|
+
def _builder(self) -> IIITar13KBuilder:
|
|
103
106
|
return IIITar13KBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
|
|
104
107
|
|
|
105
108
|
|
|
@@ -173,11 +176,11 @@ class IIITar13KBuilder(DataFlowBaseBuilder):
|
|
|
173
176
|
filter_empty_image=True,
|
|
174
177
|
fake_score=fake_score,
|
|
175
178
|
category_name_mapping={
|
|
176
|
-
"natural_image": LayoutType.
|
|
177
|
-
"figure": LayoutType.
|
|
178
|
-
"logo": LayoutType.
|
|
179
|
-
"signature": LayoutType.
|
|
180
|
-
"table": LayoutType.
|
|
179
|
+
"natural_image": LayoutType.FIGURE,
|
|
180
|
+
"figure": LayoutType.FIGURE,
|
|
181
|
+
"logo": LayoutType.LOGO,
|
|
182
|
+
"signature": LayoutType.SIGNATURE,
|
|
183
|
+
"table": LayoutType.TABLE,
|
|
181
184
|
},
|
|
182
185
|
),
|
|
183
186
|
)
|
|
@@ -24,6 +24,7 @@ Module for Testlayout dataset. Install the dataset following the folder structur
|
|
|
24
24
|
├── test
|
|
25
25
|
│ ├── xrf_layout_test.jsonl
|
|
26
26
|
"""
|
|
27
|
+
from __future__ import annotations
|
|
27
28
|
|
|
28
29
|
from typing import Mapping, Union
|
|
29
30
|
|
|
@@ -51,7 +52,7 @@ _LICENSE = (
|
|
|
51
52
|
)
|
|
52
53
|
|
|
53
54
|
_SPLITS: Mapping[str, str] = {"test": "test", "predict": "predict"}
|
|
54
|
-
_TYPE = DatasetType.
|
|
55
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
55
56
|
_LOCATION = "testlayout"
|
|
56
57
|
|
|
57
58
|
_ANNOTATION_FILES: Mapping[str, str] = {
|
|
@@ -59,7 +60,7 @@ _ANNOTATION_FILES: Mapping[str, str] = {
|
|
|
59
60
|
"predict": "xrf_layout_test_predict.jsonl",
|
|
60
61
|
}
|
|
61
62
|
|
|
62
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
63
|
+
_INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutType.TABLE, LayoutType.FIGURE]
|
|
63
64
|
|
|
64
65
|
|
|
65
66
|
@dataset_registry.register("testlayout")
|
|
@@ -77,7 +78,7 @@ class LayoutTest(_BuiltInDataset):
|
|
|
77
78
|
def _categories(self) -> DatasetCategories:
|
|
78
79
|
return DatasetCategories(init_categories=_INIT_CATEGORIES)
|
|
79
80
|
|
|
80
|
-
def _builder(self) ->
|
|
81
|
+
def _builder(self) -> LayoutTestBuilder:
|
|
81
82
|
return LayoutTestBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
|
|
82
83
|
|
|
83
84
|
|
|
@@ -28,7 +28,7 @@ Module for Publaynet dataset. Place the dataset as follows
|
|
|
28
28
|
├── train.json
|
|
29
29
|
├── val.json
|
|
30
30
|
"""
|
|
31
|
-
|
|
31
|
+
from __future__ import annotations
|
|
32
32
|
|
|
33
33
|
from typing import Mapping, Union
|
|
34
34
|
|
|
@@ -61,12 +61,12 @@ _URL = (
|
|
|
61
61
|
"publaynet.tar.gz?_ga=2.23017467.1796315263.1628754613-1173244232.1625045842"
|
|
62
62
|
)
|
|
63
63
|
_SPLITS: Mapping[str, str] = {"train": "train", "val": "val"}
|
|
64
|
-
_TYPE = DatasetType.
|
|
64
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
65
65
|
|
|
66
66
|
_LOCATION = "publaynet"
|
|
67
67
|
|
|
68
68
|
_ANNOTATION_FILES: Mapping[str, str] = {"train": "train.json", "val": "val.json"}
|
|
69
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
69
|
+
_INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutType.TABLE, LayoutType.FIGURE]
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
@dataset_registry.register("publaynet")
|
|
@@ -84,7 +84,7 @@ class Publaynet(_BuiltInDataset):
|
|
|
84
84
|
def _categories(self) -> DatasetCategories:
|
|
85
85
|
return DatasetCategories(init_categories=_INIT_CATEGORIES)
|
|
86
86
|
|
|
87
|
-
def _builder(self) ->
|
|
87
|
+
def _builder(self) -> PublaynetBuilder:
|
|
88
88
|
return PublaynetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
|
|
89
89
|
|
|
90
90
|
|
|
@@ -120,7 +120,7 @@ class PublaynetBuilder(DataFlowBaseBuilder):
|
|
|
120
120
|
df = SerializerCoco.load(path, max_datapoints=max_datapoints)
|
|
121
121
|
|
|
122
122
|
# Map
|
|
123
|
-
df = MapDataComponent(df, lambda dp: self.get_workdir() / self.get_split(split) / dp, "file_name")
|
|
123
|
+
df = MapDataComponent(df, lambda dp: (self.get_workdir() / self.get_split(split) / dp).as_posix(), "file_name")
|
|
124
124
|
coco_mapper = coco_to_image( # pylint: disable=E1120 # 259
|
|
125
125
|
self.categories.get_categories(init=True),
|
|
126
126
|
load_image,
|
|
@@ -37,26 +37,29 @@ Module for PubTables1M-Detection-PASCAL-VOC dataset. Install the dataset followi
|
|
|
37
37
|
├── PubTables-1M-Structure_Annotations_Test
|
|
38
38
|
├── PubTables-1M-Structure_Images_Test
|
|
39
39
|
"""
|
|
40
|
+
from __future__ import annotations
|
|
40
41
|
|
|
41
42
|
import os
|
|
42
43
|
from typing import Mapping, Union
|
|
43
44
|
|
|
45
|
+
from lazy_imports import try_import
|
|
46
|
+
|
|
44
47
|
from ...dataflow import DataFlow, MapData, SerializerFiles
|
|
45
48
|
from ...datasets.info import DatasetInfo
|
|
46
49
|
from ...mapper.cats import filter_cat
|
|
47
50
|
from ...mapper.maputils import curry
|
|
48
51
|
from ...mapper.misc import xml_to_dict
|
|
49
52
|
from ...mapper.pascalstruct import pascal_voc_dict_to_image
|
|
50
|
-
from ...utils.detection_types import JsonDict
|
|
51
53
|
from ...utils.file_utils import lxml_available
|
|
52
54
|
from ...utils.fs import get_package_path
|
|
53
55
|
from ...utils.settings import CellType, DatasetType, LayoutType
|
|
56
|
+
from ...utils.types import JsonDict
|
|
54
57
|
from ..base import _BuiltInDataset
|
|
55
58
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
56
59
|
from ..info import DatasetCategories
|
|
57
60
|
from ..registry import dataset_registry
|
|
58
61
|
|
|
59
|
-
|
|
62
|
+
with try_import() as import_guard:
|
|
60
63
|
from lxml import etree
|
|
61
64
|
|
|
62
65
|
_NAME = "pubtables1m_det"
|
|
@@ -77,14 +80,14 @@ _LICENSE = "Community Data License Agreement – Permissive, Version 1.0"
|
|
|
77
80
|
_URL = "https://msropendata.com/datasets/505fcbe3-1383-42b1-913a-f651b8b712d3"
|
|
78
81
|
|
|
79
82
|
_SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
|
|
80
|
-
_TYPE = DatasetType.
|
|
83
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
81
84
|
_LOCATION = "PubTables1M"
|
|
82
85
|
_ANNOTATION_FILES: Mapping[str, str] = {
|
|
83
86
|
"train": "PubTables1M-Detection-PASCAL-VOC/train",
|
|
84
87
|
"val": "PubTables1M-Detection-PASCAL-VOC/val",
|
|
85
88
|
"test": "PubTables1M-Detection-PASCAL-VOC/test",
|
|
86
89
|
}
|
|
87
|
-
_INIT_CATEGORIES_DET = [LayoutType.
|
|
90
|
+
_INIT_CATEGORIES_DET = [LayoutType.TABLE, LayoutType.TABLE_ROTATED]
|
|
88
91
|
|
|
89
92
|
|
|
90
93
|
@dataset_registry.register("pubtables1m_det")
|
|
@@ -102,7 +105,7 @@ class Pubtables1MDet(_BuiltInDataset):
|
|
|
102
105
|
def _categories(self) -> DatasetCategories:
|
|
103
106
|
return DatasetCategories(init_categories=_INIT_CATEGORIES_DET)
|
|
104
107
|
|
|
105
|
-
def _builder(self) ->
|
|
108
|
+
def _builder(self) -> Pubtables1MBuilder:
|
|
106
109
|
return Pubtables1MBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
|
|
107
110
|
|
|
108
111
|
|
|
@@ -177,7 +180,7 @@ class Pubtables1MBuilder(DataFlowBaseBuilder):
|
|
|
177
180
|
load_image,
|
|
178
181
|
filter_empty_image=True,
|
|
179
182
|
fake_score=fake_score,
|
|
180
|
-
category_name_mapping={"table": LayoutType.
|
|
183
|
+
category_name_mapping={"table": LayoutType.TABLE, "table rotated": LayoutType.TABLE_ROTATED},
|
|
181
184
|
),
|
|
182
185
|
)
|
|
183
186
|
|
|
@@ -192,13 +195,13 @@ _ANNOTATION_FILES_STRUCT: Mapping[str, str] = {
|
|
|
192
195
|
}
|
|
193
196
|
|
|
194
197
|
_INIT_CATEGORIES_STRUCT = [
|
|
195
|
-
LayoutType.
|
|
196
|
-
LayoutType.
|
|
197
|
-
LayoutType.
|
|
198
|
-
CellType.
|
|
199
|
-
CellType.
|
|
200
|
-
CellType.
|
|
201
|
-
CellType.
|
|
198
|
+
LayoutType.TABLE,
|
|
199
|
+
LayoutType.ROW,
|
|
200
|
+
LayoutType.COLUMN,
|
|
201
|
+
CellType.SPANNING,
|
|
202
|
+
CellType.ROW_HEADER,
|
|
203
|
+
CellType.COLUMN_HEADER,
|
|
204
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
202
205
|
]
|
|
203
206
|
|
|
204
207
|
_IMAGES: Mapping[str, str] = {
|
|
@@ -225,7 +228,7 @@ class Pubtables1MStruct(_BuiltInDataset):
|
|
|
225
228
|
def _categories(self) -> DatasetCategories:
|
|
226
229
|
return DatasetCategories(init_categories=_INIT_CATEGORIES_STRUCT)
|
|
227
230
|
|
|
228
|
-
def _builder(self) ->
|
|
231
|
+
def _builder(self) -> Pubtables1MBuilderStruct:
|
|
229
232
|
return Pubtables1MBuilderStruct(location=_LOCATION, annotation_files=_ANNOTATION_FILES_STRUCT)
|
|
230
233
|
|
|
231
234
|
|
|
@@ -299,13 +302,13 @@ class Pubtables1MBuilderStruct(DataFlowBaseBuilder):
|
|
|
299
302
|
filter_empty_image=True,
|
|
300
303
|
fake_score=fake_score,
|
|
301
304
|
category_name_mapping={
|
|
302
|
-
"table": LayoutType.
|
|
303
|
-
"table spanning cell": CellType.
|
|
304
|
-
"table row": LayoutType.
|
|
305
|
-
"table row header": CellType.
|
|
306
|
-
"table projected row header": CellType.
|
|
307
|
-
"table column": LayoutType.
|
|
308
|
-
"table column header": CellType.
|
|
305
|
+
"table": LayoutType.TABLE,
|
|
306
|
+
"table spanning cell": CellType.SPANNING,
|
|
307
|
+
"table row": LayoutType.ROW,
|
|
308
|
+
"table row header": CellType.ROW_HEADER,
|
|
309
|
+
"table projected row header": CellType.PROJECTED_ROW_HEADER,
|
|
310
|
+
"table column": LayoutType.COLUMN,
|
|
311
|
+
"table column header": CellType.COLUMN_HEADER,
|
|
309
312
|
},
|
|
310
313
|
),
|
|
311
314
|
)
|
|
@@ -27,17 +27,18 @@ Module for Pubtabnet dataset. Place the dataset as follows
|
|
|
27
27
|
│ ├── PMC3.png
|
|
28
28
|
├── PubTabNet_2.0.0.jsonl
|
|
29
29
|
"""
|
|
30
|
+
from __future__ import annotations
|
|
30
31
|
|
|
31
|
-
from typing import
|
|
32
|
+
from typing import Mapping, Union
|
|
32
33
|
|
|
33
34
|
from ...dataflow import DataFlow, MapData
|
|
34
35
|
from ...dataflow.custom_serialize import SerializerJsonlines
|
|
35
36
|
from ...datasets.info import DatasetInfo
|
|
36
37
|
from ...mapper.cats import cat_to_sub_cat, filter_cat
|
|
37
38
|
from ...mapper.pubstruct import pub_to_image
|
|
38
|
-
from ...utils.detection_types import JsonDict
|
|
39
39
|
from ...utils.logger import LoggingRecord, logger
|
|
40
40
|
from ...utils.settings import CellType, DatasetType, LayoutType, ObjectTypes, TableType, WordType
|
|
41
|
+
from ...utils.types import PubtabnetDict
|
|
41
42
|
from ..base import _BuiltInDataset
|
|
42
43
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
43
44
|
from ..info import DatasetCategories
|
|
@@ -69,38 +70,38 @@ _URL = (
|
|
|
69
70
|
"pubtabnet.tar.gz?_ga=2.267291150.146828643.1629125962-1173244232.1625045842"
|
|
70
71
|
)
|
|
71
72
|
_SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
|
|
72
|
-
_TYPE = DatasetType.
|
|
73
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
73
74
|
_LOCATION = "pubtabnet"
|
|
74
75
|
_ANNOTATION_FILES: Mapping[str, str] = {"all": "PubTabNet_2.0.0.jsonl"}
|
|
75
76
|
|
|
76
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
77
|
-
_SUB_CATEGORIES:
|
|
77
|
+
_INIT_CATEGORIES = [LayoutType.CELL, TableType.ITEM, LayoutType.TABLE, LayoutType.WORD]
|
|
78
|
+
_SUB_CATEGORIES: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]]
|
|
78
79
|
_SUB_CATEGORIES = {
|
|
79
|
-
TableType.
|
|
80
|
-
LayoutType.
|
|
81
|
-
CellType.
|
|
82
|
-
CellType.
|
|
83
|
-
CellType.
|
|
84
|
-
CellType.
|
|
85
|
-
CellType.
|
|
86
|
-
CellType.
|
|
80
|
+
TableType.ITEM: {TableType.ITEM: [LayoutType.ROW, LayoutType.COLUMN]},
|
|
81
|
+
LayoutType.CELL: {
|
|
82
|
+
CellType.HEADER: [CellType.HEADER, CellType.BODY],
|
|
83
|
+
CellType.ROW_NUMBER: [],
|
|
84
|
+
CellType.COLUMN_NUMBER: [],
|
|
85
|
+
CellType.ROW_SPAN: [],
|
|
86
|
+
CellType.COLUMN_SPAN: [],
|
|
87
|
+
CellType.SPANNING: [CellType.SPANNING],
|
|
87
88
|
},
|
|
88
|
-
CellType.
|
|
89
|
-
CellType.
|
|
90
|
-
CellType.
|
|
91
|
-
CellType.
|
|
92
|
-
CellType.
|
|
93
|
-
CellType.
|
|
89
|
+
CellType.HEADER: {
|
|
90
|
+
CellType.ROW_NUMBER: [],
|
|
91
|
+
CellType.COLUMN_NUMBER: [],
|
|
92
|
+
CellType.ROW_SPAN: [],
|
|
93
|
+
CellType.COLUMN_SPAN: [],
|
|
94
|
+
CellType.SPANNING: [CellType.SPANNING],
|
|
94
95
|
},
|
|
95
|
-
CellType.
|
|
96
|
-
CellType.
|
|
97
|
-
CellType.
|
|
98
|
-
CellType.
|
|
99
|
-
CellType.
|
|
100
|
-
CellType.
|
|
96
|
+
CellType.BODY: {
|
|
97
|
+
CellType.ROW_NUMBER: [],
|
|
98
|
+
CellType.COLUMN_NUMBER: [],
|
|
99
|
+
CellType.ROW_SPAN: [],
|
|
100
|
+
CellType.COLUMN_SPAN: [],
|
|
101
|
+
CellType.SPANNING: [CellType.SPANNING],
|
|
101
102
|
},
|
|
102
|
-
LayoutType.
|
|
103
|
-
LayoutType.
|
|
103
|
+
LayoutType.TABLE: {TableType.HTML: [TableType.HTML]},
|
|
104
|
+
LayoutType.WORD: {WordType.CHARACTERS: [WordType.CHARACTERS]},
|
|
104
105
|
}
|
|
105
106
|
|
|
106
107
|
|
|
@@ -119,7 +120,7 @@ class Pubtabnet(_BuiltInDataset):
|
|
|
119
120
|
def _categories(self) -> DatasetCategories:
|
|
120
121
|
return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)
|
|
121
122
|
|
|
122
|
-
def _builder(self) ->
|
|
123
|
+
def _builder(self) -> PubtabnetBuilder:
|
|
123
124
|
return PubtabnetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
|
|
124
125
|
|
|
125
126
|
|
|
@@ -169,7 +170,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
|
|
|
169
170
|
df = SerializerJsonlines.load(path, max_datapoints=max_datapoints)
|
|
170
171
|
|
|
171
172
|
# Map
|
|
172
|
-
def replace_filename(dp:
|
|
173
|
+
def replace_filename(dp: PubtabnetDict) -> PubtabnetDict:
|
|
173
174
|
dp["filename"] = self.get_workdir() / dp["split"] / dp["filename"]
|
|
174
175
|
return dp
|
|
175
176
|
|
|
@@ -177,7 +178,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
|
|
|
177
178
|
df = MapData(df, lambda dp: dp if dp["split"] == split else None)
|
|
178
179
|
pub_mapper = pub_to_image(
|
|
179
180
|
self.categories.get_categories(name_as_key=True, init=True),
|
|
180
|
-
load_image,
|
|
181
|
+
load_image=load_image,
|
|
181
182
|
fake_score=fake_score,
|
|
182
183
|
rows_and_cols=rows_and_cols,
|
|
183
184
|
dd_pipe_like=dd_pipe_like,
|
|
@@ -186,6 +187,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
|
|
|
186
187
|
)
|
|
187
188
|
|
|
188
189
|
df = MapData(df, pub_mapper)
|
|
190
|
+
|
|
189
191
|
if self.categories.is_cat_to_sub_cat():
|
|
190
192
|
df = MapData(
|
|
191
193
|
df,
|