deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +8 -25
- deepdoctection/analyzer/dd.py +84 -71
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +78 -56
- deepdoctection/datapoint/box.py +7 -7
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +157 -75
- deepdoctection/datapoint/view.py +175 -151
- deepdoctection/datasets/adapter.py +30 -24
- deepdoctection/datasets/base.py +10 -10
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +23 -25
- deepdoctection/datasets/instances/doclaynet.py +48 -49
- deepdoctection/datasets/instances/fintabnet.py +44 -45
- deepdoctection/datasets/instances/funsd.py +23 -23
- deepdoctection/datasets/instances/iiitar13k.py +8 -8
- deepdoctection/datasets/instances/layouttest.py +2 -2
- deepdoctection/datasets/instances/publaynet.py +3 -3
- deepdoctection/datasets/instances/pubtables1m.py +18 -18
- deepdoctection/datasets/instances/pubtabnet.py +30 -29
- deepdoctection/datasets/instances/rvlcdip.py +28 -29
- deepdoctection/datasets/instances/xfund.py +51 -30
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +13 -12
- deepdoctection/eval/eval.py +32 -26
- deepdoctection/eval/tedsmetric.py +16 -12
- deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +69 -89
- deepdoctection/extern/deskew.py +11 -10
- deepdoctection/extern/doctrocr.py +81 -64
- deepdoctection/extern/fastlang.py +23 -16
- deepdoctection/extern/hfdetr.py +53 -38
- deepdoctection/extern/hflayoutlm.py +216 -155
- deepdoctection/extern/hflm.py +35 -30
- deepdoctection/extern/model.py +433 -255
- deepdoctection/extern/pdftext.py +15 -15
- deepdoctection/extern/pt/ptutils.py +4 -2
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +14 -16
- deepdoctection/extern/tp/tfutils.py +16 -2
- deepdoctection/extern/tp/tpcompat.py +11 -7
- deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- deepdoctection/extern/tpdetect.py +40 -45
- deepdoctection/mapper/cats.py +36 -40
- deepdoctection/mapper/cocostruct.py +16 -12
- deepdoctection/mapper/d2struct.py +22 -22
- deepdoctection/mapper/hfstruct.py +7 -7
- deepdoctection/mapper/laylmstruct.py +22 -24
- deepdoctection/mapper/maputils.py +9 -10
- deepdoctection/mapper/match.py +33 -2
- deepdoctection/mapper/misc.py +6 -7
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +6 -6
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/anngen.py +39 -14
- deepdoctection/pipe/base.py +68 -99
- deepdoctection/pipe/common.py +181 -85
- deepdoctection/pipe/concurrency.py +14 -10
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +18 -16
- deepdoctection/pipe/lm.py +49 -47
- deepdoctection/pipe/order.py +63 -65
- deepdoctection/pipe/refine.py +102 -109
- deepdoctection/pipe/segment.py +157 -162
- deepdoctection/pipe/sub_layout.py +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/d2_frcnn_train.py +27 -25
- deepdoctection/train/hf_detr_train.py +22 -18
- deepdoctection/train/hf_layoutlm_train.py +49 -48
- deepdoctection/train/tp_frcnn_train.py +10 -11
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +52 -14
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +41 -14
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +15 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/pdf_utils.py +39 -14
- deepdoctection/utils/settings.py +188 -182
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
- deepdoctection-0.34.dist-info/RECORD +146 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.32.dist-info/RECORD +0 -146
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
|
@@ -33,7 +33,7 @@ Module for Fintabnet dataset. Place the dataset as follows
|
|
|
33
33
|
from __future__ import annotations
|
|
34
34
|
|
|
35
35
|
from pathlib import Path
|
|
36
|
-
from typing import
|
|
36
|
+
from typing import Mapping, Sequence, Union
|
|
37
37
|
|
|
38
38
|
from ...dataflow import DataFlow, MapData, MultiProcessMapData
|
|
39
39
|
from ...dataflow.common import FlattenData
|
|
@@ -43,10 +43,10 @@ from ...mapper.cats import cat_to_sub_cat, filter_cat
|
|
|
43
43
|
from ...mapper.maputils import curry
|
|
44
44
|
from ...mapper.misc import image_ann_to_image, maybe_ann_to_sub_image
|
|
45
45
|
from ...mapper.pubstruct import pub_to_image
|
|
46
|
-
from ...utils.detection_types import JsonDict
|
|
47
46
|
from ...utils.file_utils import set_mp_spawn
|
|
48
47
|
from ...utils.logger import LoggingRecord, logger
|
|
49
48
|
from ...utils.settings import CellType, DatasetType, LayoutType, ObjectTypes, TableType
|
|
49
|
+
from ...utils.types import PubtabnetDict
|
|
50
50
|
from ...utils.utils import to_bool
|
|
51
51
|
from ..base import _BuiltInDataset
|
|
52
52
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
@@ -83,38 +83,38 @@ _URL = (
|
|
|
83
83
|
"fintabnet.tar.gz?_ga=2.17492593.994196051.1634564576-1173244232.1625045842"
|
|
84
84
|
)
|
|
85
85
|
_SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
|
|
86
|
-
_TYPE = DatasetType.
|
|
86
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
87
87
|
_LOCATION = "fintabnet"
|
|
88
88
|
_ANNOTATION_FILES: Mapping[str, str] = {
|
|
89
89
|
"train": "FinTabNet_1.0.0_table_train.jsonl",
|
|
90
90
|
"test": "FinTabNet_1.0.0_table_test.jsonl",
|
|
91
91
|
"val": "FinTabNet_1.0.0_table_val.jsonl",
|
|
92
92
|
}
|
|
93
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
93
|
+
_INIT_CATEGORIES = [LayoutType.TABLE, LayoutType.CELL, TableType.ITEM]
|
|
94
94
|
_SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]]
|
|
95
95
|
_SUB_CATEGORIES = {
|
|
96
|
-
LayoutType.
|
|
97
|
-
CellType.
|
|
98
|
-
CellType.
|
|
99
|
-
CellType.
|
|
100
|
-
CellType.
|
|
101
|
-
CellType.
|
|
102
|
-
CellType.
|
|
96
|
+
LayoutType.CELL: {
|
|
97
|
+
CellType.HEADER: [CellType.HEADER, CellType.BODY],
|
|
98
|
+
CellType.ROW_NUMBER: [],
|
|
99
|
+
CellType.COLUMN_NUMBER: [],
|
|
100
|
+
CellType.ROW_SPAN: [],
|
|
101
|
+
CellType.COLUMN_SPAN: [],
|
|
102
|
+
CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
|
|
103
103
|
},
|
|
104
|
-
TableType.
|
|
105
|
-
CellType.
|
|
106
|
-
CellType.
|
|
107
|
-
CellType.
|
|
108
|
-
CellType.
|
|
109
|
-
CellType.
|
|
110
|
-
CellType.
|
|
104
|
+
TableType.ITEM: {TableType.ITEM: [LayoutType.ROW, LayoutType.COLUMN]},
|
|
105
|
+
CellType.HEADER: {
|
|
106
|
+
CellType.ROW_NUMBER: [],
|
|
107
|
+
CellType.COLUMN_NUMBER: [],
|
|
108
|
+
CellType.ROW_SPAN: [],
|
|
109
|
+
CellType.COLUMN_SPAN: [],
|
|
110
|
+
CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
|
|
111
111
|
},
|
|
112
|
-
CellType.
|
|
113
|
-
CellType.
|
|
114
|
-
CellType.
|
|
115
|
-
CellType.
|
|
116
|
-
CellType.
|
|
117
|
-
CellType.
|
|
112
|
+
CellType.BODY: {
|
|
113
|
+
CellType.ROW_NUMBER: [],
|
|
114
|
+
CellType.COLUMN_NUMBER: [],
|
|
115
|
+
CellType.ROW_SPAN: [],
|
|
116
|
+
CellType.COLUMN_SPAN: [],
|
|
117
|
+
CellType.SPANNING: [CellType.SPANNING, LayoutType.CELL],
|
|
118
118
|
},
|
|
119
119
|
}
|
|
120
120
|
|
|
@@ -201,18 +201,17 @@ class FintabnetBuilder(DataFlowBaseBuilder):
|
|
|
201
201
|
|
|
202
202
|
# Map
|
|
203
203
|
@curry
|
|
204
|
-
def _map_filename(dp:
|
|
204
|
+
def _map_filename(dp: PubtabnetDict, workdir: Path) -> PubtabnetDict:
|
|
205
205
|
dp["filename"] = workdir / "pdf" / dp["filename"]
|
|
206
206
|
return dp
|
|
207
207
|
|
|
208
|
-
|
|
209
|
-
df = MapData(df, map_filename)
|
|
208
|
+
df = MapData(df, _map_filename(self.get_workdir()))
|
|
210
209
|
|
|
211
210
|
buffer_size = 200 if max_datapoints is None else min(max_datapoints, 200) - 1
|
|
212
211
|
|
|
213
212
|
pub_mapper = pub_to_image(
|
|
214
|
-
self.categories.get_categories(name_as_key=True, init=True),
|
|
215
|
-
load_image,
|
|
213
|
+
categories_name_as_key=self.categories.get_categories(name_as_key=True, init=True),
|
|
214
|
+
load_image=load_image,
|
|
216
215
|
fake_score=fake_score,
|
|
217
216
|
rows_and_cols=rows_and_cols,
|
|
218
217
|
dd_pipe_like=False,
|
|
@@ -233,39 +232,39 @@ class FintabnetBuilder(DataFlowBaseBuilder):
|
|
|
233
232
|
if build_mode == "table":
|
|
234
233
|
|
|
235
234
|
@curry
|
|
236
|
-
def _crop_and_add_image(dp: Image, category_names:
|
|
235
|
+
def _crop_and_add_image(dp: Image, category_names: list[str]) -> Image:
|
|
237
236
|
return image_ann_to_image(dp, category_names=category_names)
|
|
238
237
|
|
|
239
238
|
df = MapData(
|
|
240
239
|
df,
|
|
241
240
|
_crop_and_add_image( # pylint: disable=E1120
|
|
242
241
|
category_names=[
|
|
243
|
-
LayoutType.
|
|
244
|
-
LayoutType.
|
|
245
|
-
CellType.
|
|
246
|
-
CellType.
|
|
247
|
-
TableType.
|
|
248
|
-
LayoutType.
|
|
249
|
-
LayoutType.
|
|
242
|
+
LayoutType.TABLE,
|
|
243
|
+
LayoutType.CELL,
|
|
244
|
+
CellType.HEADER,
|
|
245
|
+
CellType.BODY,
|
|
246
|
+
TableType.ITEM,
|
|
247
|
+
LayoutType.ROW,
|
|
248
|
+
LayoutType.COLUMN,
|
|
250
249
|
]
|
|
251
250
|
),
|
|
252
251
|
)
|
|
253
252
|
df = MapData(
|
|
254
253
|
df,
|
|
255
254
|
maybe_ann_to_sub_image( # pylint: disable=E1120 # 259
|
|
256
|
-
category_names_sub_image=LayoutType.
|
|
255
|
+
category_names_sub_image=LayoutType.TABLE,
|
|
257
256
|
category_names=[
|
|
258
|
-
LayoutType.
|
|
259
|
-
CellType.
|
|
260
|
-
CellType.
|
|
261
|
-
TableType.
|
|
262
|
-
LayoutType.
|
|
263
|
-
LayoutType.
|
|
257
|
+
LayoutType.CELL,
|
|
258
|
+
CellType.HEADER,
|
|
259
|
+
CellType.BODY,
|
|
260
|
+
TableType.ITEM,
|
|
261
|
+
LayoutType.ROW,
|
|
262
|
+
LayoutType.COLUMN,
|
|
264
263
|
],
|
|
265
264
|
add_summary=True,
|
|
266
265
|
),
|
|
267
266
|
)
|
|
268
|
-
df = MapData(df, lambda dp: [ann.image for ann in dp.
|
|
267
|
+
df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation(category_names=LayoutType.TABLE)])
|
|
269
268
|
df = FlattenData(df)
|
|
270
269
|
df = MapData(df, lambda dp: dp[0])
|
|
271
270
|
|
|
@@ -41,16 +41,16 @@ from ...dataflow import DataFlow, MapData, SerializerFiles
|
|
|
41
41
|
from ...datasets.info import DatasetInfo
|
|
42
42
|
from ...mapper.cats import cat_to_sub_cat, filter_cat
|
|
43
43
|
from ...mapper.xfundstruct import xfund_to_image
|
|
44
|
-
from ...utils.detection_types import JsonDict, Pathlike
|
|
45
44
|
from ...utils.fs import load_json
|
|
46
45
|
from ...utils.settings import BioTag, DatasetType, LayoutType, ObjectTypes, TokenClasses, TokenClassWithTag, WordType
|
|
46
|
+
from ...utils.types import FunsdDict, PathLikeOrStr
|
|
47
47
|
from ..base import _BuiltInDataset
|
|
48
48
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
49
49
|
from ..info import DatasetCategories
|
|
50
50
|
from ..registry import dataset_registry
|
|
51
51
|
|
|
52
52
|
|
|
53
|
-
def load_file(path_ann:
|
|
53
|
+
def load_file(path_ann: PathLikeOrStr) -> FunsdDict:
|
|
54
54
|
"""
|
|
55
55
|
Loading json file
|
|
56
56
|
|
|
@@ -80,28 +80,28 @@ _LICENSE = (
|
|
|
80
80
|
|
|
81
81
|
_URL = "https://guillaumejaume.github.io/FUNSD/download/"
|
|
82
82
|
_SPLITS: Mapping[str, str] = {"train": "training_data", "test": "testing_data"}
|
|
83
|
-
_TYPE = DatasetType.
|
|
83
|
+
_TYPE = DatasetType.TOKEN_CLASSIFICATION
|
|
84
84
|
_LOCATION = "funsd"
|
|
85
85
|
_ANNOTATION_FILES: Mapping[str, str] = {"train": "annotations", "test": "annotations"}
|
|
86
86
|
|
|
87
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
87
|
+
_INIT_CATEGORIES = [LayoutType.WORD, LayoutType.TEXT]
|
|
88
88
|
_SUB_CATEGORIES: Dict[ObjectTypes, Dict[ObjectTypes, List[ObjectTypes]]]
|
|
89
89
|
_SUB_CATEGORIES = {
|
|
90
|
-
LayoutType.
|
|
91
|
-
WordType.
|
|
92
|
-
WordType.
|
|
93
|
-
WordType.
|
|
94
|
-
TokenClassWithTag.
|
|
95
|
-
TokenClassWithTag.
|
|
96
|
-
TokenClassWithTag.
|
|
97
|
-
TokenClassWithTag.
|
|
98
|
-
TokenClassWithTag.
|
|
99
|
-
TokenClassWithTag.
|
|
100
|
-
BioTag.
|
|
90
|
+
LayoutType.WORD: {
|
|
91
|
+
WordType.TOKEN_CLASS: [TokenClasses.OTHER, TokenClasses.QUESTION, TokenClasses.ANSWER, TokenClasses.HEADER],
|
|
92
|
+
WordType.TAG: [BioTag.INSIDE, BioTag.OUTSIDE, BioTag.BEGIN],
|
|
93
|
+
WordType.TOKEN_TAG: [
|
|
94
|
+
TokenClassWithTag.B_ANSWER,
|
|
95
|
+
TokenClassWithTag.B_HEADER,
|
|
96
|
+
TokenClassWithTag.B_QUESTION,
|
|
97
|
+
TokenClassWithTag.I_ANSWER,
|
|
98
|
+
TokenClassWithTag.I_HEADER,
|
|
99
|
+
TokenClassWithTag.I_QUESTION,
|
|
100
|
+
BioTag.OUTSIDE,
|
|
101
101
|
],
|
|
102
102
|
},
|
|
103
|
-
LayoutType.
|
|
104
|
-
WordType.
|
|
103
|
+
LayoutType.TEXT: {
|
|
104
|
+
WordType.TOKEN_CLASS: [TokenClasses.OTHER, TokenClasses.QUESTION, TokenClasses.ANSWER, TokenClasses.HEADER]
|
|
105
105
|
},
|
|
106
106
|
}
|
|
107
107
|
|
|
@@ -159,14 +159,14 @@ class FunsdBuilder(DataFlowBaseBuilder):
|
|
|
159
159
|
# Map
|
|
160
160
|
categories_name_as_key = self.categories.get_categories(init=True, name_as_key=True)
|
|
161
161
|
category_names_mapping = {
|
|
162
|
-
"other": TokenClasses.
|
|
163
|
-
"question": TokenClasses.
|
|
164
|
-
"answer": TokenClasses.
|
|
165
|
-
"header": TokenClasses.
|
|
162
|
+
"other": TokenClasses.OTHER,
|
|
163
|
+
"question": TokenClasses.QUESTION,
|
|
164
|
+
"answer": TokenClasses.ANSWER,
|
|
165
|
+
"header": TokenClasses.HEADER,
|
|
166
166
|
}
|
|
167
167
|
ner_token_to_id_mapping = self.categories.get_sub_categories(
|
|
168
|
-
categories=LayoutType.
|
|
169
|
-
sub_categories={LayoutType.
|
|
168
|
+
categories=LayoutType.WORD,
|
|
169
|
+
sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
|
|
170
170
|
keys=False,
|
|
171
171
|
values_as_dict=True,
|
|
172
172
|
name_as_key=True,
|
|
@@ -47,10 +47,10 @@ from ...datasets.info import DatasetInfo
|
|
|
47
47
|
from ...mapper.maputils import curry
|
|
48
48
|
from ...mapper.misc import xml_to_dict
|
|
49
49
|
from ...mapper.pascalstruct import pascal_voc_dict_to_image
|
|
50
|
-
from ...utils.detection_types import JsonDict
|
|
51
50
|
from ...utils.file_utils import lxml_available
|
|
52
51
|
from ...utils.fs import get_package_path
|
|
53
52
|
from ...utils.settings import DatasetType, LayoutType
|
|
53
|
+
from ...utils.types import JsonDict
|
|
54
54
|
from ..base import _BuiltInDataset
|
|
55
55
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
56
56
|
from ..info import DatasetCategories
|
|
@@ -76,7 +76,7 @@ _LICENSE = "NN"
|
|
|
76
76
|
_URL = "http://cvit.iiit.ac.in/usodi/iiitar13k.php"
|
|
77
77
|
|
|
78
78
|
_SPLITS: Mapping[str, str] = {"train": "training_images", "val": "validation_images", "test": "test_images"}
|
|
79
|
-
_TYPE = DatasetType.
|
|
79
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
80
80
|
_LOCATION = "iiitar13k"
|
|
81
81
|
_ANNOTATION_FILES: Mapping[str, str] = {
|
|
82
82
|
"train": "training_xml",
|
|
@@ -84,7 +84,7 @@ _ANNOTATION_FILES: Mapping[str, str] = {
|
|
|
84
84
|
"test": "test_xml",
|
|
85
85
|
}
|
|
86
86
|
|
|
87
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
87
|
+
_INIT_CATEGORIES = [LayoutType.TABLE, LayoutType.LOGO, LayoutType.FIGURE, LayoutType.SIGNATURE]
|
|
88
88
|
|
|
89
89
|
|
|
90
90
|
@dataset_registry.register("iiitar13k")
|
|
@@ -176,11 +176,11 @@ class IIITar13KBuilder(DataFlowBaseBuilder):
|
|
|
176
176
|
filter_empty_image=True,
|
|
177
177
|
fake_score=fake_score,
|
|
178
178
|
category_name_mapping={
|
|
179
|
-
"natural_image": LayoutType.
|
|
180
|
-
"figure": LayoutType.
|
|
181
|
-
"logo": LayoutType.
|
|
182
|
-
"signature": LayoutType.
|
|
183
|
-
"table": LayoutType.
|
|
179
|
+
"natural_image": LayoutType.FIGURE,
|
|
180
|
+
"figure": LayoutType.FIGURE,
|
|
181
|
+
"logo": LayoutType.LOGO,
|
|
182
|
+
"signature": LayoutType.SIGNATURE,
|
|
183
|
+
"table": LayoutType.TABLE,
|
|
184
184
|
},
|
|
185
185
|
),
|
|
186
186
|
)
|
|
@@ -52,7 +52,7 @@ _LICENSE = (
|
|
|
52
52
|
)
|
|
53
53
|
|
|
54
54
|
_SPLITS: Mapping[str, str] = {"test": "test", "predict": "predict"}
|
|
55
|
-
_TYPE = DatasetType.
|
|
55
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
56
56
|
_LOCATION = "testlayout"
|
|
57
57
|
|
|
58
58
|
_ANNOTATION_FILES: Mapping[str, str] = {
|
|
@@ -60,7 +60,7 @@ _ANNOTATION_FILES: Mapping[str, str] = {
|
|
|
60
60
|
"predict": "xrf_layout_test_predict.jsonl",
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
63
|
+
_INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutType.TABLE, LayoutType.FIGURE]
|
|
64
64
|
|
|
65
65
|
|
|
66
66
|
@dataset_registry.register("testlayout")
|
|
@@ -61,12 +61,12 @@ _URL = (
|
|
|
61
61
|
"publaynet.tar.gz?_ga=2.23017467.1796315263.1628754613-1173244232.1625045842"
|
|
62
62
|
)
|
|
63
63
|
_SPLITS: Mapping[str, str] = {"train": "train", "val": "val"}
|
|
64
|
-
_TYPE = DatasetType.
|
|
64
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
65
65
|
|
|
66
66
|
_LOCATION = "publaynet"
|
|
67
67
|
|
|
68
68
|
_ANNOTATION_FILES: Mapping[str, str] = {"train": "train.json", "val": "val.json"}
|
|
69
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
69
|
+
_INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutType.TABLE, LayoutType.FIGURE]
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
@dataset_registry.register("publaynet")
|
|
@@ -120,7 +120,7 @@ class PublaynetBuilder(DataFlowBaseBuilder):
|
|
|
120
120
|
df = SerializerCoco.load(path, max_datapoints=max_datapoints)
|
|
121
121
|
|
|
122
122
|
# Map
|
|
123
|
-
df = MapDataComponent(df, lambda dp: self.get_workdir() / self.get_split(split) / dp, "file_name")
|
|
123
|
+
df = MapDataComponent(df, lambda dp: (self.get_workdir() / self.get_split(split) / dp).as_posix(), "file_name")
|
|
124
124
|
coco_mapper = coco_to_image( # pylint: disable=E1120 # 259
|
|
125
125
|
self.categories.get_categories(init=True),
|
|
126
126
|
load_image,
|
|
@@ -50,10 +50,10 @@ from ...mapper.cats import filter_cat
|
|
|
50
50
|
from ...mapper.maputils import curry
|
|
51
51
|
from ...mapper.misc import xml_to_dict
|
|
52
52
|
from ...mapper.pascalstruct import pascal_voc_dict_to_image
|
|
53
|
-
from ...utils.detection_types import JsonDict
|
|
54
53
|
from ...utils.file_utils import lxml_available
|
|
55
54
|
from ...utils.fs import get_package_path
|
|
56
55
|
from ...utils.settings import CellType, DatasetType, LayoutType
|
|
56
|
+
from ...utils.types import JsonDict
|
|
57
57
|
from ..base import _BuiltInDataset
|
|
58
58
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
59
59
|
from ..info import DatasetCategories
|
|
@@ -80,14 +80,14 @@ _LICENSE = "Community Data License Agreement – Permissive, Version 1.0"
|
|
|
80
80
|
_URL = "https://msropendata.com/datasets/505fcbe3-1383-42b1-913a-f651b8b712d3"
|
|
81
81
|
|
|
82
82
|
_SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
|
|
83
|
-
_TYPE = DatasetType.
|
|
83
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
84
84
|
_LOCATION = "PubTables1M"
|
|
85
85
|
_ANNOTATION_FILES: Mapping[str, str] = {
|
|
86
86
|
"train": "PubTables1M-Detection-PASCAL-VOC/train",
|
|
87
87
|
"val": "PubTables1M-Detection-PASCAL-VOC/val",
|
|
88
88
|
"test": "PubTables1M-Detection-PASCAL-VOC/test",
|
|
89
89
|
}
|
|
90
|
-
_INIT_CATEGORIES_DET = [LayoutType.
|
|
90
|
+
_INIT_CATEGORIES_DET = [LayoutType.TABLE, LayoutType.TABLE_ROTATED]
|
|
91
91
|
|
|
92
92
|
|
|
93
93
|
@dataset_registry.register("pubtables1m_det")
|
|
@@ -180,7 +180,7 @@ class Pubtables1MBuilder(DataFlowBaseBuilder):
|
|
|
180
180
|
load_image,
|
|
181
181
|
filter_empty_image=True,
|
|
182
182
|
fake_score=fake_score,
|
|
183
|
-
category_name_mapping={"table": LayoutType.
|
|
183
|
+
category_name_mapping={"table": LayoutType.TABLE, "table rotated": LayoutType.TABLE_ROTATED},
|
|
184
184
|
),
|
|
185
185
|
)
|
|
186
186
|
|
|
@@ -195,13 +195,13 @@ _ANNOTATION_FILES_STRUCT: Mapping[str, str] = {
|
|
|
195
195
|
}
|
|
196
196
|
|
|
197
197
|
_INIT_CATEGORIES_STRUCT = [
|
|
198
|
-
LayoutType.
|
|
199
|
-
LayoutType.
|
|
200
|
-
LayoutType.
|
|
201
|
-
CellType.
|
|
202
|
-
CellType.
|
|
203
|
-
CellType.
|
|
204
|
-
CellType.
|
|
198
|
+
LayoutType.TABLE,
|
|
199
|
+
LayoutType.ROW,
|
|
200
|
+
LayoutType.COLUMN,
|
|
201
|
+
CellType.SPANNING,
|
|
202
|
+
CellType.ROW_HEADER,
|
|
203
|
+
CellType.COLUMN_HEADER,
|
|
204
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
205
205
|
]
|
|
206
206
|
|
|
207
207
|
_IMAGES: Mapping[str, str] = {
|
|
@@ -302,13 +302,13 @@ class Pubtables1MBuilderStruct(DataFlowBaseBuilder):
|
|
|
302
302
|
filter_empty_image=True,
|
|
303
303
|
fake_score=fake_score,
|
|
304
304
|
category_name_mapping={
|
|
305
|
-
"table": LayoutType.
|
|
306
|
-
"table spanning cell": CellType.
|
|
307
|
-
"table row": LayoutType.
|
|
308
|
-
"table row header": CellType.
|
|
309
|
-
"table projected row header": CellType.
|
|
310
|
-
"table column": LayoutType.
|
|
311
|
-
"table column header": CellType.
|
|
305
|
+
"table": LayoutType.TABLE,
|
|
306
|
+
"table spanning cell": CellType.SPANNING,
|
|
307
|
+
"table row": LayoutType.ROW,
|
|
308
|
+
"table row header": CellType.ROW_HEADER,
|
|
309
|
+
"table projected row header": CellType.PROJECTED_ROW_HEADER,
|
|
310
|
+
"table column": LayoutType.COLUMN,
|
|
311
|
+
"table column header": CellType.COLUMN_HEADER,
|
|
312
312
|
},
|
|
313
313
|
),
|
|
314
314
|
)
|
|
@@ -29,16 +29,16 @@ Module for Pubtabnet dataset. Place the dataset as follows
|
|
|
29
29
|
"""
|
|
30
30
|
from __future__ import annotations
|
|
31
31
|
|
|
32
|
-
from typing import
|
|
32
|
+
from typing import Mapping, Union
|
|
33
33
|
|
|
34
34
|
from ...dataflow import DataFlow, MapData
|
|
35
35
|
from ...dataflow.custom_serialize import SerializerJsonlines
|
|
36
36
|
from ...datasets.info import DatasetInfo
|
|
37
37
|
from ...mapper.cats import cat_to_sub_cat, filter_cat
|
|
38
38
|
from ...mapper.pubstruct import pub_to_image
|
|
39
|
-
from ...utils.detection_types import JsonDict
|
|
40
39
|
from ...utils.logger import LoggingRecord, logger
|
|
41
40
|
from ...utils.settings import CellType, DatasetType, LayoutType, ObjectTypes, TableType, WordType
|
|
41
|
+
from ...utils.types import PubtabnetDict
|
|
42
42
|
from ..base import _BuiltInDataset
|
|
43
43
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
44
44
|
from ..info import DatasetCategories
|
|
@@ -70,38 +70,38 @@ _URL = (
|
|
|
70
70
|
"pubtabnet.tar.gz?_ga=2.267291150.146828643.1629125962-1173244232.1625045842"
|
|
71
71
|
)
|
|
72
72
|
_SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
|
|
73
|
-
_TYPE = DatasetType.
|
|
73
|
+
_TYPE = DatasetType.OBJECT_DETECTION
|
|
74
74
|
_LOCATION = "pubtabnet"
|
|
75
75
|
_ANNOTATION_FILES: Mapping[str, str] = {"all": "PubTabNet_2.0.0.jsonl"}
|
|
76
76
|
|
|
77
|
-
_INIT_CATEGORIES = [LayoutType.
|
|
78
|
-
_SUB_CATEGORIES:
|
|
77
|
+
_INIT_CATEGORIES = [LayoutType.CELL, TableType.ITEM, LayoutType.TABLE, LayoutType.WORD]
|
|
78
|
+
_SUB_CATEGORIES: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]]
|
|
79
79
|
_SUB_CATEGORIES = {
|
|
80
|
-
TableType.
|
|
81
|
-
LayoutType.
|
|
82
|
-
CellType.
|
|
83
|
-
CellType.
|
|
84
|
-
CellType.
|
|
85
|
-
CellType.
|
|
86
|
-
CellType.
|
|
87
|
-
CellType.
|
|
80
|
+
TableType.ITEM: {TableType.ITEM: [LayoutType.ROW, LayoutType.COLUMN]},
|
|
81
|
+
LayoutType.CELL: {
|
|
82
|
+
CellType.HEADER: [CellType.HEADER, CellType.BODY],
|
|
83
|
+
CellType.ROW_NUMBER: [],
|
|
84
|
+
CellType.COLUMN_NUMBER: [],
|
|
85
|
+
CellType.ROW_SPAN: [],
|
|
86
|
+
CellType.COLUMN_SPAN: [],
|
|
87
|
+
CellType.SPANNING: [CellType.SPANNING],
|
|
88
88
|
},
|
|
89
|
-
CellType.
|
|
90
|
-
CellType.
|
|
91
|
-
CellType.
|
|
92
|
-
CellType.
|
|
93
|
-
CellType.
|
|
94
|
-
CellType.
|
|
89
|
+
CellType.HEADER: {
|
|
90
|
+
CellType.ROW_NUMBER: [],
|
|
91
|
+
CellType.COLUMN_NUMBER: [],
|
|
92
|
+
CellType.ROW_SPAN: [],
|
|
93
|
+
CellType.COLUMN_SPAN: [],
|
|
94
|
+
CellType.SPANNING: [CellType.SPANNING],
|
|
95
95
|
},
|
|
96
|
-
CellType.
|
|
97
|
-
CellType.
|
|
98
|
-
CellType.
|
|
99
|
-
CellType.
|
|
100
|
-
CellType.
|
|
101
|
-
CellType.
|
|
96
|
+
CellType.BODY: {
|
|
97
|
+
CellType.ROW_NUMBER: [],
|
|
98
|
+
CellType.COLUMN_NUMBER: [],
|
|
99
|
+
CellType.ROW_SPAN: [],
|
|
100
|
+
CellType.COLUMN_SPAN: [],
|
|
101
|
+
CellType.SPANNING: [CellType.SPANNING],
|
|
102
102
|
},
|
|
103
|
-
LayoutType.
|
|
104
|
-
LayoutType.
|
|
103
|
+
LayoutType.TABLE: {TableType.HTML: [TableType.HTML]},
|
|
104
|
+
LayoutType.WORD: {WordType.CHARACTERS: [WordType.CHARACTERS]},
|
|
105
105
|
}
|
|
106
106
|
|
|
107
107
|
|
|
@@ -170,7 +170,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
|
|
|
170
170
|
df = SerializerJsonlines.load(path, max_datapoints=max_datapoints)
|
|
171
171
|
|
|
172
172
|
# Map
|
|
173
|
-
def replace_filename(dp:
|
|
173
|
+
def replace_filename(dp: PubtabnetDict) -> PubtabnetDict:
|
|
174
174
|
dp["filename"] = self.get_workdir() / dp["split"] / dp["filename"]
|
|
175
175
|
return dp
|
|
176
176
|
|
|
@@ -178,7 +178,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
|
|
|
178
178
|
df = MapData(df, lambda dp: dp if dp["split"] == split else None)
|
|
179
179
|
pub_mapper = pub_to_image(
|
|
180
180
|
self.categories.get_categories(name_as_key=True, init=True),
|
|
181
|
-
load_image,
|
|
181
|
+
load_image=load_image,
|
|
182
182
|
fake_score=fake_score,
|
|
183
183
|
rows_and_cols=rows_and_cols,
|
|
184
184
|
dd_pipe_like=dd_pipe_like,
|
|
@@ -187,6 +187,7 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
|
|
|
187
187
|
)
|
|
188
188
|
|
|
189
189
|
df = MapData(df, pub_mapper)
|
|
190
|
+
|
|
190
191
|
if self.categories.is_cat_to_sub_cat():
|
|
191
192
|
df = MapData(
|
|
192
193
|
df,
|
|
@@ -36,12 +36,12 @@ from typing import Mapping, Union
|
|
|
36
36
|
|
|
37
37
|
from ...dataflow import DataFlow, MapData
|
|
38
38
|
from ...dataflow.custom_serialize import SerializerTabsepFiles
|
|
39
|
-
from ...datapoint.annotation import CategoryAnnotation
|
|
39
|
+
from ...datapoint.annotation import CategoryAnnotation
|
|
40
40
|
from ...datapoint.image import Image
|
|
41
41
|
from ...mapper.cats import filter_summary
|
|
42
42
|
from ...mapper.maputils import curry
|
|
43
43
|
from ...utils.fs import load_image_from_file
|
|
44
|
-
from ...utils.settings import DatasetType, DocumentType, PageType, TypeOrStr
|
|
44
|
+
from ...utils.settings import DatasetType, DocumentType, PageType, SummaryType, TypeOrStr
|
|
45
45
|
from ..base import _BuiltInDataset
|
|
46
46
|
from ..dataflow_builder import DataFlowBaseBuilder
|
|
47
47
|
from ..info import DatasetCategories, DatasetInfo
|
|
@@ -64,27 +64,27 @@ _LICENSE = (
|
|
|
64
64
|
_URL = "https://www.cs.cmu.edu/~aharley/rvl-cdip/"
|
|
65
65
|
|
|
66
66
|
_SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
|
|
67
|
-
_TYPE = DatasetType.
|
|
67
|
+
_TYPE = DatasetType.SEQUENCE_CLASSIFICATION
|
|
68
68
|
_LOCATION = "rvl-cdip"
|
|
69
69
|
|
|
70
70
|
_ANNOTATION_FILES: Mapping[str, str] = {"train": "labels/train.txt", "val": "labels/val.txt", "test": "labels/test.txt"}
|
|
71
71
|
_INIT_CATEGORIES = [
|
|
72
|
-
DocumentType.
|
|
73
|
-
DocumentType.
|
|
74
|
-
DocumentType.
|
|
75
|
-
DocumentType.
|
|
76
|
-
DocumentType.
|
|
77
|
-
DocumentType.
|
|
78
|
-
DocumentType.
|
|
79
|
-
DocumentType.
|
|
80
|
-
DocumentType.
|
|
81
|
-
DocumentType.
|
|
82
|
-
DocumentType.
|
|
83
|
-
DocumentType.
|
|
84
|
-
DocumentType.
|
|
85
|
-
DocumentType.
|
|
86
|
-
DocumentType.
|
|
87
|
-
DocumentType.
|
|
72
|
+
DocumentType.LETTER,
|
|
73
|
+
DocumentType.FORM,
|
|
74
|
+
DocumentType.EMAIL,
|
|
75
|
+
DocumentType.HANDWRITTEN,
|
|
76
|
+
DocumentType.ADVERTISEMENT,
|
|
77
|
+
DocumentType.SCIENTIFIC_REPORT,
|
|
78
|
+
DocumentType.SCIENTIFIC_PUBLICATION,
|
|
79
|
+
DocumentType.SPECIFICATION,
|
|
80
|
+
DocumentType.FILE_FOLDER,
|
|
81
|
+
DocumentType.NEWS_ARTICLE,
|
|
82
|
+
DocumentType.BUDGET,
|
|
83
|
+
DocumentType.INVOICE,
|
|
84
|
+
DocumentType.PRESENTATION,
|
|
85
|
+
DocumentType.QUESTIONNAIRE,
|
|
86
|
+
DocumentType.RESUME,
|
|
87
|
+
DocumentType.MEMO,
|
|
88
88
|
]
|
|
89
89
|
|
|
90
90
|
|
|
@@ -139,15 +139,15 @@ class RvlcdipBuilder(DataFlowBaseBuilder):
|
|
|
139
139
|
|
|
140
140
|
@curry
|
|
141
141
|
def _map_str_to_image(dp: str, load_img: bool) -> Image:
|
|
142
|
-
location,
|
|
143
|
-
label =
|
|
142
|
+
location, label_str = dp.split()[0], dp.split()[1]
|
|
143
|
+
label = int(label_str) + 1
|
|
144
144
|
file_name = os.path.split(location)[1]
|
|
145
145
|
image = Image(location=(self.get_workdir() / "images" / location).as_posix(), file_name=file_name)
|
|
146
146
|
image.image = load_image_from_file(image.location)
|
|
147
|
-
summary =
|
|
147
|
+
summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)
|
|
148
148
|
categories_dict = self.categories.get_categories(init=True)
|
|
149
149
|
summary.dump_sub_category(
|
|
150
|
-
PageType.
|
|
150
|
+
PageType.DOCUMENT_TYPE, CategoryAnnotation(category_name=categories_dict[label], category_id=label)
|
|
151
151
|
)
|
|
152
152
|
image.summary = summary
|
|
153
153
|
if not load_img:
|
|
@@ -159,15 +159,14 @@ class RvlcdipBuilder(DataFlowBaseBuilder):
|
|
|
159
159
|
if self.categories.is_filtered():
|
|
160
160
|
df = MapData(
|
|
161
161
|
df,
|
|
162
|
-
filter_summary({PageType.
|
|
162
|
+
filter_summary({PageType.DOCUMENT_TYPE: self.categories.get_categories(as_dict=False, filtered=True)}),
|
|
163
163
|
)
|
|
164
164
|
|
|
165
165
|
@curry
|
|
166
|
-
def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr,
|
|
167
|
-
if dp.summary:
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
|
|
166
|
+
def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, int]) -> Image:
|
|
167
|
+
if PageType.DOCUMENT_TYPE in dp.summary.sub_categories:
|
|
168
|
+
summary_cat = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
|
|
169
|
+
summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
|
|
171
170
|
return dp
|
|
172
171
|
|
|
173
172
|
df = MapData(df, _re_map_cat_ids(self.categories.get_categories(filtered=True, name_as_key=True)))
|