deepdoctection 0.31-py3-none-any.whl → 0.33-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +16 -29
- deepdoctection/analyzer/dd.py +70 -59
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +41 -56
- deepdoctection/datapoint/box.py +9 -8
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +56 -44
- deepdoctection/datapoint/view.py +245 -150
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +35 -26
- deepdoctection/datasets/base.py +14 -12
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +24 -26
- deepdoctection/datasets/instances/doclaynet.py +51 -51
- deepdoctection/datasets/instances/fintabnet.py +46 -46
- deepdoctection/datasets/instances/funsd.py +25 -24
- deepdoctection/datasets/instances/iiitar13k.py +13 -10
- deepdoctection/datasets/instances/layouttest.py +4 -3
- deepdoctection/datasets/instances/publaynet.py +5 -5
- deepdoctection/datasets/instances/pubtables1m.py +24 -21
- deepdoctection/datasets/instances/pubtabnet.py +32 -30
- deepdoctection/datasets/instances/rvlcdip.py +30 -30
- deepdoctection/datasets/instances/xfund.py +26 -26
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +15 -13
- deepdoctection/eval/eval.py +41 -37
- deepdoctection/eval/tedsmetric.py +30 -23
- deepdoctection/eval/tp_eval_callback.py +16 -19
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +85 -113
- deepdoctection/extern/deskew.py +14 -11
- deepdoctection/extern/doctrocr.py +141 -130
- deepdoctection/extern/fastlang.py +27 -18
- deepdoctection/extern/hfdetr.py +71 -62
- deepdoctection/extern/hflayoutlm.py +504 -211
- deepdoctection/extern/hflm.py +230 -0
- deepdoctection/extern/model.py +488 -302
- deepdoctection/extern/pdftext.py +23 -19
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +29 -19
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +18 -18
- deepdoctection/extern/tp/tfutils.py +57 -9
- deepdoctection/extern/tp/tpcompat.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +45 -53
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/cats.py +27 -29
- deepdoctection/mapper/cocostruct.py +10 -10
- deepdoctection/mapper/d2struct.py +27 -26
- deepdoctection/mapper/hfstruct.py +13 -8
- deepdoctection/mapper/laylmstruct.py +178 -37
- deepdoctection/mapper/maputils.py +12 -11
- deepdoctection/mapper/match.py +2 -2
- deepdoctection/mapper/misc.py +11 -9
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +5 -5
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +5 -5
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +12 -14
- deepdoctection/pipe/base.py +52 -106
- deepdoctection/pipe/common.py +72 -59
- deepdoctection/pipe/concurrency.py +16 -11
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +20 -16
- deepdoctection/pipe/lm.py +75 -105
- deepdoctection/pipe/order.py +194 -89
- deepdoctection/pipe/refine.py +111 -124
- deepdoctection/pipe/segment.py +156 -161
- deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +48 -41
- deepdoctection/train/hf_detr_train.py +41 -30
- deepdoctection/train/hf_layoutlm_train.py +153 -135
- deepdoctection/train/tp_frcnn_train.py +32 -31
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +87 -125
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +22 -18
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +16 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +11 -11
- deepdoctection/utils/settings.py +185 -181
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +74 -72
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
- deepdoctection-0.33.dist-info/RECORD +146 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.31.dist-info/RECORD +0 -144
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/__init__.py
CHANGED

@@ -26,13 +26,10 @@ Create an info card, a DataFlowBaseBuilder derived instance, possibly a category
 DatasetBase derived instance to create a data set.
 """
 
-from
+from .adapter import *
 from .base import *
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import *
 from .instances import *
 from .registry import *
 from .save import *
-
-if pytorch_available():
-    from .adapter import *
deepdoctection/datasets/adapter.py
CHANGED

@@ -22,19 +22,22 @@ Module for wrapping datasets into a pytorch dataset framework.
 
 from typing import Any, Callable, Iterator, Mapping, Optional, Union
 
-from
+from lazy_imports import try_import
+
+from ..dataflow import CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
-from ..utils.detection_types import DP, JsonDict
-from ..utils.file_utils import pytorch_available
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
+from ..utils.types import DP, JsonDict
 from .registry import get_dataset
 
-
+with try_import() as import_guard:
     from torch.utils.data import IterableDataset
+if not import_guard.is_successful():
+    from ..utils.mocks import IterableDataset  # type: ignore
 
 
 class DatasetAdapter(IterableDataset):  # type: ignore
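The guard on `pytorch_available()` is gone: the torch import is now attempted through the `lazy-imports` package and falls back to a mock when torch is missing. A minimal sketch of the same pattern, assuming only that `lazy-imports` is installed; the fallback class here is a stand-in for `deepdoctection.utils.mocks.IterableDataset`:

# Sketch of the lazy-import pattern used above.
from lazy_imports import try_import

with try_import() as import_guard:
    # Only resolved if torch is installed; ImportError is swallowed by the guard.
    from torch.utils.data import IterableDataset

if not import_guard.is_successful():
    # Placeholder so class definitions that inherit from IterableDataset still import.
    class IterableDataset:  # type: ignore
        pass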
@@ -54,6 +57,7 @@ class DatasetAdapter(IterableDataset):  # type: ignore
         cache_dataset: bool,
         image_to_framework_func: Optional[Callable[[DP], Optional[JsonDict]]] = None,
         use_token_tag: bool = True,
+        number_repetitions: int = -1,
         **build_kwargs: str,
     ) -> None:
         """
@@ -66,6 +70,12 @@ class DatasetAdapter(IterableDataset):  # type: ignore
                               `WordType.token_class`.
        :param build_kwargs: optional parameters for defining the dataflow.
        """
+       if number_repetitions == -1 and not cache_dataset:
+           raise ValueError(
+               "Number of repetitions cannot be infinite when not caching the dataset. Instead try to"
+               " set a high number of repetitions"
+           )
+
        if isinstance(name_or_dataset, str):
            self.dataset = get_dataset(name_or_dataset)
        else:
@@ -75,22 +85,22 @@ class DatasetAdapter(IterableDataset):  # type: ignore
 
        if cache_dataset:
            logger.info(LoggingRecord("Yielding dataflow into memory and create torch dataset"))
-           categories: Mapping[
+           categories: Mapping[int, ObjectTypes] = {}
            _data_statistics = True
-           if self.dataset.dataset_info.type in (DatasetType.
+           if self.dataset.dataset_info.type in (DatasetType.OBJECT_DETECTION, DatasetType.SEQUENCE_CLASSIFICATION):
                categories = self.dataset.dataflow.categories.get_categories(filtered=True)
-           elif self.dataset.dataset_info.type in (DatasetType.
+           elif self.dataset.dataset_info.type in (DatasetType.TOKEN_CLASSIFICATION,):
                if use_token_tag:
                    categories = self.dataset.dataflow.categories.get_sub_categories(
-                       categories=LayoutType.
-                       sub_categories={LayoutType.
+                       categories=LayoutType.WORD,
+                       sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG]},
                        keys=False,
                        values_as_dict=True,
-                   )[LayoutType.
+                   )[LayoutType.WORD][WordType.TOKEN_TAG]
                else:
                    categories = self.dataset.dataflow.categories.get_sub_categories(
-                       categories=LayoutType.
-                   )[LayoutType.
+                       categories=LayoutType.WORD, sub_categories={LayoutType.WORD: [WordType.TOKEN_CLASS]}, keys=False
+                   )[LayoutType.WORD][WordType.TOKEN_CLASS]
            else:
                logger.info(
                    LoggingRecord(f"dataset is of type {self.dataset.dataset_info.type}. Cannot generate statistics.")
@@ -118,19 +128,19 @@ class DatasetAdapter(IterableDataset):  # type: ignore
                    "images when needed and reduce memory costs!!!",
                    "warn",
                )
-           if self.dataset.dataset_info.type == DatasetType.
+           if self.dataset.dataset_info.type == DatasetType.OBJECT_DETECTION:
                anns = dp.get_annotation()
-               cat_ids = [
+               cat_ids = [ann.category_id for ann in anns]
 
-           elif self.dataset.dataset_info.type == DatasetType.
-               cat_ids = dp.summary.get_sub_category(PageType.
+           elif self.dataset.dataset_info.type == DatasetType.SEQUENCE_CLASSIFICATION:
+               cat_ids = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id
 
-           elif self.dataset.dataset_info.type == DatasetType.
-               anns = dp.get_annotation(category_names=LayoutType.
+           elif self.dataset.dataset_info.type == DatasetType.TOKEN_CLASSIFICATION:
+               anns = dp.get_annotation(category_names=LayoutType.WORD)
                if use_token_tag:
-                   cat_ids = [ann.get_sub_category(WordType.
+                   cat_ids = [ann.get_sub_category(WordType.TOKEN_TAG).category_id for ann in anns]
                else:
-                   cat_ids = [ann.get_sub_category(WordType.
+                   cat_ids = [ann.get_sub_category(WordType.TOKEN_CLASS).category_id for ann in anns]
 
            if _data_statistics:
                summarizer.dump(cat_ids)
@@ -141,14 +151,13 @@ class DatasetAdapter(IterableDataset):  # type: ignore
            if _data_statistics:
                summarizer.print_summary_histogram()
            self.number_datapoints = len(datapoints)
+           if not self.number_datapoints:
+               raise ValueError("DatasetAdapter receives no datapoints. Please check your dataflow build config.")
 
            df = CustomDataFromList(datapoints, shuffle=True)
-
-
-
-           df_list = CacheData(df).get_cache()
-           df = CustomDataFromList(df_list, shuffle=True)
-           df = RepeatedData(df, -1)
+           df = RepeatedData(df, number_repetitions)
+       else:
+           df = RepeatedData(df, number_repetitions)
 
        if image_to_framework_func:
            df = MapData(df, image_to_framework_func)
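The new `number_repetitions` argument replaces the hard-coded `RepeatedData(df, -1)`, and the constructor now rejects infinite repetition when the dataset is not cached. A hedged usage sketch; the dataset name and the build kwargs (`split`, `max_datapoints`) are illustrative assumptions, not taken from this diff:

# Hedged sketch of the new constructor argument.
from deepdoctection.datasets.adapter import DatasetAdapter

adapter = DatasetAdapter(
    name_or_dataset="doclaynet",   # assumed registry name
    cache_dataset=True,            # required if number_repetitions stays at -1 (infinite)
    number_repetitions=-1,         # with cache_dataset=False, pass a finite count instead
    split="train",                 # assumed build kwarg
    max_datapoints="1000",         # assumed build kwarg (build_kwargs are typed as str)
)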
deepdoctection/datasets/base.py
CHANGED
@@ -18,6 +18,8 @@
 """
 Module for the base class of datasets.
 """
+from __future__ import annotations
+
 import json
 import os
 import pprint
@@ -25,15 +27,15 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from inspect import signature
 from pathlib import Path
-from typing import Any,
+from typing import Any, Mapping, Optional, Sequence, Type, Union
 
 import numpy as np
 
 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
 from ..datapoint.image import Image
-from ..utils.detection_types import Pathlike
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
+from ..utils.types import PathLikeOrStr
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import DatasetCategories, DatasetInfo, get_merged_categories
 
@@ -136,14 +138,14 @@ class SplitDataFlow(DataFlowBaseBuilder):
     Dataflow builder for splitting datasets
     """
 
-    def __init__(self, train:
+    def __init__(self, train: list[Image], val: list[Image], test: Optional[list[Image]]):
        """
        :param train: Cached train split
        :param val: Cached val split
        :param test: Cached test split
        """
        super().__init__(location="")
-       self.split_cache:
+       self.split_cache: dict[str, list[Image]]
        if test is None:
            self.split_cache = {"train": train, "val": val}
        else:
@@ -213,8 +215,8 @@ class MergeDataset(DatasetBase):
        :param datasets: An arbitrary number of datasets
        """
        self.datasets = datasets
-       self.dataflows: Optional[
-       self.datapoint_list: Optional[
+       self.dataflows: Optional[tuple[DataFlow, ...]] = None
+       self.datapoint_list: Optional[list[Image]] = None
        super().__init__()
        self._dataset_info.type = datasets[0].dataset_info.type
        self._dataset_info.name = "merge_" + "_".join([dataset.dataset_info.name for dataset in self.datasets])
@@ -237,7 +239,7 @@ class MergeDataset(DatasetBase):
    def __init__(self, *dataflow_builders: DataFlowBaseBuilder):
        super().__init__("")
        self.dataflow_builders = dataflow_builders
-       self.dataflows: Optional[
+       self.dataflows: Optional[tuple[DataFlow, ...]] = None
 
    def build(self, **kwargs: Union[str, int]) -> DataFlow:
        """
@@ -325,7 +327,7 @@ class MergeDataset(DatasetBase):
        self._dataflow_builder = SplitDataFlow(train_dataset, val_dataset, test_dataset)
        self._dataflow_builder.categories = self._categories()
 
-    def get_ids_by_split(self) ->
+    def get_ids_by_split(self) -> dict[str, list[str]]:
        """
        To reproduce a dataset split at a later stage, get a summary of the by having a dict of list with split and
        the image ids contained in the split.
@@ -387,7 +389,7 @@ class CustomDataset(DatasetBase):
        self,
        name: str,
        dataset_type: TypeOrStr,
-       location:
+       location: PathLikeOrStr,
        init_categories: Sequence[ObjectTypes],
        dataflow_builder: Type[DataFlowBaseBuilder],
        init_sub_categories: Optional[Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]]] = None,
@@ -423,7 +425,7 @@ class CustomDataset(DatasetBase):
        """
 
        self.name = name
-       self.type = get_type(dataset_type)
+       self.type: DatasetType = get_type(dataset_type)  # type: ignore
        self.location = location
        self.init_categories = init_categories
        if init_sub_categories is None:
@@ -449,7 +451,7 @@ class CustomDataset(DatasetBase):
        return self.dataflow_builder
 
    @staticmethod
-    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) ->
+    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
        """
        This static method creates a CustomDataset instance from a dataset card.
 
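The built-in generics in the new annotations (`list[Image]`, `dict[str, list[str]]`, `tuple[DataFlow, ...]`) rely on the `from __future__ import annotations` line added at the top of the module, which keeps annotations as strings at runtime and therefore valid on older Python versions. A small self-contained illustration of the same style:

# Illustration of the annotation style adopted above (not deepdoctection code).
from __future__ import annotations

from typing import Optional


def split_ids(train: list[str], val: list[str], test: Optional[list[str]] = None) -> dict[str, list[str]]:
    # With the future import, list[str]/dict[str, ...] work as annotations even on Python 3.8.
    splits = {"train": train, "val": val}
    if test is not None:
        splits["test"] = test
    return splits


print(split_ids(["a"], ["b"]))  # {'train': ['a'], 'val': ['b']}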
deepdoctection/datasets/dataflow_builder.py
CHANGED

@@ -24,8 +24,8 @@ from pathlib import Path
 from typing import Mapping, Optional, Sequence, Union
 
 from ..dataflow import DataFlow
-from ..utils.detection_types import Pathlike
 from ..utils.fs import get_dataset_dir_path
+from ..utils.types import PathLikeOrStr
 from .info import DatasetCategories
 
 
@@ -44,7 +44,7 @@ class DataFlowBaseBuilder(ABC):
 
    def __init__(
        self,
-       location:
+       location: PathLikeOrStr,
        annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
    ):
        """
@@ -100,7 +100,7 @@ class DataFlowBaseBuilder(ABC):
 
        :return: local workdir
        """
-       return get_dataset_dir_path() / self.location
+       return Path(get_dataset_dir_path()) / self.location
 
    @abstractmethod
    def build(self, **kwargs: Union[str, int]) -> DataFlow:
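`DataFlowBaseBuilder` now takes `location: PathLikeOrStr`, and `get_workdir()` wraps the configured dataset directory in `Path(...)`, so path arithmetic keeps working even when the directory is configured as a plain string. A hedged sketch of a minimal custom builder against this interface; the folder layout and datapoint format are assumptions for illustration:

# Hedged sketch of a custom dataflow builder (illustrative, not from this diff).
from typing import Union

from deepdoctection.dataflow import CustomDataFromList, DataFlow
from deepdoctection.datasets.dataflow_builder import DataFlowBaseBuilder


class TinyImageFolderBuilder(DataFlowBaseBuilder):
    """Hypothetical builder that lists PNG files under the dataset workdir."""

    def build(self, **kwargs: Union[str, int]) -> DataFlow:
        # get_workdir() now always returns a Path, so "/" composition is safe here.
        image_dir = self.get_workdir() / "images"
        datapoints = [{"file_name": str(path)} for path in sorted(image_dir.glob("*.png"))]
        return CustomDataFromList(datapoints)


builder = TinyImageFolderBuilder(location="my_dataset")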
deepdoctection/datasets/info.py
CHANGED
@@ -22,34 +22,34 @@ Module for storing dataset info (e.g. general meta data or categories)
 from copy import copy
 from dataclasses import dataclass, field
 from itertools import chain
-from typing import Any,
+from typing import Any, Literal, Mapping, Optional, Sequence, Union, no_type_check, overload
 
-from ..utils.settings import
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from ..utils.utils import call_only_once
 
 __all__ = ["DatasetInfo", "DatasetCategories", "get_merged_categories"]
 
 
 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) ->
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) -> dict[ObjectTypes, int]:
    ...
 
 
 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) ->
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) -> dict[int, ObjectTypes]:
    ...
 
 
 @overload
 def _get_dict(
    l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = ...
-) -> Union[
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
    ...
 
 
 def _get_dict(
    l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = 1
-) -> Union[
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
    """
    Converts a list into a dict, where keys/values are the list indices.
 
@@ -59,8 +59,8 @@ def _get_dict(
    :return: A dictionary of list indices/list elements.
    """
    if name_as_key:
-       return {v:
-   return
+       return {v: k for k, v in enumerate(l, starts_with)}
+   return dict(enumerate(l, starts_with))
 
 
 @dataclass
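The rewritten `_get_dict` builds its two mappings directly from `enumerate(l, starts_with)`. A short worked example of what the two branches return:

# Worked example of the mapping logic above: enumerate(..., 1) yields
# (1, "table"), (2, "text"), (3, "title").
categories = ["table", "text", "title"]

id_to_name = dict(enumerate(categories, 1))
# {1: 'table', 2: 'text', 3: 'title'}

name_to_id = {v: k for k, v in enumerate(categories, 1)}
# {'table': 1, 'text': 2, 'title': 3}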
@@ -89,7 +89,7 @@ class DatasetInfo:
    license: str = field(default="")
    url: Union[str, Sequence[str]] = field(default="")
    splits: Mapping[str, str] = field(default_factory=dict)
-   type:
+   type: DatasetType = field(default=DatasetType.DEFAULT)
 
    def get_split(self, key: str) -> str:
        """
@@ -143,13 +143,13 @@ class DatasetCategories:
    @overload
    def get_categories(
        self, *, name_as_key: Literal[True], init: bool = ..., filtered: bool = ...
-   ) -> Mapping[ObjectTypes,
+   ) -> Mapping[ObjectTypes, int]:
        ...
 
    @overload
    def get_categories(
        self, *, name_as_key: Literal[False] = ..., init: bool = ..., filtered: bool = ...
-   ) -> Mapping[
+   ) -> Mapping[int, ObjectTypes]:
        ...
 
    @overload
@@ -161,12 +161,12 @@ class DatasetCategories:
    @overload
    def get_categories(
        self, as_dict: Literal[True] = ..., name_as_key: bool = False, init: bool = False, filtered: bool = False
-   ) -> Union[Mapping[ObjectTypes,
+   ) -> Union[Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
        ...
 
    def get_categories(
        self, as_dict: bool = True, name_as_key: bool = False, init: bool = False, filtered: bool = False
-   ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes,
+   ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
        """
        Get categories of a dataset. The returned value also respects modifications of the inventory like filtered
        categories of replaced categories with sub categories. However, you must correctly pass arguments to return the
@@ -229,7 +229,7 @@ class DatasetCategories:
        if sub_categories is None:
            sub_categories = {}
 
-       sub_cat:
+       sub_cat: dict[ObjectTypes, Union[ObjectTypes, list[ObjectTypes]]] = {}
        for cat in _categories:
            assert cat in self.get_categories(  # pylint: disable=E1135
                as_dict=False, filtered=True
@@ -254,9 +254,9 @@ class DatasetCategories:
        for category, value in sub_cat.items():
            if category not in sub_categories:
                continue
-           sub_cat_tmp:
+           sub_cat_tmp: dict[str, Union[dict[int, ObjectTypes], dict[ObjectTypes, int], Sequence[str]]] = {}
            sub_categories_list: Union[
-               ObjectTypes, str,
+               ObjectTypes, str, list[Sequence[Union[ObjectTypes, str]]], Sequence[Union[ObjectTypes, str]]
            ]
            if isinstance(sub_categories[category], ObjectTypes):
                sub_categories_list = [sub_categories[category]]
@@ -267,14 +267,12 @@ class DatasetCategories:
                    continue
                if values_as_dict:
                    if not name_as_key:
-                       sub_cat_tmp[sub_cat_key] =
-
-
-                       }
+                       sub_cat_tmp[sub_cat_key] = dict(
+                           enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                       )
                    else:
                        sub_cat_tmp[sub_cat_key] = {
-                           v:
-                           for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                           v: k for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
                        }
                else:
                    sub_cat_tmp[sub_cat_key] = self.init_sub_categories[category][get_type(sub_cat_key)]
@@ -284,7 +282,7 @@ class DatasetCategories:
        return sub_cat
 
    @call_only_once
-   def set_cat_to_sub_cat(self, cat_to_sub_cat:
+   def set_cat_to_sub_cat(self, cat_to_sub_cat: dict[TypeOrStr, TypeOrStr]) -> None:
        """
        Change category representation if sub-categories are available. Pass a dictionary of the main category
        and the requested sub-category. This will change the dictionary of categories and the category names
@@ -323,7 +321,7 @@ class DatasetCategories:
        self._categories_update = _categories_update_list
 
    @call_only_once
-   def filter_categories(self, categories: Union[TypeOrStr,
+   def filter_categories(self, categories: Union[TypeOrStr, list[TypeOrStr]]) -> None:
        """
        Filter categories of a dataset. This will keep all the categories chosen and remove all others.
        This method can only be called once per object.
@@ -415,7 +413,7 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
    # form a set of possible sub category values. To get a list of all values from all dataset, take the union
    intersect_init_sub_cat_values = {}
    for sub_cat_key in intersect_sub_cat_per_key:
-       val:
+       val: set[ObjectTypes] = set()
        for cat in categories:
            val.update(cat.init_sub_categories[key][sub_cat_key])
        intersect_init_sub_cat_values[sub_cat_key] = list(val)
@@ -425,7 +423,7 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
    # construction is not deterministic but guarantees for unique values in all sub categories. Now we build the
    # ensemble dict of sub categories where we guarantee unique values on one hand side and always maintain the
    # same arrangements for all category/ sub category lists
-   init_sub_cat:
+   init_sub_cat: dict[ObjectTypes, Any] = {}
    for category in categories:
        for cat in intersect_sub_cat_keys:
            for sub_cat_key in category.init_sub_categories[cat]:
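With the sharpened return types, `get_categories` yields either an `int -> ObjectTypes` or an `ObjectTypes -> int` mapping depending on `name_as_key`. A hedged sketch of how the typed `DatasetCategories` calls read in practice; the registry lookup and the concrete category names are assumptions borrowed from the DocLayNet section below:

# Hedged sketch of the DatasetCategories API typed above.
from deepdoctection.datasets import get_dataset

doclaynet = get_dataset("doclaynet")          # assumed registry name
cats = doclaynet.dataflow.categories

cats.filter_categories(["table", "text", "title"])                  # Union[TypeOrStr, list[TypeOrStr]]
name_to_id = cats.get_categories(name_as_key=True, filtered=True)   # Mapping[ObjectTypes, int]
id_to_name = cats.get_categories(filtered=True)                     # Mapping[int, ObjectTypes]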
deepdoctection/datasets/instances/doclaynet.py
CHANGED

@@ -25,19 +25,20 @@ Module for DocLayNet dataset. Place the dataset as follows
 ├── PNG
 │ ├── 0a0d43e301facee9e99cc33b9b16e732dd207135f4027e75f6aea2bf117535a2.png
 """
+from __future__ import annotations
 
 import os
 from typing import Mapping, Sequence, Union
 
 from ...dataflow import DataFlow, MapData, MapDataComponent, SerializerCoco
-from ...datapoint.annotation import CategoryAnnotation
+from ...datapoint.annotation import CategoryAnnotation
 from ...datapoint.image import Image
 from ...mapper.cats import add_summary, cat_to_sub_cat, filter_cat, filter_summary
 from ...mapper.cocostruct import coco_to_image
 from ...mapper.maputils import curry
-from ...utils.detection_types import JsonDict
 from ...utils.fs import load_image_from_file
-from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, TypeOrStr
+from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, SummaryType, TypeOrStr
+from ...utils.types import CocoDatapointDict
 from ..base import DatasetBase
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories, DatasetInfo
@@ -63,36 +64,36 @@ _DESCRIPTION = (
 _LICENSE = "CDLA-Permissive"
 _URL = "https://codait-cos-dax.s3.us.cloud-object-storage.appdomain.cloud/dax-doclaynet/1.0.0/DocLayNet_core.zip"
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.
+_TYPE = DatasetType.OBJECT_DETECTION
 
 _LOCATION = "DocLayNet_core"
 
 _ANNOTATION_FILES: Mapping[str, str] = {"train": "COCO/train.json", "val": "COCO/val.json", "test": "COCO/test.json"}
 _INIT_CATEGORIES = [
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
+    LayoutType.CAPTION,
+    LayoutType.FOOTNOTE,
+    LayoutType.FORMULA,
+    LayoutType.LIST,
+    LayoutType.PAGE_FOOTER,
+    LayoutType.PAGE_HEADER,
+    LayoutType.FIGURE,
+    LayoutType.SECTION_HEADER,
+    LayoutType.TABLE,
+    LayoutType.TEXT,
+    LayoutType.TITLE,
 ]
 _SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]] = {
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
+    LayoutType.CAPTION: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FOOTNOTE: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FORMULA: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.LIST: {DatasetType.PUBLAYNET: [LayoutType.LIST]},
+    LayoutType.PAGE_FOOTER: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.PAGE_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.FIGURE: {DatasetType.PUBLAYNET: [LayoutType.FIGURE]},
+    LayoutType.SECTION_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.TABLE: {DatasetType.PUBLAYNET: [LayoutType.TABLE]},
+    LayoutType.TEXT: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.TITLE: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
 }
 
 
@@ -109,7 +110,7 @@ class DocLayNet(DatasetBase):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)
 
-   def _builder(self) ->
+   def _builder(self) -> DocLayNetBuilder:
        return DocLayNetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -161,7 +162,7 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
                    filter_empty_image=True,
                    fake_score=fake_score,
                    coarse_mapping={1: 10, 2: 10, 3: 10, 4: 4, 5: 10, 6: 11, 7: 7, 8: 11, 9: 9, 10: 10, 11: 11},
-                   coarse_sub_cat_name=DatasetType.
+                   coarse_sub_cat_name=DatasetType.PUBLAYNET,
                ),
            )
 
@@ -185,14 +186,14 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
 
 
 _NAME_SEQ = "doclaynet-seq"
-_TYPE_SEQ = DatasetType.
+_TYPE_SEQ = DatasetType.SEQUENCE_CLASSIFICATION
 _INIT_CATEGORIES_SEQ = [
-    DocumentType.
-    DocumentType.
-    DocumentType.
-    DocumentType.
-    DocumentType.
-    DocumentType.
+    DocumentType.FINANCIAL_REPORT,
+    DocumentType.SCIENTIFIC_PUBLICATION,
+    DocumentType.LAWS_AND_REGULATIONS,
+    DocumentType.GOVERNMENT_TENDERS,
+    DocumentType.MANUALS,
+    DocumentType.PATENTS,
 ]
 
 
@@ -209,7 +210,7 @@ class DocLayNetSeq(DatasetBase):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES_SEQ)
 
-   def _builder(self) ->
+   def _builder(self) -> DocLayNetSeqBuilder:
        return DocLayNetSeqBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
 
 
@@ -244,22 +245,22 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
        df = MapDataComponent(df, lambda dp: self.get_workdir() / "PNG" / dp, "file_name")
 
        @curry
-       def _map_to_image(dp:
+       def _map_to_image(dp: CocoDatapointDict, load_img: bool) -> Image:
            image = Image(location=dp["file_name"], file_name=os.path.split(dp["file_name"])[1])
            image.image = load_image_from_file(image.location)
-           summary =
+           summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)
            label_to_category_name = {
-               "financial_reports": DocumentType.
-               "scientific_articles": DocumentType.
-               "laws_and_regulations": DocumentType.
-               "government_tenders": DocumentType.
-               "manuals": DocumentType.
-               "patents": DocumentType.
+               "financial_reports": DocumentType.FINANCIAL_REPORT,
+               "scientific_articles": DocumentType.SCIENTIFIC_PUBLICATION,
+               "laws_and_regulations": DocumentType.LAWS_AND_REGULATIONS,
+               "government_tenders": DocumentType.GOVERNMENT_TENDERS,
+               "manuals": DocumentType.MANUALS,
+               "patents": DocumentType.PATENTS,
            }
            categories_dict = self.categories.get_categories(init=True, name_as_key=True)
            category_name = label_to_category_name[dp["doc_category"]]
            summary.dump_sub_category(
-               PageType.
+               PageType.DOCUMENT_TYPE,
               CategoryAnnotation(category_name=category_name, category_id=categories_dict[category_name]),
           )
           image.summary = summary
@@ -273,15 +274,14 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
        if self.categories.is_filtered():
            df = MapData(
                df,
-               filter_summary({PageType.
+               filter_summary({PageType.DOCUMENT_TYPE: self.categories.get_categories(as_dict=False, filtered=True)}),
            )
 
        @curry
-       def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr,
-           if dp.summary:
-
-
-               summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
+       def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, int]) -> Image:
+           if PageType.DOCUMENT_TYPE in dp.summary.sub_categories:
+               summary_cat = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
+               summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
            return dp
 
        df = MapData(df, _re_map_cat_ids(self.categories.get_categories(filtered=True, name_as_key=True)))
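`DocLayNetSeqBuilder` stores the document class as a `PageType.DOCUMENT_TYPE` sub-category of the page summary, which is exactly what `DatasetAdapter` reads back for sequence-classification statistics. A hedged sketch that streams the `doclaynet-seq` dataflow and reads that sub-category; the build kwargs (`split`, `load_image`) are assumptions:

# Hedged sketch: read the document type written into the summary above.
from deepdoctection.datasets import get_dataset
from deepdoctection.utils.settings import PageType

doclaynet_seq = get_dataset("doclaynet-seq")
df = doclaynet_seq.dataflow.build(split="val", load_image=True)  # build kwargs assumed
df.reset_state()

for dp in df:
    doc_type = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
    print(dp.file_name, doc_type.category_name, doc_type.category_id)
    break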