deepdoctection-0.32-py3-none-any.whl → deepdoctection-0.34-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +8 -25
- deepdoctection/analyzer/dd.py +84 -71
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +78 -56
- deepdoctection/datapoint/box.py +7 -7
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +157 -75
- deepdoctection/datapoint/view.py +175 -151
- deepdoctection/datasets/adapter.py +30 -24
- deepdoctection/datasets/base.py +10 -10
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +23 -25
- deepdoctection/datasets/instances/doclaynet.py +48 -49
- deepdoctection/datasets/instances/fintabnet.py +44 -45
- deepdoctection/datasets/instances/funsd.py +23 -23
- deepdoctection/datasets/instances/iiitar13k.py +8 -8
- deepdoctection/datasets/instances/layouttest.py +2 -2
- deepdoctection/datasets/instances/publaynet.py +3 -3
- deepdoctection/datasets/instances/pubtables1m.py +18 -18
- deepdoctection/datasets/instances/pubtabnet.py +30 -29
- deepdoctection/datasets/instances/rvlcdip.py +28 -29
- deepdoctection/datasets/instances/xfund.py +51 -30
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +13 -12
- deepdoctection/eval/eval.py +32 -26
- deepdoctection/eval/tedsmetric.py +16 -12
- deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +69 -89
- deepdoctection/extern/deskew.py +11 -10
- deepdoctection/extern/doctrocr.py +81 -64
- deepdoctection/extern/fastlang.py +23 -16
- deepdoctection/extern/hfdetr.py +53 -38
- deepdoctection/extern/hflayoutlm.py +216 -155
- deepdoctection/extern/hflm.py +35 -30
- deepdoctection/extern/model.py +433 -255
- deepdoctection/extern/pdftext.py +15 -15
- deepdoctection/extern/pt/ptutils.py +4 -2
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +14 -16
- deepdoctection/extern/tp/tfutils.py +16 -2
- deepdoctection/extern/tp/tpcompat.py +11 -7
- deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- deepdoctection/extern/tpdetect.py +40 -45
- deepdoctection/mapper/cats.py +36 -40
- deepdoctection/mapper/cocostruct.py +16 -12
- deepdoctection/mapper/d2struct.py +22 -22
- deepdoctection/mapper/hfstruct.py +7 -7
- deepdoctection/mapper/laylmstruct.py +22 -24
- deepdoctection/mapper/maputils.py +9 -10
- deepdoctection/mapper/match.py +33 -2
- deepdoctection/mapper/misc.py +6 -7
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +6 -6
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/anngen.py +39 -14
- deepdoctection/pipe/base.py +68 -99
- deepdoctection/pipe/common.py +181 -85
- deepdoctection/pipe/concurrency.py +14 -10
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +18 -16
- deepdoctection/pipe/lm.py +49 -47
- deepdoctection/pipe/order.py +63 -65
- deepdoctection/pipe/refine.py +102 -109
- deepdoctection/pipe/segment.py +157 -162
- deepdoctection/pipe/sub_layout.py +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/d2_frcnn_train.py +27 -25
- deepdoctection/train/hf_detr_train.py +22 -18
- deepdoctection/train/hf_layoutlm_train.py +49 -48
- deepdoctection/train/tp_frcnn_train.py +10 -11
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +52 -14
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +41 -14
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +15 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/pdf_utils.py +39 -14
- deepdoctection/utils/settings.py +188 -182
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
- deepdoctection-0.34.dist-info/RECORD +146 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.32.dist-info/RECORD +0 -146
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
@@ -24,14 +24,14 @@ from typing import Any, Callable, Iterator, Mapping, Optional, Union
 
 from lazy_imports import try_import
 
-from ..dataflow import
+from ..dataflow import CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
-from ..utils.detection_types import DP, JsonDict
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
+from ..utils.types import DP, JsonDict
 from .registry import get_dataset
 
 with try_import() as import_guard:

@@ -57,6 +57,7 @@ class DatasetAdapter(IterableDataset):  # type: ignore
         cache_dataset: bool,
         image_to_framework_func: Optional[Callable[[DP], Optional[JsonDict]]] = None,
         use_token_tag: bool = True,
+        number_repetitions: int = -1,
         **build_kwargs: str,
     ) -> None:
         """

@@ -69,6 +70,12 @@ class DatasetAdapter(IterableDataset):  # type: ignore
         `WordType.token_class`.
         :param build_kwargs: optional parameters for defining the dataflow.
         """
+        if number_repetitions == -1 and not cache_dataset:
+            raise ValueError(
+                "Number of repetitions cannot be infinite when not caching the dataset. Instead try to"
+                " set a high number of repetitions"
+            )
+
         if isinstance(name_or_dataset, str):
             self.dataset = get_dataset(name_or_dataset)
         else:

@@ -78,22 +85,22 @@ class DatasetAdapter(IterableDataset):  # type: ignore
 
         if cache_dataset:
             logger.info(LoggingRecord("Yielding dataflow into memory and create torch dataset"))
-            categories: Mapping[
+            categories: Mapping[int, ObjectTypes] = {}
             _data_statistics = True
-            if self.dataset.dataset_info.type in (DatasetType.
+            if self.dataset.dataset_info.type in (DatasetType.OBJECT_DETECTION, DatasetType.SEQUENCE_CLASSIFICATION):
                 categories = self.dataset.dataflow.categories.get_categories(filtered=True)
-            elif self.dataset.dataset_info.type in (DatasetType.
+            elif self.dataset.dataset_info.type in (DatasetType.TOKEN_CLASSIFICATION,):
                 if use_token_tag:
                     categories = self.dataset.dataflow.categories.get_sub_categories(
-                        categories=LayoutType.
-                        sub_categories={LayoutType.
+                        categories=LayoutType.WORD,
+                        sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG]},
                         keys=False,
                         values_as_dict=True,
-                    )[LayoutType.
+                    )[LayoutType.WORD][WordType.TOKEN_TAG]
                 else:
                     categories = self.dataset.dataflow.categories.get_sub_categories(
-                        categories=LayoutType.
-                    )[LayoutType.
+                        categories=LayoutType.WORD, sub_categories={LayoutType.WORD: [WordType.TOKEN_CLASS]}, keys=False
+                    )[LayoutType.WORD][WordType.TOKEN_CLASS]
             else:
                 logger.info(
                     LoggingRecord(f"dataset is of type {self.dataset.dataset_info.type}. Cannot generate statistics.")

@@ -121,19 +128,19 @@ class DatasetAdapter(IterableDataset):  # type: ignore
                     "images when needed and reduce memory costs!!!",
                     "warn",
                 )
-                if self.dataset.dataset_info.type == DatasetType.
+                if self.dataset.dataset_info.type == DatasetType.OBJECT_DETECTION:
                     anns = dp.get_annotation()
-                    cat_ids = [
+                    cat_ids = [ann.category_id for ann in anns]
 
-                elif self.dataset.dataset_info.type == DatasetType.
-                    cat_ids = dp.summary.get_sub_category(PageType.
+                elif self.dataset.dataset_info.type == DatasetType.SEQUENCE_CLASSIFICATION:
+                    cat_ids = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id
 
-                elif self.dataset.dataset_info.type == DatasetType.
-                    anns = dp.get_annotation(category_names=LayoutType.
+                elif self.dataset.dataset_info.type == DatasetType.TOKEN_CLASSIFICATION:
+                    anns = dp.get_annotation(category_names=LayoutType.WORD)
                     if use_token_tag:
-                        cat_ids = [ann.get_sub_category(WordType.
+                        cat_ids = [ann.get_sub_category(WordType.TOKEN_TAG).category_id for ann in anns]
                     else:
-                        cat_ids = [ann.get_sub_category(WordType.
+                        cat_ids = [ann.get_sub_category(WordType.TOKEN_CLASS).category_id for ann in anns]
 
                 if _data_statistics:
                     summarizer.dump(cat_ids)

@@ -144,14 +151,13 @@ class DatasetAdapter(IterableDataset):  # type: ignore
             if _data_statistics:
                 summarizer.print_summary_histogram()
             self.number_datapoints = len(datapoints)
+            if not self.number_datapoints:
+                raise ValueError("DatasetAdapter receives no datapoints. Please check your dataflow build config.")
 
             df = CustomDataFromList(datapoints, shuffle=True)
-
-
-
-            df_list = CacheData(df).get_cache()
-            df = CustomDataFromList(df_list, shuffle=True)
-            df = RepeatedData(df, -1)
+            df = RepeatedData(df, number_repetitions)
+        else:
+            df = RepeatedData(df, number_repetitions)
 
         if image_to_framework_func:
             df = MapData(df, image_to_framework_func)
deepdoctection/datasets/base.py
CHANGED

@@ -27,15 +27,15 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from inspect import signature
 from pathlib import Path
-from typing import Any,
+from typing import Any, Mapping, Optional, Sequence, Type, Union
 
 import numpy as np
 
 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
 from ..datapoint.image import Image
-from ..utils.detection_types import Pathlike
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
+from ..utils.types import PathLikeOrStr
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import DatasetCategories, DatasetInfo, get_merged_categories
 

@@ -138,14 +138,14 @@ class SplitDataFlow(DataFlowBaseBuilder):
     Dataflow builder for splitting datasets
     """
 
-    def __init__(self, train:
+    def __init__(self, train: list[Image], val: list[Image], test: Optional[list[Image]]):
         """
         :param train: Cached train split
         :param val: Cached val split
         :param test: Cached test split
         """
         super().__init__(location="")
-        self.split_cache:
+        self.split_cache: dict[str, list[Image]]
         if test is None:
             self.split_cache = {"train": train, "val": val}
         else:

@@ -215,8 +215,8 @@ class MergeDataset(DatasetBase):
         :param datasets: An arbitrary number of datasets
         """
         self.datasets = datasets
-        self.dataflows: Optional[
-        self.datapoint_list: Optional[
+        self.dataflows: Optional[tuple[DataFlow, ...]] = None
+        self.datapoint_list: Optional[list[Image]] = None
         super().__init__()
         self._dataset_info.type = datasets[0].dataset_info.type
         self._dataset_info.name = "merge_" + "_".join([dataset.dataset_info.name for dataset in self.datasets])

@@ -239,7 +239,7 @@ class MergeDataset(DatasetBase):
     def __init__(self, *dataflow_builders: DataFlowBaseBuilder):
         super().__init__("")
         self.dataflow_builders = dataflow_builders
-        self.dataflows: Optional[
+        self.dataflows: Optional[tuple[DataFlow, ...]] = None
 
     def build(self, **kwargs: Union[str, int]) -> DataFlow:
         """

@@ -327,7 +327,7 @@ class MergeDataset(DatasetBase):
         self._dataflow_builder = SplitDataFlow(train_dataset, val_dataset, test_dataset)
         self._dataflow_builder.categories = self._categories()
 
-    def get_ids_by_split(self) ->
+    def get_ids_by_split(self) -> dict[str, list[str]]:
         """
         To reproduce a dataset split at a later stage, get a summary of the by having a dict of list with split and
         the image ids contained in the split.

@@ -389,7 +389,7 @@ class CustomDataset(DatasetBase):
         self,
         name: str,
         dataset_type: TypeOrStr,
-        location:
+        location: PathLikeOrStr,
         init_categories: Sequence[ObjectTypes],
         dataflow_builder: Type[DataFlowBaseBuilder],
         init_sub_categories: Optional[Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]]] = None,

@@ -451,7 +451,7 @@ class CustomDataset(DatasetBase):
         return self.dataflow_builder
 
     @staticmethod
-    def from_dataset_card(file_path:
+    def from_dataset_card(file_path: PathLikeOrStr, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
         """
         This static method creates a CustomDataset instance from a dataset card.
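Both `base.py` above and the `dataflow_builder.py` hunks below swap the old `Pathlike` alias for `PathLikeOrStr` from the new `utils.types` module. That module is not part of the hunks shown here, so the following is only an assumed, conventional shape of such an alias, together with the normalisation pattern the `Path(get_dataset_dir_path())` change below suggests:

# Assumption: deepdoctection/utils/types.py is not shown in this diff; this is the
# conventional form a str-or-PathLike alias usually takes.
import os
from pathlib import Path
from typing import Union

PathLikeOrStr = Union[str, os.PathLike]

def as_path(location: PathLikeOrStr) -> Path:
    # Illustrative helper: normalising early keeps downstream path arithmetic simple.
    return Path(location)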
deepdoctection/datasets/dataflow_builder.py
CHANGED

@@ -24,8 +24,8 @@ from pathlib import Path
 from typing import Mapping, Optional, Sequence, Union
 
 from ..dataflow import DataFlow
-from ..utils.detection_types import Pathlike
 from ..utils.fs import get_dataset_dir_path
+from ..utils.types import PathLikeOrStr
 from .info import DatasetCategories
 
 

@@ -44,7 +44,7 @@ class DataFlowBaseBuilder(ABC):
 
     def __init__(
         self,
-        location:
+        location: PathLikeOrStr,
         annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
     ):
         """

@@ -100,7 +100,7 @@ class DataFlowBaseBuilder(ABC):
 
         :return: local workdir
         """
-        return get_dataset_dir_path() / self.location
+        return Path(get_dataset_dir_path()) / self.location
 
     @abstractmethod
     def build(self, **kwargs: Union[str, int]) -> DataFlow:
deepdoctection/datasets/info.py
CHANGED

@@ -22,7 +22,7 @@ Module for storing dataset info (e.g. general meta data or categories)
 from copy import copy
 from dataclasses import dataclass, field
 from itertools import chain
-from typing import Any,
+from typing import Any, Literal, Mapping, Optional, Sequence, Union, no_type_check, overload
 
 from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from ..utils.utils import call_only_once

@@ -31,25 +31,25 @@ __all__ = ["DatasetInfo", "DatasetCategories", "get_merged_categories"]
 
 
 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) ->
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[True], starts_with: int = ...) -> dict[ObjectTypes, int]:
     ...
 
 
 @overload
-def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) ->
+def _get_dict(l: Sequence[ObjectTypes], name_as_key: Literal[False], starts_with: int = ...) -> dict[int, ObjectTypes]:
     ...
 
 
 @overload
 def _get_dict(
     l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = ...
-) -> Union[
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
     ...
 
 
 def _get_dict(
     l: Sequence[ObjectTypes], name_as_key: bool, starts_with: int = 1
-) -> Union[
+) -> Union[dict[ObjectTypes, int], dict[int, ObjectTypes]]:
     """
     Converts a list into a dict, where keys/values are the list indices.
 

@@ -59,8 +59,8 @@ def _get_dict(
     :return: A dictionary of list indices/list elements.
     """
     if name_as_key:
-        return {v:
-    return
+        return {v: k for k, v in enumerate(l, starts_with)}
+    return dict(enumerate(l, starts_with))
 
 
 @dataclass

@@ -89,7 +89,7 @@ class DatasetInfo:
     license: str = field(default="")
     url: Union[str, Sequence[str]] = field(default="")
     splits: Mapping[str, str] = field(default_factory=dict)
-    type: DatasetType = field(default=DatasetType.
+    type: DatasetType = field(default=DatasetType.DEFAULT)
 
     def get_split(self, key: str) -> str:
         """

@@ -143,13 +143,13 @@ class DatasetCategories:
     @overload
     def get_categories(
         self, *, name_as_key: Literal[True], init: bool = ..., filtered: bool = ...
-    ) -> Mapping[ObjectTypes,
+    ) -> Mapping[ObjectTypes, int]:
         ...
 
     @overload
     def get_categories(
         self, *, name_as_key: Literal[False] = ..., init: bool = ..., filtered: bool = ...
-    ) -> Mapping[
+    ) -> Mapping[int, ObjectTypes]:
         ...
 
     @overload

@@ -161,12 +161,12 @@ class DatasetCategories:
     @overload
     def get_categories(
         self, as_dict: Literal[True] = ..., name_as_key: bool = False, init: bool = False, filtered: bool = False
-    ) -> Union[Mapping[ObjectTypes,
+    ) -> Union[Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
         ...
 
     def get_categories(
         self, as_dict: bool = True, name_as_key: bool = False, init: bool = False, filtered: bool = False
-    ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes,
+    ) -> Union[Sequence[ObjectTypes], Mapping[ObjectTypes, int], Mapping[int, ObjectTypes]]:
         """
         Get categories of a dataset. The returned value also respects modifications of the inventory like filtered
         categories of replaced categories with sub categories. However, you must correctly pass arguments to return the

@@ -229,7 +229,7 @@ class DatasetCategories:
         if sub_categories is None:
             sub_categories = {}
 
-        sub_cat:
+        sub_cat: dict[ObjectTypes, Union[ObjectTypes, list[ObjectTypes]]] = {}
         for cat in _categories:
             assert cat in self.get_categories(  # pylint: disable=E1135
                 as_dict=False, filtered=True

@@ -254,9 +254,9 @@ class DatasetCategories:
         for category, value in sub_cat.items():
             if category not in sub_categories:
                 continue
-            sub_cat_tmp:
+            sub_cat_tmp: dict[str, Union[dict[int, ObjectTypes], dict[ObjectTypes, int], Sequence[str]]] = {}
             sub_categories_list: Union[
-                ObjectTypes, str,
+                ObjectTypes, str, list[Sequence[Union[ObjectTypes, str]]], Sequence[Union[ObjectTypes, str]]
             ]
             if isinstance(sub_categories[category], ObjectTypes):
                 sub_categories_list = [sub_categories[category]]

@@ -267,14 +267,12 @@ class DatasetCategories:
                     continue
                 if values_as_dict:
                     if not name_as_key:
-                        sub_cat_tmp[sub_cat_key] =
-
-
-                        }
+                        sub_cat_tmp[sub_cat_key] = dict(
+                            enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                        )
                     else:
                         sub_cat_tmp[sub_cat_key] = {
-                            v:
-                            for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
+                            v: k for k, v in enumerate(self.init_sub_categories[category][get_type(sub_cat_key)], 1)
                         }
                 else:
                     sub_cat_tmp[sub_cat_key] = self.init_sub_categories[category][get_type(sub_cat_key)]

@@ -284,7 +282,7 @@ class DatasetCategories:
         return sub_cat
 
     @call_only_once
-    def set_cat_to_sub_cat(self, cat_to_sub_cat:
+    def set_cat_to_sub_cat(self, cat_to_sub_cat: dict[TypeOrStr, TypeOrStr]) -> None:
         """
         Change category representation if sub-categories are available. Pass a dictionary of the main category
         and the requested sub-category. This will change the dictionary of categories and the category names

@@ -323,7 +321,7 @@ class DatasetCategories:
         self._categories_update = _categories_update_list
 
     @call_only_once
-    def filter_categories(self, categories: Union[TypeOrStr,
+    def filter_categories(self, categories: Union[TypeOrStr, list[TypeOrStr]]) -> None:
         """
         Filter categories of a dataset. This will keep all the categories chosen and remove all others.
         This method can only be called once per object.

@@ -415,7 +413,7 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
     # form a set of possible sub category values. To get a list of all values from all dataset, take the union
     intersect_init_sub_cat_values = {}
     for sub_cat_key in intersect_sub_cat_per_key:
-        val:
+        val: set[ObjectTypes] = set()
         for cat in categories:
             val.update(cat.init_sub_categories[key][sub_cat_key])
         intersect_init_sub_cat_values[sub_cat_key] = list(val)

@@ -425,7 +423,7 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
     # construction is not deterministic but guarantees for unique values in all sub categories. Now we build the
     # ensemble dict of sub categories where we guarantee unique values on one hand side and always maintain the
     # same arrangements for all category/ sub category lists
-    init_sub_cat:
+    init_sub_cat: dict[ObjectTypes, Any] = {}
     for category in categories:
         for cat in intersect_sub_cat_keys:
             for sub_cat_key in category.init_sub_categories[cat]:
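The rewritten body of `_get_dict` in the `info.py` hunk above is just the two standard `enumerate` idioms. A standalone illustration of what each branch returns, with plain strings standing in for `ObjectTypes` members:

# Standalone illustration of _get_dict's two branches; plain strings stand in
# for ObjectTypes members.
from typing import Sequence

def _get_dict(l: Sequence[str], name_as_key: bool, starts_with: int = 1) -> dict:
    if name_as_key:
        return {v: k for k, v in enumerate(l, starts_with)}  # name -> id
    return dict(enumerate(l, starts_with))                   # id -> name

categories = ["table", "text", "title"]
assert _get_dict(categories, name_as_key=True) == {"table": 1, "text": 2, "title": 3}
assert _get_dict(categories, name_as_key=False) == {1: "table", 2: "text", 3: "title"}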
deepdoctection/datasets/instances/doclaynet.py
CHANGED

@@ -31,14 +31,14 @@ import os
 from typing import Mapping, Sequence, Union
 
 from ...dataflow import DataFlow, MapData, MapDataComponent, SerializerCoco
-from ...datapoint.annotation import CategoryAnnotation
+from ...datapoint.annotation import CategoryAnnotation
 from ...datapoint.image import Image
 from ...mapper.cats import add_summary, cat_to_sub_cat, filter_cat, filter_summary
 from ...mapper.cocostruct import coco_to_image
 from ...mapper.maputils import curry
-from ...utils.detection_types import JsonDict
 from ...utils.fs import load_image_from_file
-from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, TypeOrStr
+from ...utils.settings import DatasetType, DocumentType, LayoutType, ObjectTypes, PageType, SummaryType, TypeOrStr
+from ...utils.types import CocoDatapointDict
 from ..base import DatasetBase
 from ..dataflow_builder import DataFlowBaseBuilder
 from ..info import DatasetCategories, DatasetInfo

@@ -64,36 +64,36 @@ _DESCRIPTION = (
 _LICENSE = "CDLA-Permissive"
 _URL = "https://codait-cos-dax.s3.us.cloud-object-storage.appdomain.cloud/dax-doclaynet/1.0.0/DocLayNet_core.zip"
 _SPLITS: Mapping[str, str] = {"train": "train", "val": "val", "test": "test"}
-_TYPE = DatasetType.
+_TYPE = DatasetType.OBJECT_DETECTION
 
 _LOCATION = "DocLayNet_core"
 
 _ANNOTATION_FILES: Mapping[str, str] = {"train": "COCO/train.json", "val": "COCO/val.json", "test": "COCO/test.json"}
 _INIT_CATEGORIES = [
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
+    LayoutType.CAPTION,
+    LayoutType.FOOTNOTE,
+    LayoutType.FORMULA,
+    LayoutType.LIST,
+    LayoutType.PAGE_FOOTER,
+    LayoutType.PAGE_HEADER,
+    LayoutType.FIGURE,
+    LayoutType.SECTION_HEADER,
+    LayoutType.TABLE,
+    LayoutType.TEXT,
+    LayoutType.TITLE,
 ]
 _SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]]] = {
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
-    LayoutType.
+    LayoutType.CAPTION: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FOOTNOTE: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.FORMULA: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.LIST: {DatasetType.PUBLAYNET: [LayoutType.LIST]},
+    LayoutType.PAGE_FOOTER: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.PAGE_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.FIGURE: {DatasetType.PUBLAYNET: [LayoutType.FIGURE]},
+    LayoutType.SECTION_HEADER: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
+    LayoutType.TABLE: {DatasetType.PUBLAYNET: [LayoutType.TABLE]},
+    LayoutType.TEXT: {DatasetType.PUBLAYNET: [LayoutType.TEXT]},
+    LayoutType.TITLE: {DatasetType.PUBLAYNET: [LayoutType.TITLE]},
 }
 
 

@@ -162,7 +162,7 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
                 filter_empty_image=True,
                 fake_score=fake_score,
                 coarse_mapping={1: 10, 2: 10, 3: 10, 4: 4, 5: 10, 6: 11, 7: 7, 8: 11, 9: 9, 10: 10, 11: 11},
-                coarse_sub_cat_name=DatasetType.
+                coarse_sub_cat_name=DatasetType.PUBLAYNET,
             ),
         )
 

@@ -186,14 +186,14 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
 
 
 _NAME_SEQ = "doclaynet-seq"
-_TYPE_SEQ = DatasetType.
+_TYPE_SEQ = DatasetType.SEQUENCE_CLASSIFICATION
 _INIT_CATEGORIES_SEQ = [
-    DocumentType.
-    DocumentType.
-    DocumentType.
-    DocumentType.
-    DocumentType.
-    DocumentType.
+    DocumentType.FINANCIAL_REPORT,
+    DocumentType.SCIENTIFIC_PUBLICATION,
+    DocumentType.LAWS_AND_REGULATIONS,
+    DocumentType.GOVERNMENT_TENDERS,
+    DocumentType.MANUALS,
+    DocumentType.PATENTS,
 ]
 
 

@@ -245,22 +245,22 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
         df = MapDataComponent(df, lambda dp: self.get_workdir() / "PNG" / dp, "file_name")
 
         @curry
-        def _map_to_image(dp:
+        def _map_to_image(dp: CocoDatapointDict, load_img: bool) -> Image:
             image = Image(location=dp["file_name"], file_name=os.path.split(dp["file_name"])[1])
             image.image = load_image_from_file(image.location)
-            summary =
+            summary = CategoryAnnotation(category_name=SummaryType.SUMMARY)
             label_to_category_name = {
-                "financial_reports": DocumentType.
-                "scientific_articles": DocumentType.
-                "laws_and_regulations": DocumentType.
-                "government_tenders": DocumentType.
-                "manuals": DocumentType.
-                "patents": DocumentType.
+                "financial_reports": DocumentType.FINANCIAL_REPORT,
+                "scientific_articles": DocumentType.SCIENTIFIC_PUBLICATION,
+                "laws_and_regulations": DocumentType.LAWS_AND_REGULATIONS,
+                "government_tenders": DocumentType.GOVERNMENT_TENDERS,
+                "manuals": DocumentType.MANUALS,
+                "patents": DocumentType.PATENTS,
             }
             categories_dict = self.categories.get_categories(init=True, name_as_key=True)
             category_name = label_to_category_name[dp["doc_category"]]
             summary.dump_sub_category(
-                PageType.
+                PageType.DOCUMENT_TYPE,
                 CategoryAnnotation(category_name=category_name, category_id=categories_dict[category_name]),
             )
             image.summary = summary

@@ -274,15 +274,14 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
         if self.categories.is_filtered():
             df = MapData(
                 df,
-                filter_summary({PageType.
+                filter_summary({PageType.DOCUMENT_TYPE: self.categories.get_categories(as_dict=False, filtered=True)}),
             )
 
         @curry
-        def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr,
-            if dp.summary:
-
-
-                summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
+        def _re_map_cat_ids(dp: Image, filtered_categories_name_as_key: Mapping[TypeOrStr, int]) -> Image:
+            if PageType.DOCUMENT_TYPE in dp.summary.sub_categories:
+                summary_cat = dp.summary.get_sub_category(PageType.DOCUMENT_TYPE)
+                summary_cat.category_id = filtered_categories_name_as_key[summary_cat.category_name]
             return dp
 
         df = MapData(df, _re_map_cat_ids(self.categories.get_categories(filtered=True, name_as_key=True)))
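Both DocLayNet mapper functions above (`_map_to_image`, `_re_map_cat_ids`) follow the `@curry` pattern from `mapper.maputils`: the decorated function is called with everything except the datapoint, and the partially applied result is what `MapData` applies to each element of the dataflow. A simplified stand-in for that decorator, only to show the calling pattern and not the library's actual implementation; the toy datapoint and mapping are illustrative:

# Simplified stand-in for deepdoctection's @curry decorator.
from typing import Any, Callable, Mapping

def curry(func: Callable[..., Any]) -> Callable[..., Callable[[Any], Any]]:
    """Bind every argument except the first one (the datapoint)."""
    def binder(*args: Any, **kwargs: Any) -> Callable[[Any], Any]:
        return lambda dp: func(dp, *args, **kwargs)
    return binder

@curry
def _re_map_cat_ids(dp: dict, filtered_categories_name_as_key: Mapping[str, int]) -> dict:
    # Remap the datapoint's category name to its id in the filtered inventory.
    dp["category_id"] = filtered_categories_name_as_key[dp["category_name"]]
    return dp

mapper = _re_map_cat_ids({"financial_report": 1})      # returns a one-argument function
print(mapper({"category_name": "financial_report"}))   # {'category_name': 'financial_report', 'category_id': 1}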