deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +919 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +162 -108
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +205 -119
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +26 -17
- deepdoctection/utils/env_info.py +86 -37
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -71
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.1.dist-info/METADATA +376 -0
- deepdoctection-0.43.1.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/__init__.py CHANGED

@@ -16,14 +16,10 @@
 # limitations under the License.

 """
-
+# Dataset concept: Building, training and evaluating datasets

-<https://huggingface.co/docs/datasets/>
+Simple framework inspired by <https://huggingface.co/docs/datasets/> for creating datasets.

-for creating datasets.
-
-Create an info card, a DataFlowBaseBuilder derived instance, possibly a category card and a
-DatasetBase derived instance to create a data set.
 """

 from .adapter import *
deepdoctection/datasets/adapter.py CHANGED

@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-
+Wrapping datasets into a PyTorch dataset framework
 """


@@ -46,9 +46,8 @@ class DatasetAdapter(IterableDataset): # type: ignore
 pytorch frameworks (e.g. Detectron2). It wraps the dataset and defines the compulsory
 `__iter__` using `dataflow.build` .

-DatasetAdapter is meant for training and will therefore produce an infinite number of datapoints
+`DatasetAdapter` is meant for training and will therefore produce an infinite number of datapoints
 by shuffling and restart iteration once the previous dataflow is exhausted.
-
 """

 def __init__(

@@ -61,14 +60,15 @@ class DatasetAdapter(IterableDataset): # type: ignore
 **build_kwargs: str,
 ) -> None:
 """
-:
-
-
-
-
-
-
-
+Args:
+name_or_dataset: Registered name of the dataset or an instance.
+cache_dataset: If set to `True`, it will cache the dataset (without loading images). If possible,
+some statistics, e.g. number of specific labels will be printed.
+image_to_framework_func: A mapping function that converts image datapoints into the framework format
+use_token_tag: Will only be used for dataset_type="token_classification". If `use_token_tag=True`, will use
+labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
+`WordType.token_class`.
+build_kwargs: optional parameters for defining the dataflow.
 """
 if number_repetitions == -1 and not cache_dataset:
 raise ValueError(
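The new `Args:` block above documents the `DatasetAdapter` constructor. A minimal usage sketch follows; the dataset name `publaynet`, the toy mapping function and the string-typed build kwargs are illustrative assumptions, not taken from this diff:

```python
# Hedged sketch, not library code: only the parameter names come from the docstring above.
from deepdoctection.datasets.adapter import DatasetAdapter

def to_my_framework(dp):
    # Hypothetical mapper: convert a deepdoctection `Image` datapoint into whatever
    # structure the training framework expects.
    return {"file_name": dp.file_name, "image_id": dp.image_id}

adapter = DatasetAdapter(
    name_or_dataset="publaynet",          # registered dataset name or a DatasetBase instance
    cache_dataset=True,                   # cache datapoints (without images) and print label statistics
    image_to_framework_func=to_my_framework,
    number_repetitions=-1,                # restart the exhausted dataflow, i.e. iterate indefinitely
    split="train",                        # forwarded to `dataflow.build(...)` via build_kwargs
    max_datapoints="500",
)

for datapoint in adapter:                 # an IterableDataset, so it also plugs into a torch DataLoader
    break
```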
deepdoctection/datasets/base.py CHANGED

@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-
+DatasetBase, MergeDatasets and CustomDataset
 """
 from __future__ import annotations


@@ -42,9 +42,9 @@ from .info import DatasetCategories, DatasetInfo, get_merged_categories

 class DatasetBase(ABC):
 """
-Base class for a dataset. Requires to
-yourself. These methods must return a DatasetCategories
-together give a complete description of the dataset. Compare some specific dataset cards in the
+Base class for a dataset. Requires to implement `_categories`, `_info` and `_builder` by
+yourself. These methods must return a `DatasetCategories`, a `DatasetInfo` and a `DataFlow_Builder` instance, which
+together give a complete description of the dataset. Compare some specific dataset cards in the `instance`.
 """

 def __init__(self) -> None:
@@ -65,21 +65,21 @@ class DatasetBase(ABC):
 @property
 def dataset_info(self) -> DatasetInfo:
 """
-dataset_info
+`dataset_info`
 """
 return self._dataset_info

 @property
 def dataflow(self) -> DataFlowBaseBuilder:
 """
-dataflow
+`dataflow`
 """
 return self._dataflow_builder

 @abstractmethod
 def _categories(self) -> DatasetCategories:
 """
-Construct the DatasetCategory object.
+Construct the `DatasetCategory` object.
 """

 raise NotImplementedError()

@@ -88,7 +88,7 @@ class DatasetBase(ABC):
 @abstractmethod
 def _info(cls) -> DatasetInfo:
 """
-Construct the DatasetInfo object.
+Construct the `DatasetInfo` object.
 """

 raise NotImplementedError()

@@ -96,7 +96,7 @@ class DatasetBase(ABC):
 @abstractmethod
 def _builder(self) -> DataFlowBaseBuilder:
 """
-Construct the DataFlowBaseBuilder object. It needs to be implemented in the derived class.
+Construct the `DataFlowBaseBuilder` object. It needs to be implemented in the derived class.
 """

 raise NotImplementedError()

@@ -113,7 +113,7 @@ class DatasetBase(ABC):
 @staticmethod
 def is_built_in() -> bool:
 """
-Returns flag to indicate if dataset is custom or built
+Returns flag to indicate if dataset is custom or built-in.
 """
 return False

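The hunks above only add backticks, but they touch the two public accessors of `DatasetBase`. As a reminder of how they are typically consumed (the registered dataset name and the chosen build arguments are assumptions for illustration):

```python
import deepdoctection as dd

publaynet = dd.get_dataset("publaynet")   # a registered, built-in DatasetBase instance

info = publaynet.dataset_info             # DatasetInfo via the `dataset_info` property
builder = publaynet.dataflow              # DataFlowBaseBuilder via the `dataflow` property

df = builder.build(split="train", max_datapoints=100)
df.reset_state()                          # dataflows have to be reset before iterating
for dp in df:                             # dp is a deepdoctection `Image` datapoint
    break
```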
@@ -140,9 +140,10 @@ class SplitDataFlow(DataFlowBaseBuilder):

 def __init__(self, train: list[Image], val: list[Image], test: Optional[list[Image]]):
 """
-:
-
-
+Args:
+train: Cached `train` split
+val: Cached `val` split
+test: Cached `test` split
 """
 super().__init__(location="")
 self.split_cache: dict[str, list[Image]]

@@ -154,8 +155,12 @@ class SplitDataFlow(DataFlowBaseBuilder):
 def build(self, **kwargs: Union[str, int]) -> DataFlow:
 """
 Dataflow builder for merged split datasets
-
-:
+
+Args:
+kwargs: Only split and max_datapoints arguments will be considered.
+
+Returns:
+Dataflow
 """

 split = kwargs.get("split", "train")
@@ -175,44 +180,49 @@ class MergeDataset(DatasetBase):
 guarantee flexibility it is possible to pass customized dataflows explicitly to maybe reduce the dataflow size from
 one dataset or to use different splits from different datasets.

-
-
-
+Note:
+When yielding datapoints from `build` dataflows, note that one dataset will pass all its samples successively
+which might reduce randomness for training. Buffering all datasets (without loading heavy components like
+images) is therefore possible and the merged dataset can be shuffled.

-
-
+When the datasets that are buffered are split functionality one can divide the buffered samples into an `train`,
+`val` and `test` set.

 While the selection of categories is given by the union of all categories of all datasets, sub categories need to
 be handled with care: Only sub categories for one specific category are available provided that every dataset has
 this sub category available for this specific category. The range of sub category values again is defined as the
 range of all values from all datasets.

-
-
-dataset_1 = get_dataset("dataset_1")
-dataset_2 = get_dataset("dataset_2")
+Example:

-
-
-
+```python
+dataset_1 = get_dataset("dataset_1")
+dataset_2 = get_dataset("dataset_2")

+union_dataset = MergeDataset(dataset_1,dataset_2)
+union_dataset.buffer_datasets(split="train") # will cache the train split of dataset_1 and dataset_2
+merge.split_datasets(ratio=0.1, add_test=False) # will create a new split of the union.
+```

-
+Example:

-
-
+```python
+dataset_1 = get_dataset("dataset_1")
+dataset_2 = get_dataset("dataset_2")

-
-
+df_1 = dataset_1.dataflow.build(max_datapoints=20) # handle separate dataflow configs ...
+df_2 = dataset_1.dataflow.build(max_datapoints=30)

-
-
-
+union_dataset = MergeDataset(dataset_1,dataset_2)
+union_dataset.explicit_dataflows(df_1,df_2) # ... and pass them explicitly. Filtering is another
+# possibility
+```
 """

 def __init__(self, *datasets: DatasetBase):
 """
-:
+Args:
+datasets: An arbitrary number of datasets
 """
 self.datasets = datasets
 self.dataflows: Optional[tuple[DataFlow, ...]] = None
@@ -244,12 +254,17 @@ class MergeDataset(DatasetBase):
 def build(self, **kwargs: Union[str, int]) -> DataFlow:
 """
 Building the dataflow of merged datasets. No argument will affect the stream if the dataflows have
-been explicitly passed. Otherwise, all kwargs will be passed to all dataflows.
-
-
+been explicitly passed. Otherwise, all kwargs will be passed to all dataflows.
+
+Note:
+Note that each dataflow will iterate until it is exhausted. To guarantee randomness across
+different datasets cache all datapoints and shuffle them afterwards (e.g. use `buffer_dataset()`).
+
+Args:
+kwargs: arguments for `build()`

-:
-
+Return:
+`Dataflow`
 """
 df_list = []
 if self.dataflows is not None:

@@ -272,7 +287,8 @@ class MergeDataset(DatasetBase):
 Pass explicit dataflows for each dataset. Using several dataflow configurations for one dataset is possible as
 well. However, the number of dataflow must exceed the number of merged datasets.

-:
+Args:
+dataflows args: An arbitrary number of dataflows
 """
 self.dataflows = dataflows
 if len(self.datasets) > len(self.dataflows):

@@ -286,19 +302,23 @@ class MergeDataset(DatasetBase):
 """
 Buffer datasets with given configs. If dataflows are passed explicitly it will cache their streamed output.

-:
-
+Args:
+kwargs: arguments for `build()`
+
+Returns:
+Dataflow
 """
 df = self.dataflow.build(**kwargs)
 self.datapoint_list = CacheData(df, shuffle=True).get_cache()

 def split_datasets(self, ratio: float = 0.1, add_test: bool = True) -> None:
 """
-Split cached datasets into train
+Split cached datasets into `train`/`val`(/`test`).

-:
-
-
+Args:
+ratio: 1-ratio will be assigned to the train split. The remaining bit will be assigned to val and test
+split.
+add_test: Add a test split
 """
 assert self.datapoint_list is not None, "Datasets need to be buffered before splitting"
 number_datapoints = len(self.datapoint_list)

@@ -332,7 +352,8 @@ class MergeDataset(DatasetBase):
 To reproduce a dataset split at a later stage, get a summary of the by having a dict of list with split and
 the image ids contained in the split.

-:
+Returns:
+A dict with keys `train`, `val` and `test`: `{"train": ['ab','ac'],"val":['bc','bd']}`
 """
 if isinstance(self._dataflow_builder, SplitDataFlow):
 return {

@@ -345,8 +366,11 @@ class MergeDataset(DatasetBase):
 self, split_dict: Mapping[str, Sequence[str]], **dataflow_build_kwargs: Union[str, int]
 ) -> None:
 """
-Reproducing a dataset split from a dataset or a dataflow by a dict of list of
+Reproducing a dataset split from a dataset or a dataflow by a dict of list of `image_id`s.
+
+Example:

+```python
 merge = dd.MergeDataset(doclaynet)
 merge.explicit_dataflows(df_doc)
 merge.buffer_datasets()

@@ -357,8 +381,10 @@ class MergeDataset(DatasetBase):
 df_doc_2 = doclaynet.dataflow.build(split="train", max_datapoints=4000)
 merge_2.explicit_dataflows(df_doc_2)
 merge_2.create_split_by_id(out) # merge_2 now has the same split as merge
+```

-:
+Args:
+split_dict: e.g. `{"train":['ab','ac',...],"val":['bc'],"test":[]}`
 """

 if set(split_dict.keys()) != {"train", "val", "test"}:
@@ -399,33 +425,41 @@ class CustomDataset(DatasetBase):
 description: Optional[str] = None,
 ):
 """
-
+
+Args:
+name: Name of the dataset. It will not be used in the code, however it might be helpful, if several
 custom datasets are in use.
-
+dataset_type: Datasets need to be characterized by one of the `enum` members `DatasetType` that describe
 the machine learning task the dataset is built for. You can get all registered types with

-
-
+```python
+types = dd.object_types_registry.get("DatasetType")
+print({t for t in types})
+```

-
+location: Datasets should be stored a sub folder of name `location` in the local cache
 `get_dataset_dir_path()`. There are good reasons to use `name`.
-
-
-
-
-
-
-
+init_categories: A list of all available categories in this dataset. You must use a list as the order
+of the categories must always be preserved: they determine the category id that in turn
+will be used for model training.
+dataflow_builder: A subclass of `DataFlowBaseBuilder`. Do not instantiate the class by yourself.
+init_sub_categories: A dict mapping main categories to sub categories, if there are any available.
+Suppose an object `LayoutType.cell` has two additional information in the annotation
+file: `CellType.header, CellType.body`. You can then write:

-
+```python
+{LayoutType.cell: {CellType.header: [CellType.header, CellType.body]}
+```

-
-
-
-
+This setting assumes that later in the mapping the `ImageAnnotation` with
+`category_name=LayoutType.cell` will have a sub category of key `CellType.header`
+and one of the two values `CellType.header, CellType.body`.
+annotation_files: A mapping to one or more annotation files, e.g.

-
-
+```python
+annotation_file = {"train": "train_file.json", "test": "test_file.json"}
+```
+description: A description of the dataset.
 """

 self.name = name
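The expanded `Args:` section above spells out the whole `CustomDataset` constructor. A hedged construction sketch: the category members are spelled as in the docstring above (adjust them to your installed version), `MyDataflowBuilder` is the hypothetical builder sketched after the `dataflow_builder.py` hunks further below, and the file names are placeholders:

```python
import deepdoctection as dd

# Discover the registered dataset types first (snippet from the docstring above).
types = dd.object_types_registry.get("DatasetType")
print({t for t in types})

my_dataset = dd.CustomDataset(
    name="my_tables",
    dataset_type=dd.DatasetType.object_detection,  # one of the members printed above; spelling is an assumption
    location="my_tables",                          # sub folder of the local dataset cache
    init_categories=[dd.LayoutType.table, dd.LayoutType.cell],  # order fixes the category ids
    dataflow_builder=MyDataflowBuilder,            # pass the class, not an instance
    annotation_files={"train": "train_file.json", "test": "test_file.json"},
)

df = my_dataset.dataflow.build(split="train")
```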
@@ -467,13 +501,16 @@ class CustomDataset(DatasetBase):
 """
 This static method creates a CustomDataset instance from a dataset card.

-A dataset card is a JSON file that contains metadata about the dataset such as its name
-initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is
-that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.
+A dataset card is a `JSON` file that contains metadata about the dataset such as its `name`, `dataset_type`,
+`location`, initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is
+a class that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.

-:
-
-
+Args:
+file_path: The path to the dataset card (`JSON` file).
+dataflow_builder: The class used to build the dataflow for the dataset.
+
+Returns:
+A CustomDataset instance created from the dataset card.
 """

 with open(file_path, "r", encoding="UTF-8") as file:

@@ -496,9 +533,8 @@ class CustomDataset(DatasetBase):

 def as_dict(self) -> Mapping[str, Any]:
 """
-Return
-
-:return: A dictionary containing the meta-data of the dataset.
+Return:
+The meta-data of the dataset as a dictionary.
 """
 return {
 "name": self.name,

@@ -519,9 +555,10 @@ class CustomDataset(DatasetBase):

 def save_dataset_card(self, file_path: str) -> None:
 """
-Save the dataset card to a JSON file.
+Save the dataset card to a `JSON` file.

-:
+Args:
+file_path: file_path
 """
 with open(file_path, "w", encoding="UTF-8") as file:
 json.dump(self.as_dict(), file, indent=4)
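Taken together, `save_dataset_card` and `from_dataset_card` give a serialize/restore round trip for such a dataset definition. A short sketch, continuing the placeholder names from the `CustomDataset` example above:

```python
# Persist the dataset definition (writes `as_dict()` as JSON) ...
my_dataset.save_dataset_card("my_tables_card.json")

# ... and rebuild it later. The dataflow builder class is not serialized,
# so it has to be supplied again.
restored = dd.CustomDataset.from_dataset_card(
    file_path="my_tables_card.json",
    dataflow_builder=MyDataflowBuilder,
)
```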
deepdoctection/datasets/dataflow_builder.py CHANGED

@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-Module for DataFlowBaseBuilder class.
+Module for `DataFlowBaseBuilder` class.
 """

 from abc import ABC, abstractmethod

@@ -48,8 +48,9 @@ class DataFlowBaseBuilder(ABC):
 annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
 ):
 """
-:
-
+Args:
+location: Relative path of the physical dataset.
+annotation_files: Dict of annotation files e.g. depending on the split.
 """
 self.location = location
 if annotation_files is None:

@@ -61,7 +62,7 @@ class DataFlowBaseBuilder(ABC):
 @property
 def categories(self) -> DatasetCategories:
 """
-categories
+`categories`
 """
 if self._categories is not None:
 return self._categories

@@ -70,27 +71,28 @@ class DataFlowBaseBuilder(ABC):
 @categories.setter
 def categories(self, categories: DatasetCategories) -> None:
 """
-categories setter
+`categories` setter
 """
 self._categories = categories

 def get_split(self, key: str) -> str:
 """
-
+Args:
+key: split value
 """
 return self._splits[key]

 @property
 def splits(self) -> Mapping[str, str]:
 """
-splits
+`splits`
 """
 return self._splits

 @splits.setter
 def splits(self, splits: Mapping[str, str]) -> None:
 """
-
+`splits` setter
 """
 self._splits = splits


@@ -98,7 +100,8 @@ class DataFlowBaseBuilder(ABC):
 """
 Get the absolute path to the locally physically stored dataset.

-:
+Returns:
+local workdir
 """
 return Path(get_dataset_dir_path()) / self.location


@@ -107,13 +110,16 @@ class DataFlowBaseBuilder(ABC):
 """
 Consult the docstring w.r.t `DataFlowBaseBuilder`.

-:
-
+Args:
+kwargs: A custom set of arguments/values
+
+Returns:
+dataflow
 """
 raise NotImplementedError()

 def get_annotation_file(self, split: str) -> str:
-"""Get single annotation file
+"""Get single annotation file"""
 split_file = self.annotation_files[split]
 if isinstance(split_file, str):
 return split_file