deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic.

Files changed (124)
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +919 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +162 -108
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +205 -119
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +26 -17
  104. deepdoctection/utils/env_info.py +86 -37
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -71
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.1.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.1.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/__init__.py

@@ -16,14 +16,10 @@
  # limitations under the License.

  """
- Simple framework inspired by
+ # Dataset concept: Building, training and evaluating datasets

- <https://huggingface.co/docs/datasets/>
+ Simple framework inspired by <https://huggingface.co/docs/datasets/> for creating datasets.

- for creating datasets.
-
- Create an info card, a DataFlowBaseBuilder derived instance, possibly a category card and a
- DatasetBase derived instance to create a data set.
  """

  from .adapter import *
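
The rewritten module docstring above summarizes the dataset concept: pick a dataset, build a dataflow from it and iterate. A minimal sketch of that flow, assuming deepdoctection is installed and `publaynet` is a registered dataset whose annotations are available in the local cache:

```python
import deepdoctection as dd

# Fetch a registered dataset and stream a few datapoints from its train split.
publaynet = dd.get_dataset("publaynet")                    # registered built-in dataset
df = publaynet.dataflow.build(split="train", max_datapoints=10)

df.reset_state()                                           # dataflows must be reset before iteration
for dp in df:                                              # dp is an Image datapoint
    print(dp.image_id)
```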

deepdoctection/datasets/adapter.py

@@ -16,7 +16,7 @@
  # limitations under the License.

  """
- Module for wrapping datasets into a pytorch dataset framework.
+ Wrapping datasets into a PyTorch dataset framework
  """


@@ -46,9 +46,8 @@ class DatasetAdapter(IterableDataset):  # type: ignore
  pytorch frameworks (e.g. Detectron2). It wraps the dataset and defines the compulsory
  `__iter__` using `dataflow.build` .

- DatasetAdapter is meant for training and will therefore produce an infinite number of datapoints
+ `DatasetAdapter` is meant for training and will therefore produce an infinite number of datapoints
  by shuffling and restart iteration once the previous dataflow is exhausted.
-
  """

  def __init__(
@@ -61,14 +60,15 @@ class DatasetAdapter(IterableDataset):  # type: ignore
  **build_kwargs: str,
  ) -> None:
  """
- :param name_or_dataset: Registered name of the dataset or an instance.
- :param cache_dataset: If set to true, it will cache the dataset (without loading images). If possible,
- some statistics, e.g. number of specific labels will be printed.
- :param image_to_framework_func: A mapping function that converts image datapoints into the framework format
- :param use_token_tag: Will only be used for dataset_type="token_classification". If use_token_tag=True, will use
- labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
- `WordType.token_class`.
- :param build_kwargs: optional parameters for defining the dataflow.
+ Args:
+ name_or_dataset: Registered name of the dataset or an instance.
+ cache_dataset: If set to `True`, it will cache the dataset (without loading images). If possible,
+ some statistics, e.g. number of specific labels will be printed.
+ image_to_framework_func: A mapping function that converts image datapoints into the framework format
+ use_token_tag: Will only be used for dataset_type="token_classification". If `use_token_tag=True`, will use
+ labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
+ `WordType.token_class`.
+ build_kwargs: optional parameters for defining the dataflow.
  """
  if number_repetitions == -1 and not cache_dataset:
  raise ValueError(
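
The new `Args:` block above lists the adapter's parameters. A hedged usage sketch; the `to_framework` mapper is a placeholder, and `number_repetitions` is inferred from the constructor body shown in the hunk, not a confirmed signature:

```python
from deepdoctection.datasets.adapter import DatasetAdapter

def to_framework(dp):
    # Placeholder mapper: convert an Image datapoint into the target framework's record format.
    return {"file_name": dp.location, "image_id": dp.image_id}

adapter = DatasetAdapter(
    name_or_dataset="publaynet",          # registered name or a DatasetBase instance
    cache_dataset=True,                   # cache without images; required when repeating indefinitely
    image_to_framework_func=to_framework,
    use_token_tag=False,                  # only relevant for token_classification datasets
    number_repetitions=-1,                # assumption: -1 streams the cached dataflow indefinitely
    split="train",                        # extra kwargs are forwarded to dataflow.build(...)
)

for sample in adapter:                    # shuffled, effectively infinite stream for training
    break
```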

deepdoctection/datasets/base.py

@@ -16,7 +16,7 @@
  # limitations under the License.

  """
- Module for the base class of datasets.
+ DatasetBase, MergeDatasets and CustomDataset
  """
  from __future__ import annotations

@@ -42,9 +42,9 @@ from .info import DatasetCategories, DatasetInfo, get_merged_categories

  class DatasetBase(ABC):
  """
- Base class for a dataset. Requires to implementing `_categories` `_info` and `_builder` by
- yourself. These methods must return a DatasetCategories, a DatasetInfo and a DataFlow_Builder instance, which
- together give a complete description of the dataset. Compare some specific dataset cards in the :mod:`instance` .
+ Base class for a dataset. Requires to implement `_categories`, `_info` and `_builder` by
+ yourself. These methods must return a `DatasetCategories`, a `DatasetInfo` and a `DataFlow_Builder` instance, which
+ together give a complete description of the dataset. Compare some specific dataset cards in the `instance`.
  """

  def __init__(self) -> None:
@@ -65,21 +65,21 @@ class DatasetBase(ABC):
  @property
  def dataset_info(self) -> DatasetInfo:
  """
- dataset_info
+ `dataset_info`
  """
  return self._dataset_info

  @property
  def dataflow(self) -> DataFlowBaseBuilder:
  """
- dataflow
+ `dataflow`
  """
  return self._dataflow_builder

  @abstractmethod
  def _categories(self) -> DatasetCategories:
  """
- Construct the DatasetCategory object.
+ Construct the `DatasetCategory` object.
  """

  raise NotImplementedError()
@@ -88,7 +88,7 @@ class DatasetBase(ABC):
  @abstractmethod
  def _info(cls) -> DatasetInfo:
  """
- Construct the DatasetInfo object.
+ Construct the `DatasetInfo` object.
  """

  raise NotImplementedError()
@@ -96,7 +96,7 @@ class DatasetBase(ABC):
  @abstractmethod
  def _builder(self) -> DataFlowBaseBuilder:
  """
- Construct the DataFlowBaseBuilder object. It needs to be implemented in the derived class.
+ Construct the `DataFlowBaseBuilder` object. It needs to be implemented in the derived class.
  """

  raise NotImplementedError()
@@ -113,7 +113,7 @@ class DatasetBase(ABC):
  @staticmethod
  def is_built_in() -> bool:
  """
- Returns flag to indicate if dataset is custom or built int.
+ Returns flag to indicate if dataset is custom or built-in.
  """
  return False

@@ -140,9 +140,10 @@ class SplitDataFlow(DataFlowBaseBuilder):

  def __init__(self, train: list[Image], val: list[Image], test: Optional[list[Image]]):
  """
- :param train: Cached train split
- :param val: Cached val split
- :param test: Cached test split
+ Args:
+ train: Cached `train` split
+ val: Cached `val` split
+ test: Cached `test` split
  """
  super().__init__(location="")
  self.split_cache: dict[str, list[Image]]
@@ -154,8 +155,12 @@ class SplitDataFlow(DataFlowBaseBuilder):
  def build(self, **kwargs: Union[str, int]) -> DataFlow:
  """
  Dataflow builder for merged split datasets
- :param kwargs: Only split and max_datapoints arguments will be considered.
- :return: Dataflow
+
+ Args:
+ kwargs: Only split and max_datapoints arguments will be considered.
+
+ Returns:
+ Dataflow
  """

  split = kwargs.get("split", "train")
@@ -175,44 +180,49 @@ class MergeDataset(DatasetBase):
  guarantee flexibility it is possible to pass customized dataflows explicitly to maybe reduce the dataflow size from
  one dataset or to use different splits from different datasets.

- When yielding datapoint from :build(), note that one dataset will pass all its samples successively which
- might reduce randomness for training, especially when using datasets from the same domain. Buffering all datasets
- (without loading heavy components like images) is therefore possible and the merged dataset can be shuffled.
+ Note:
+ When yielding datapoints from `build` dataflows, note that one dataset will pass all its samples successively
+ which might reduce randomness for training. Buffering all datasets (without loading heavy components like
+ images) is therefore possible and the merged dataset can be shuffled.

- When the datasets are buffered are split functionality can divide the buffered samples into an train, val and test
- set.
+ When the datasets that are buffered are split functionality one can divide the buffered samples into an `train`,
+ `val` and `test` set.

  While the selection of categories is given by the union of all categories of all datasets, sub categories need to
  be handled with care: Only sub categories for one specific category are available provided that every dataset has
  this sub category available for this specific category. The range of sub category values again is defined as the
  range of all values from all datasets.

- **Example:**
-
- dataset_1 = get_dataset("dataset_1")
- dataset_2 = get_dataset("dataset_2")
+ Example:

- union_dataset = MergeDataset(dataset_1,dataset_2)
- union_dataset.buffer_datasets(split="train") # will cache the train split of dataset_1 and dataset_2
- merge.split_datasets(ratio=0.1, add_test=False) # will create a new split of the union.
+ ```python
+ dataset_1 = get_dataset("dataset_1")
+ dataset_2 = get_dataset("dataset_2")

+ union_dataset = MergeDataset(dataset_1,dataset_2)
+ union_dataset.buffer_datasets(split="train") # will cache the train split of dataset_1 and dataset_2
+ merge.split_datasets(ratio=0.1, add_test=False) # will create a new split of the union.
+ ```

- **Example:**
+ Example:

- dataset_1 = get_dataset("dataset_1")
- dataset_2 = get_dataset("dataset_2")
+ ```python
+ dataset_1 = get_dataset("dataset_1")
+ dataset_2 = get_dataset("dataset_2")

- df_1 = dataset_1.dataflow.build(max_datapoints=20) # handle separate dataflow configs ...
- df_2 = dataset_1.dataflow.build(max_datapoints=30)
+ df_1 = dataset_1.dataflow.build(max_datapoints=20) # handle separate dataflow configs ...
+ df_2 = dataset_1.dataflow.build(max_datapoints=30)

- union_dataset = MergeDataset(dataset_1,dataset_2)
- union_dataset.explicit_dataflows(df_1,df_2) # ... and pass them explicitly. Filtering is another
- # possibility
+ union_dataset = MergeDataset(dataset_1,dataset_2)
+ union_dataset.explicit_dataflows(df_1,df_2) # ... and pass them explicitly. Filtering is another
+ # possibility
+ ```
  """

  def __init__(self, *datasets: DatasetBase):
  """
- :param datasets: An arbitrary number of datasets
+ Args:
+ datasets: An arbitrary number of datasets
  """
  self.datasets = datasets
  self.dataflows: Optional[tuple[DataFlow, ...]] = None
@@ -244,12 +254,17 @@ class MergeDataset(DatasetBase):
  def build(self, **kwargs: Union[str, int]) -> DataFlow:
  """
  Building the dataflow of merged datasets. No argument will affect the stream if the dataflows have
- been explicitly passed. Otherwise, all kwargs will be passed to all dataflows. Note that each dataflow
- will iterate until it is exhausted. To guarantee randomness across different datasets cache all
- datapoints and shuffle them afterwards (e.g. use :buffer_dataset() ).
+ been explicitly passed. Otherwise, all kwargs will be passed to all dataflows.
+
+ Note:
+ Note that each dataflow will iterate until it is exhausted. To guarantee randomness across
+ different datasets cache all datapoints and shuffle them afterwards (e.g. use `buffer_dataset()`).
+
+ Args:
+ kwargs: arguments for `build()`

- :param kwargs: arguments for :build()
- :return: Dataflow
+ Return:
+ `Dataflow`
  """
  df_list = []
  if self.dataflows is not None:
@@ -272,7 +287,8 @@ class MergeDataset(DatasetBase):
  Pass explicit dataflows for each dataset. Using several dataflow configurations for one dataset is possible as
  well. However, the number of dataflow must exceed the number of merged datasets.

- :param dataflows: An arbitrary number of dataflows
+ Args:
+ dataflows args: An arbitrary number of dataflows
  """
  self.dataflows = dataflows
  if len(self.datasets) > len(self.dataflows):
@@ -286,19 +302,23 @@ class MergeDataset(DatasetBase):
  """
  Buffer datasets with given configs. If dataflows are passed explicitly it will cache their streamed output.

- :param kwargs: arguments for :build()
- :return: Dataflow
+ Args:
+ kwargs: arguments for `build()`
+
+ Returns:
+ Dataflow
  """
  df = self.dataflow.build(**kwargs)
  self.datapoint_list = CacheData(df, shuffle=True).get_cache()

  def split_datasets(self, ratio: float = 0.1, add_test: bool = True) -> None:
  """
- Split cached datasets into train/val(/test).
+ Split cached datasets into `train`/`val`(/`test`).

- :param ratio: 1-ratio will be assigned to the train split. The remaining bit will be assigned to val and test
- split.
- :param add_test: Add a test split
+ Args:
+ ratio: 1-ratio will be assigned to the train split. The remaining bit will be assigned to val and test
+ split.
+ add_test: Add a test split
  """
  assert self.datapoint_list is not None, "Datasets need to be buffered before splitting"
  number_datapoints = len(self.datapoint_list)
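
Taken together, `buffer_datasets` and `split_datasets` give the buffer-and-resplit workflow the class docstring describes. A short sketch, assuming both placeholder dataset names are registered and cached locally:

```python
import deepdoctection as dd

# Buffer the train splits of two datasets, then re-split the shuffled union 90/10.
dataset_1 = dd.get_dataset("dataset_1")          # placeholder names of registered datasets
dataset_2 = dd.get_dataset("dataset_2")

merge = dd.MergeDataset(dataset_1, dataset_2)
merge.buffer_datasets(split="train")             # caches datapoints of both train splits (no images)
merge.split_datasets(ratio=0.1, add_test=False)

df_val = merge.dataflow.build(split="val")       # the new splits are served by a SplitDataFlow
df_val.reset_state()
```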
@@ -332,7 +352,8 @@ class MergeDataset(DatasetBase):
  To reproduce a dataset split at a later stage, get a summary of the by having a dict of list with split and
  the image ids contained in the split.

- :return: E.g. `{"train": ['ab','ac'],"val":['bc','bd']}`
+ Returns:
+ A dict with keys `train`, `val` and `test`: `{"train": ['ab','ac'],"val":['bc','bd']}`
  """
  if isinstance(self._dataflow_builder, SplitDataFlow):
  return {
@@ -345,8 +366,11 @@ class MergeDataset(DatasetBase):
  self, split_dict: Mapping[str, Sequence[str]], **dataflow_build_kwargs: Union[str, int]
  ) -> None:
  """
- Reproducing a dataset split from a dataset or a dataflow by a dict of list of image ids.
+ Reproducing a dataset split from a dataset or a dataflow by a dict of list of `image_id`s.
+
+ Example:

+ ```python
  merge = dd.MergeDataset(doclaynet)
  merge.explicit_dataflows(df_doc)
  merge.buffer_datasets()
@@ -357,8 +381,10 @@ class MergeDataset(DatasetBase):
  df_doc_2 = doclaynet.dataflow.build(split="train", max_datapoints=4000)
  merge_2.explicit_dataflows(df_doc_2)
  merge_2.create_split_by_id(out) # merge_2 now has the same split as merge
+ ```

- :param split_dict: e.g. `{"train":['ab','ac',...],"val":['bc'],"test":[]}`
+ Args:
+ split_dict: e.g. `{"train":['ab','ac',...],"val":['bc'],"test":[]}`
  """

  if set(split_dict.keys()) != {"train", "val", "test"}:
@@ -399,33 +425,41 @@ class CustomDataset(DatasetBase):
  description: Optional[str] = None,
  ):
  """
- :param name: Name of the dataset. It will not be used in the code, however it might be helpful, if several
+
+ Args:
+ name: Name of the dataset. It will not be used in the code, however it might be helpful, if several
  custom datasets are in use.
- :param dataset_type: Datasets need to be characterized by one of the `enum` members `DatasetType` that describe
+ dataset_type: Datasets need to be characterized by one of the `enum` members `DatasetType` that describe
  the machine learning task the dataset is built for. You can get all registered types with

- types = dd.object_types_registry.get("DatasetType")
- print({t for t in types})
+ ```python
+ types = dd.object_types_registry.get("DatasetType")
+ print({t for t in types})
+ ```

- :param location: Datasets should be stored a sub folder of name `location` in the local cache
+ location: Datasets should be stored a sub folder of name `location` in the local cache
  `get_dataset_dir_path()`. There are good reasons to use `name`.
- :param init_categories: A list of all available categories in this dataset. You must use a list as the order
- of the categories must always be preserved: they determine the category id that in turn
- will be used for model training.
- :param dataflow_builder: A subclass of `DataFlowBaseBuilder`. Do not instantiate the class by yourself.
- :param init_sub_categories: A dict mapping main categories to sub categories, if there are any available.
- Suppose an object `LayoutType.cell` has two additional information in the annotation
- file: `CellType.header, CellType.body`. You can then write:
+ init_categories: A list of all available categories in this dataset. You must use a list as the order
+ of the categories must always be preserved: they determine the category id that in turn
+ will be used for model training.
+ dataflow_builder: A subclass of `DataFlowBaseBuilder`. Do not instantiate the class by yourself.
+ init_sub_categories: A dict mapping main categories to sub categories, if there are any available.
+ Suppose an object `LayoutType.cell` has two additional information in the annotation
+ file: `CellType.header, CellType.body`. You can then write:

- {LayoutType.cell: {CellType.header: [CellType.header, CellType.body]}
+ ```python
+ {LayoutType.cell: {CellType.header: [CellType.header, CellType.body]}
+ ```

- This setting assumes that later in the mapping the `ImageAnnotation` with
- `category_name=LayoutType.cell` will have a sub category of key `CellType.header`
- and one of the two values `CellType.header, CellType.body`
- :param annotation_files: A mapping to one or more annotation files, e.g.
+ This setting assumes that later in the mapping the `ImageAnnotation` with
+ `category_name=LayoutType.cell` will have a sub category of key `CellType.header`
+ and one of the two values `CellType.header, CellType.body`.
+ annotation_files: A mapping to one or more annotation files, e.g.

- annotation_file = {"train": "train_file.json", "test": "test_file.json"}
- :param description: A description of the dataset.
+ ```python
+ annotation_file = {"train": "train_file.json", "test": "test_file.json"}
+ ```
+ description: A description of the dataset.
  """

  self.name = name
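
A hedged sketch of the constructor documented above. The dataset name, categories, the `DatasetType` member and `MyDataFlowBuilder` are illustrative assumptions; a fuller builder sketch follows after the dataflow_builder.py hunks below.

```python
import deepdoctection as dd

class MyDataFlowBuilder(dd.DataFlowBaseBuilder):   # stub; see the builder sketch further below
    def build(self, **kwargs):
        raise NotImplementedError

my_dataset = dd.CustomDataset(
    name="my_table_dataset",
    dataset_type=dd.DatasetType.object_detection,  # assumed member name; list the registered members
                                                   # via dd.object_types_registry.get("DatasetType")
    location="my_table_dataset",                   # sub folder of the local dataset cache
    init_categories=[dd.LayoutType.table, dd.LayoutType.cell],   # order fixes the category ids
    init_sub_categories={dd.LayoutType.cell: {dd.CellType.header: [dd.CellType.header, dd.CellType.body]}},
    dataflow_builder=MyDataFlowBuilder,            # pass the class, not an instance
    annotation_files={"train": "train_file.json", "test": "test_file.json"},
    description="Tables annotated with header and body cells.",
)
```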
@@ -467,13 +501,16 @@ class CustomDataset(DatasetBase):
  """
  This static method creates a CustomDataset instance from a dataset card.

- A dataset card is a JSON file that contains metadata about the dataset such as its name, type, location,
- initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is a class
- that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.
+ A dataset card is a `JSON` file that contains metadata about the dataset such as its `name`, `dataset_type`,
+ `location`, initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is
+ a class that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.

- :param file_path: The path to the dataset card (JSON file).
- :param dataflow_builder: The class used to build the dataflow for the dataset.
- :return: A CustomDataset instance created from the dataset card.
+ Args:
+ file_path: The path to the dataset card (`JSON` file).
+ dataflow_builder: The class used to build the dataflow for the dataset.
+
+ Returns:
+ A CustomDataset instance created from the dataset card.
  """

  with open(file_path, "r", encoding="UTF-8") as file:
@@ -496,9 +533,8 @@ class CustomDataset(DatasetBase):

  def as_dict(self) -> Mapping[str, Any]:
  """
- Return the meta-data of the dataset as a dictionary.
-
- :return: A dictionary containing the meta-data of the dataset.
+ Return:
+ The meta-data of the dataset as a dictionary.
  """
  return {
  "name": self.name,
@@ -519,9 +555,10 @@ class CustomDataset(DatasetBase):

  def save_dataset_card(self, file_path: str) -> None:
  """
- Save the dataset card to a JSON file.
+ Save the dataset card to a `JSON` file.

- :param file_path: file_path
+ Args:
+ file_path: file_path
  """
  with open(file_path, "w", encoding="UTF-8") as file:
  json.dump(self.as_dict(), file, indent=4)
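
Combining `save_dataset_card` and `from_dataset_card` gives a simple persistence round trip for the metadata. A sketch, continuing the hypothetical `my_dataset` and `MyDataFlowBuilder` from above:

```python
# Persist the dataset metadata (as_dict()) and restore it later with the same builder class.
my_dataset.save_dataset_card("my_table_dataset_card.json")

restored = dd.CustomDataset.from_dataset_card(
    file_path="my_table_dataset_card.json",
    dataflow_builder=MyDataFlowBuilder,
)
```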

deepdoctection/datasets/dataflow_builder.py

@@ -16,7 +16,7 @@
  # limitations under the License.

  """
- Module for DataFlowBaseBuilder class.
+ Module for `DataFlowBaseBuilder` class.
  """

  from abc import ABC, abstractmethod
@@ -48,8 +48,9 @@ class DataFlowBaseBuilder(ABC):
  annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
  ):
  """
- :param location: Relative path of the physical dataset.
- :param annotation_files: Dict of annotation files e.g. depending on the split.
+ Args:
+ location: Relative path of the physical dataset.
+ annotation_files: Dict of annotation files e.g. depending on the split.
  """
  self.location = location
  if annotation_files is None:
@@ -61,7 +62,7 @@ class DataFlowBaseBuilder(ABC):
  @property
  def categories(self) -> DatasetCategories:
  """
- categories
+ `categories`
  """
  if self._categories is not None:
  return self._categories
@@ -70,27 +71,28 @@ class DataFlowBaseBuilder(ABC):
  @categories.setter
  def categories(self, categories: DatasetCategories) -> None:
  """
- categories setter
+ `categories` setter
  """
  self._categories = categories

  def get_split(self, key: str) -> str:
  """
- split value
+ Args:
+ key: split value
  """
  return self._splits[key]

  @property
  def splits(self) -> Mapping[str, str]:
  """
- splits
+ `splits`
  """
  return self._splits

  @splits.setter
  def splits(self, splits: Mapping[str, str]) -> None:
  """
- set splits
+ `splits` setter
  """
  self._splits = splits

@@ -98,7 +100,8 @@ class DataFlowBaseBuilder(ABC):
  """
  Get the absolute path to the locally physically stored dataset.

- :return: local workdir
+ Returns:
+ local workdir
  """
  return Path(get_dataset_dir_path()) / self.location

@@ -107,13 +110,16 @@ class DataFlowBaseBuilder(ABC):
  """
  Consult the docstring w.r.t `DataFlowBaseBuilder`.

- :param kwargs: A custom set of arguments/values
- :return: dataflow
+ Args:
+ kwargs: A custom set of arguments/values
+
+ Returns:
+ dataflow
  """
  raise NotImplementedError()

  def get_annotation_file(self, split: str) -> str:
- """Get single annotation file."""
+ """Get single annotation file"""
  split_file = self.annotation_files[split]
  if isinstance(split_file, str):
  return split_file
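
To tie the pieces together, a minimal sketch of a concrete builder such as the one assumed by the `CustomDataset` example above. The JSON-lines annotation format, the `to_image` mapper and the serializer choice are assumptions for illustration, not the library's prescribed recipe:

```python
import deepdoctection as dd

def to_image(record):
    # Placeholder: turn one raw annotation record into an Image datapoint.
    raise NotImplementedError

class MyDataFlowBuilder(dd.DataFlowBaseBuilder):
    def build(self, **kwargs):
        split = str(kwargs.get("split", "train"))
        max_datapoints = kwargs.get("max_datapoints")
        path = self.get_workdir() / self.get_annotation_file(split)   # absolute path to the split's file
        df = dd.SerializerJsonlines.load(path, max_datapoints=max_datapoints)
        return dd.MapData(df, to_image)                               # stream Image datapoints
```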