deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of deepdoctection has been flagged as potentially problematic.
Files changed (124)
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +919 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +162 -108
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +205 -119
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +26 -17
  104. deepdoctection/utils/env_info.py +86 -37
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -71
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.1.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.1.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/info.py

@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-Module for storing dataset info (e.g. general meta data or categories)
+General meta-data or categories
 """

 from copy import copy

@@ -53,10 +53,13 @@ def _get_dict(
     """
     Converts a list into a dict, where keys/values are the list indices.

-    :param l: A list of categories
-    :param name_as_key: Whether to return the dict with category names as key (True)
-    :param starts_with: index count start
-    :return: A dictionary of list indices/list elements.
+    Args:
+        l: A list of categories
+        name_as_key: Whether to return the dict with category names as key (`True`)
+        starts_with: index count start
+
+    Returns:
+        A dictionary of list indices/list elements.
     """
     if name_as_key:
         return {v: k for k, v in enumerate(l, starts_with)}
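The hunk also shows the `name_as_key=True` branch of the helper. A standalone paraphrase of the mapping it documents (not the library's private function; names are illustrative):

```python
# Paraphrase of the helper documented above: with name_as_key=True the category
# names become keys, otherwise the running indices do.
def index_categories(categories, name_as_key=True, starts_with=1):
    if name_as_key:
        return {name: idx for idx, name in enumerate(categories, starts_with)}
    return dict(enumerate(categories, starts_with))

assert index_categories(["text", "title"]) == {"text": 1, "title": 2}
assert index_categories(["text", "title"], name_as_key=False) == {1: "text", 2: "title"}
```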
@@ -66,22 +69,17 @@ def _get_dict(
 @dataclass
 class DatasetInfo:
     """
-    DatasetInfo is a simple dataclass that stores some meta-data information about a dataset.
-
-    `name`: Name of the dataset. Using the name you can retrieve the dataset from the
-    `registry.DatasetRegistry`.
-
-    `description`: Short description of the dataset.
-
-    `license`: License to the dataset.
-
-    `url`: url, where the dataset can be downloaded from.
-
-    `splits`: A dict of splits. The value must store the relative path, where the split can be found.
-
-    `type`: The type describes whether this is a dataset for object detection (pass 'OBJECT_DETECTION'),
-    sequence classification (pass 'SEQUENCE_CLASSIFICATION') or token classification ('TOKEN_CLASSIFICATION').
-    Optionally, pass `None`.
+    `DatasetInfo` is a simple dataclass that stores some meta-data information about a dataset.
+
+    Attributes:
+        name: Name of the dataset. Using the name you can retrieve the dataset from the `registry.DatasetRegistry`.
+        description: Short description of the dataset.
+        license: License to the dataset.
+        url: url, where the dataset can be downloaded from.
+        splits: A `dict` of splits. The value must store the relative path, where the split can be found.
+        type: The type describes whether this is a dataset for object detection (pass 'OBJECT_DETECTION'),
+            sequence classification (pass 'SEQUENCE_CLASSIFICATION') or token classification ('TOKEN_CLASSIFICATION').
+            Optionally, pass `None`.
     """

     name: str

@@ -96,8 +94,11 @@ class DatasetInfo:
         """
         Get the split directory by its key (if it exists).

-        :param key: The key to a split (i.e. "train", "val", "test")
-        :return: The local directory path to the split. An empty string if the key doesn't exist.
+        Args:
+            key: The key to a split (i.e. `train`, `val`, `test`)
+
+        Returns:
+            The local directory path to the split. An empty string if the key doesn't exist.
         """

         return self.splits[key]
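The two hunks above describe the `DatasetInfo` attributes and its split lookup. A minimal sketch, assuming the split lookup is exposed as `get_split` (the method name is not shown in the hunk) and that field defaults match the docstring; the dataset name, URL and split paths are placeholders:

```python
from deepdoctection.datasets.info import DatasetInfo

# Hypothetical meta-data for an in-house dataset.
info = DatasetInfo(
    name="my_dataset",
    description="A small in-house layout dataset",
    license="CC BY 4.0",
    url="https://example.com/my_dataset.zip",
    splits={"train": "train", "val": "val"},
    type="OBJECT_DETECTION",
)

print(info.get_split("train"))  # per the docstring: the split path, or "" for unknown keys
```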
@@ -112,22 +113,26 @@ class DatasetCategories:
     for the index/category name relationship and guarantees that a sequence of natural numbers for the categories
     is always returned as the category-id even after replacing and/or filtering.

-    `init_categories`: A list of category names. The list must include all categories that can occur within the
-    annotations.
-
-    `init_sub_categories`: A dict of categories/sub-categories. Each sub-category that can appear in the
-    annotations in combination with a category must be listed.
+    Attributes:
+        init_categories: A list of `category_name`s. The list must include all categories that can occur within the
+            annotations.
+        init_sub_categories: A dict of categories/sub-categories. Each sub-category that can appear in the
+            annotations in combination with a category must be listed.

-    **Example:**
+    Example:

         An annotation file hast the category/sub-category combinations for three datapoints:

-        (cat1,s1),(cat1,s2), (cat2,s2).
+        ```python
+        (cat1,s1),(cat1,s2), (cat2,s2).
+        ```

         You must list `init_categories`, `init_sub_categories` as follows:

-        init_categories = [cat1,cat2]
-        init_sub_categories = {cat1: [s1,s2],cat2: [s2]}
+        ```python
+        init_categories = [cat1,cat2]
+        init_sub_categories = {cat1: [s1,s2],cat2: [s2]}
+        ```

     Use `filter_categories` or `set_cat_to_sub_cat` to filter or swap categories with sub-categories.
     """

@@ -173,14 +178,17 @@ class DatasetCategories:
         categories of replaced categories with sub categories. However, you must correctly pass arguments to return the
         state you want.

-        :param as_dict: Will pass a dict if set to 'True' otherwise a list.
-        :param name_as_key: Categories are stored as key/value pair in a dict with integers as keys. name_as_key set to
-        "False" will swap keys and values.
-        :param init: If set to "True" it will return the list/dict of categories as initially provided. Manipulations
-        due to replacing/filtering will not be regarded.
-        :param filtered: If set to "True" will return an unfiltered list of all categories. If a replacing has been
-        invoked selected sub categories will be returned.
-        :return: A dict of index/category names (or the other way around) or a list of category names.
+        Args:
+            as_dict: Will pass a dict if set to 'True' otherwise a list.
+            name_as_key: Categories are stored as key/value pair in a dict with integers as keys. `name_as_key` set to
+                `False` will swap keys and values.
+            init: If set to `True` it will return the list/dict of categories as initially provided. Manipulations
+                due to replacing/filtering will not be regarded.
+            filtered: If set to `True` will return an unfiltered list of all categories. If a replacing has been
+                invoked selected sub categories will be returned.
+
+        Returns:
+            A dict of index/category names (or the other way around) or a list of category names.
         """
         if init:
             if as_dict:
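The `Args` block above describes how `as_dict`, `name_as_key`, `init` and `filtered` interact. A small sketch against a registered dataset, assuming the method shown here is `DatasetCategories.get_categories` and that the builder exposes its categories as `dataflow.categories` (inspecting categories needs no dataset download):

```python
from deepdoctection.datasets import get_dataset

publaynet = get_dataset("publaynet")      # registered name, see the file list above
cats = publaynet.dataflow.categories      # a DatasetCategories instance

print(cats.get_categories(as_dict=True, name_as_key=False))  # {1: <category>, 2: ...}
print(cats.get_categories(as_dict=True, name_as_key=True))   # {<category>: 1, ...}
print(cats.get_categories(as_dict=False))                    # plain list of category names
```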
@@ -209,14 +217,17 @@ class DatasetCategories:
         """
         Returns a dict of list with a category name and their sub categories.

-        :param categories: A single category or list of category names
-        :param sub_categories: A mapping of categories to sub category keys on which the result should be filtered. Only
+        Args:
+            categories: A single category or list of category names
+            sub_categories: A mapping of categories to sub category keys on which the result should be filtered. Only
                 relevant, if `keys=False`
-        :param keys: Will only pass keys if set to `True`.
-        :param values_as_dict: Will generate a dict with indices and sub category value names if set to `True`.
-        :param name_as_key: sub category values are stored as key/value pair in a dict with integers as keys.
+            keys: Will only pass keys if set to `True`.
+            values_as_dict: Will generate a dict with indices and sub category value names if set to `True`.
+            name_as_key: sub category values are stored as key/value pair in a dict with integers as keys.
                 name_as_key set to `False` will swap keys and values.
-        :return: Dict with all selected categories.
+
+        Returns:
+            Dict with all selected categories.
         """
         _categories: Sequence[ObjectTypes]
         if isinstance(categories, (ObjectTypes, str)):

@@ -293,14 +304,16 @@ class DatasetCategories:
         This method can only be called once per object. Re-setting or further replacing of categories would make the
         code messy and is therefore not allowed.

-        **Example:**
-
-            cat_to_sub_cat={cat1: sub_cat1}
+        Example:
+            ```python
+            cat_to_sub_cat={cat1: sub_cat1}
+            ```

         will replace cat1 with sub_cat1 as category. This will also be respected when returning datapoints.

-        :param cat_to_sub_cat: A dict of pairs of category/sub-category. Note that the combination must be available
-        according to the initial settings.
+        Args:
+            cat_to_sub_cat: A dict of pairs of category/sub-category. Note that the combination must be available
+                according to the initial settings.
         """

         _cat_to_sub_cat = {get_type(key): get_type(value) for key, value in cat_to_sub_cat.items()}
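A sketch of the swap described above. The dataset and the category/sub-category pair are hypothetical; the pair must be one of the combinations declared in the dataset's `init_sub_categories`:

```python
from deepdoctection.datasets import get_dataset

dataset = get_dataset("pubtabnet")
cats = dataset.dataflow.categories

# Hypothetical pair: replace a category by one of its registered sub categories.
# Check the dataset's init_sub_categories for the combinations it actually supports.
cats.set_cat_to_sub_cat({"cell": "head"})

# From now on, get_categories() and streamed datapoints report the sub category.
print(cats.get_categories(as_dict=True))
```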
@@ -327,7 +340,8 @@ class DatasetCategories:
         Filter categories of a dataset. This will keep all the categories chosen and remove all others.
         This method can only be called once per object.

-        :param categories: A single category name or a list of category names.
+        Args:
+            categories: A single `category_name` or a list of `category_name`s.
         """

         if not self._allow_update:

@@ -344,13 +358,14 @@ class DatasetCategories:
     @property
     def cat_to_sub_cat(self) -> Optional[Mapping[ObjectTypes, ObjectTypes]]:
         """
-        cat_to_sub_cat
+        `cat_to_sub_cat`
         """
         return self._cat_to_sub_cat

     def is_cat_to_sub_cat(self) -> bool:
         """
-        returns `True` if a category is replaced with sub categories
+        Returns:
+            `True` if a category is replaced with sub categories
         """
         if self._cat_to_sub_cat is not None:
             return True

@@ -358,7 +373,8 @@ class DatasetCategories:

     def is_filtered(self) -> bool:
         """
-        return `True` if categories are filtered
+        Returns:
+            `True` if categories are filtered
         """
         if hasattr(self, "_categories_filter_update"):
             return True

@@ -379,8 +395,11 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
     as well but no sub category than the merged dataset will have no sub categories at all. Whereas in a similar setting
     dataset B has sub category `foo`:`bak`, then `bak` will be an optional sub category for the merged dataset as well.

-    :param categories: A tuple/list of dataset categories
-    :return: An instance of `DatasetCategories` to be used as `DatasetCategories` for merged datasets
+    Args:
+        categories: A tuple/list of dataset categories
+
+    Returns:
+        An instance of `DatasetCategories` to be used as `DatasetCategories` for merged datasets
     """

     # working with lists is not possible as the order of categories is important here
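A short sketch of merging the category setups of two registered datasets, assuming `get_merged_categories` keeps its location in `deepdoctection.datasets.info`:

```python
from deepdoctection.datasets import get_dataset
from deepdoctection.datasets.info import get_merged_categories

publaynet = get_dataset("publaynet")
doclaynet = get_dataset("doclaynet")

# Merge the category objects; per the docstring, a sub category survives only if
# every dataset in the merge provides it.
merged = get_merged_categories(
    publaynet.dataflow.categories,
    doclaynet.dataflow.categories,
)
print(merged.get_categories(as_dict=True))
```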
deepdoctection/datasets/instances/__init__.py

@@ -16,7 +16,9 @@
 # limitations under the License.

 """
-Init file for instances package. Place all datasets in a directory using the structure
+Dataset samples for pre-training and fine-tuning models
+
+Place all datasets in a **deep**doctection's cache

 deepdoctection
 ├── datasets

@@ -24,9 +26,11 @@ Init file for instances package. Place all datasets in a directory using the str
 │ ├── dataset_2
 │ ├── dataset_3

-If not sure use
+If not sure:

+```python
 print(dataset_instance.dataflow.get_workdir())
+```
 """

 from .doclaynet import *
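A complete version of the snippet in the docstring above, using the `get_dataset` registry helper to print where a built-in dataset expects its files:

```python
from deepdoctection.datasets import get_dataset

publaynet = get_dataset("publaynet")
# Prints the directory inside the deepdoctection cache where the files must be placed.
print(publaynet.dataflow.get_workdir())
```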
deepdoctection/datasets/instances/doclaynet.py

@@ -25,6 +25,7 @@ Module for DocLayNet dataset. Place the dataset as follows
 ├── PNG
 │ ├── 0a0d43e301facee9e99cc33b9b16e732dd207135f4027e75f6aea2bf117535a2.png
 """
+
 from __future__ import annotations

 import os

@@ -101,7 +102,7 @@ _SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]
 @dataset_registry.register("doclaynet")
 class DocLayNet(DatasetBase):
     """
-    DocLayNetSeq
+    `DocLayNet`
     """

     @classmethod

@@ -125,7 +126,7 @@ class DocLayNet(DatasetBase):

 class DocLayNetBuilder(DataFlowBaseBuilder):
     """
-    DocLayNetBuilder dataflow builder
+    `DocLayNetBuilder` dataflow builder
     """

     def build(self, **kwargs: Union[str, int]) -> DataFlow:

@@ -133,15 +134,14 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: None
-
-        `load_image:` Will load the image for each datapoint. Default: False
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
+        Args:
+            kwargs: (split) Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions

-        :return: dataflow
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "val"))
         max_datapoints = kwargs.get("max_datapoints")
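A minimal sketch of streaming a few datapoints from the builder documented above, assuming the DocLayNet files are already placed in the cache directory reported by `get_workdir()`:

```python
from deepdoctection.datasets import get_dataset

doclaynet = get_dataset("doclaynet")
df = doclaynet.dataflow.build(split="val", max_datapoints=10, load_image=True)

df.reset_state()                 # dataflows must be reset before iteration
for dp in df:
    # dp is an Image datapoint carrying the layout annotations
    print(dp.file_name, len(dp.get_annotation()))
```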
@@ -233,11 +233,14 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-        `load_image:` Will load the image for each datapoint. Default: `False`
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`

-        :return: dataflow
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "val"))
         max_datapoints = kwargs.get("max_datapoints")
deepdoctection/datasets/instances/fintabnet.py

@@ -157,28 +157,22 @@ class FintabnetBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
-
-        `build_mode:` Returns the full image or crops a table according to the table bounding box. Pass `table`
-        if you only want the cropped table. Default: ""
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `rows_and_cols:` Will add a 'item' image annotations that either represent a row or a column of a table.
-            Note, that the type of the item (i.e. being a row or a column) can be inferred from the
-            sub category added. Note further, that "item" are not originally part of the annotations
-            and are inferred from cell positions and their associated table semantic. Default: `True`
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `use_multi_proc:` As the original files are stored as pdf conversion into a numpy array is time-consuming.
-            When setting use_multi_proc to True is will use several processes depending on the number
-            of CPUs available.
-
-        `use_multi_proc_strict:` Will use strict mode in multiprocessing.
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (build_mode) Returns the full image or crops a table according to the table bounding box. Pass `table`
+                    if you only want the cropped table. Default: `""`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (rows_and_cols) Will add 'item' image annotations that either represent a row or a column of a table.
+                    Default: `True`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (use_multi_proc) Uses multiple processes for PDF conversion. Default: `True`
+                (use_multi_proc_strict) Uses strict mode in multiprocessing. Default: `False`
+                (fake_score) Adds a fake score so that annotations look like predictions. Default: `False`
+                (pubtables_like) Treats the dataset as PubTables-like. Default: `False`
+
+        Returns:
+            Dataflow
         """

         split = str(kwargs.get("split", "val"))
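A sketch of cropping tables from Fintabnet, assuming the files are in the cache; the argument names follow the kwargs documented above:

```python
from deepdoctection.datasets import get_dataset

fintabnet = get_dataset("fintabnet")
df = fintabnet.dataflow.build(
    split="val",
    build_mode="table",     # crop each table to its bounding box
    rows_and_cols=True,     # add synthetic row/column "item" annotations
    max_datapoints=5,
    load_image=True,
)
df.reset_state()
for dp in df:
    print(dp.file_name)
```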
deepdoctection/datasets/instances/funsd.py

@@ -54,8 +54,10 @@ def load_file(path_ann: PathLikeOrStr) -> FunsdDict:
     """
     Loading json file

-    :param path_ann: path
-    :return: dict
+    Args:
+        path_ann: path
+    Returns:
+        dict
     """
     anns = load_json(path_ann)
     path, file_name = os.path.split(path_ann)

@@ -144,11 +146,14 @@ class FunsdBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. "train" and "test" is available
-        `load_image:` Will load the image for each datapoint. Default: `False`
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train` or `test`. Default: `test`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`

-        :return: Dataflow
+        Returns:
+            Dataflow
         """

         split = str(kwargs.get("split", "test"))
deepdoctection/datasets/instances/iiitar13k.py

@@ -124,15 +124,15 @@ class IIITar13KBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the return
         values of the dataflow:

-        `split:` Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`
+
+        Returns:
+            Dataflow
         """

         if not lxml_available():
deepdoctection/datasets/instances/layouttest.py

@@ -66,7 +66,7 @@ _INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutTy
 @dataset_registry.register("testlayout")
 class LayoutTest(_BuiltInDataset):
     """
-    LayoutTest
+    `LayoutTest`
     """

     _name = _NAME

@@ -99,15 +99,15 @@ class LayoutTestBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Only "test" is for this small sample available
+        Args:
+            kwargs:
+                (split) Split of the dataset. Only `test` is available for this small sample. Default: `test`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`

-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: Dataflow
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "test"))
         max_datapoints = kwargs.get("max_datapoints")
deepdoctection/datasets/instances/publaynet.py

@@ -73,7 +73,7 @@ _INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutTy
 @dataset_registry.register("publaynet")
 class Publaynet(_BuiltInDataset):
     """
-    Publaynet
+    `Publaynet`
     """

     _name = _NAME

@@ -107,15 +107,15 @@ class PublaynetBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`

-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "val"))
         max_datapoints = kwargs.get("max_datapoints")
deepdoctection/datasets/instances/pubtables1m.py

@@ -119,23 +119,23 @@ class Pubtables1MDet(_BuiltInDataset):

 class Pubtables1MBuilder(DataFlowBaseBuilder):
     """
-    Pubtables1M dataflow builder
+    `Pubtables1M` dataflow builder
     """

     def build(self, **kwargs: Union[str, int]) -> DataFlow:
         """
-        Returns a dataflow from which you can stream datapoints of images. The following arguments affect the return
-        values of the dataflow:
-
-        `split:` Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
+        of the dataflow:
+
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`
+
+        Returns:
+            Dataflow
         """

         if not lxml_available():
deepdoctection/datasets/instances/pubtabnet.py

@@ -109,7 +109,7 @@ _SUB_CATEGORIES = {
 @dataset_registry.register("pubtabnet")
 class Pubtabnet(_BuiltInDataset):
     """
-    Pubtabnet
+    `Pubtabnet`
     """

     _name = _NAME

@@ -143,20 +143,18 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `rows_and_cols:` Will add a 'item' image annotations that either represent a row or a column of a table.
-            Note, that the type of the item (i.e. being a row or a column) can be inferred from the
-            sub category added. Note further, that 'ITEM' are not originally part of the annotations
-            and are inferred from cell positions and their associated table semantic. Default: `True`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (rows_and_cols) Will add 'item' image annotations that represent rows or columns of a
+                    table. Default: `True`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`
+                (dd_pipe_like) If `True`, sets `load_image` to `True`. Default: `False`
+
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "val"))
         if split == "val":
deepdoctection/datasets/instances/rvlcdip.py

@@ -123,16 +123,16 @@ class RvlcdipBuilder(DataFlowBaseBuilder):

     def build(self, **kwargs: Union[str, int]) -> DataFlow:
         """
-        Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
-        of the dataflow:
+        Returns a dataflow from which you can stream datapoints of images.

-        `split:` Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+        Args:
+            kwargs:
+                split (str): Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                max_datapoints (int): Will stop iterating after max_datapoints. Default: `None`
+                load_image (bool): Will load the image for each datapoint. Default: `False`

-        max_datapoints: Will stop iterating after max_datapoints. Default: `None`
-
-        load_image: Will load the image for each datapoint. Default: `False`
-
-        :return: dataflow
+        Returns:
+            Dataflow
         """

         split = str(kwargs.get("split", "val"))
deepdoctection/datasets/instances/xfund.py

@@ -131,15 +131,17 @@ class XfundBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. `train` and `val` is available
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `languages:` Will select only samples of selected languages. Available languages: `de`, `es`, `fr`, `it`, `ja` ,
-        `pt`, `zh`. If default will take any language.
-        :return: Dataflow
+        Args:
+            kwargs:
+                (split) Split of the dataset. `train` and `val` are available. Default: `val`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (languages) Will select only samples of selected languages. Available languages:
+                    `de`, `es`, `fr`, `it`, `ja`, `pt`, `zh`. If `None`, all
+                    languages are taken. Default: `None`
+
+        Returns:
+            Dataflow
         """

         split = str(kwargs.get("split", "val"))
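A sketch of restricting XFUND to a single language, assuming the files are in the cache; the `languages` value follows the codes listed in the docstring above:

```python
from deepdoctection.datasets import get_dataset

xfund = get_dataset("xfund")
# "de" is one of the documented language codes; a list of codes may also be accepted.
df = xfund.dataflow.build(split="val", load_image=True, languages="de")
df.reset_state()
for dp in df:
    print(dp.file_name)
```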