deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.0.dist-info/METADATA +0 -431
- deepdoctection-0.42.0.dist-info/RECORD +0 -148
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/dataflow/custom_serialize.py

@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-
+Classes to load data and produce dataflows
 """

 from __future__ import annotations

@@ -63,13 +63,14 @@ class FileClosingIterator:
     reading the data from it. It is used in the context of reading data from a file
     in a streaming manner, where the data is not loaded into memory all at once.

-
-
+    Example:
+        ```python
         file = open(path, "r")
         iterator = Reader(file)
         closing_iterator = FileClosingIterator(file, iter(iterator))

-    df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)
+        df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)
+        ```

     """

@@ -77,8 +78,9 @@ class FileClosingIterator:
         """
         Initializes the FileClosingIterator with a file object and its iterator.

-        :
-
+        Args:
+            file_obj: The file object to read data from.
+            iterator: The actual iterator of the file object.
         """
         self.file_obj = file_obj
         self.iterator = iterator

@@ -87,7 +89,8 @@ class FileClosingIterator:
         """
         Returns the iterator object itself.

-        :
+        Returns:
+            FileClosingIterator: The instance of the class itself.
         """
         return self

@@ -96,7 +99,8 @@ class FileClosingIterator:
         Returns the next item from the file object's iterator.
         Closes the file object if the iteration is finished.

-        :
+        Returns:
+            The next item from the file object's iterator.

         Raises:
             StopIteration: If there are no more items to return.
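The new `Example` block above shows the intended wiring. As a self-contained illustration of the same close-on-exhaustion pattern, here is a minimal sketch; the class below is a hypothetical stand-in written for this note, not the deepdoctection implementation, and `some_file.txt` is a placeholder path:

```python
from typing import IO, Any, Iterator


class ClosingIterator:
    """Hypothetical stand-in mirroring the documented FileClosingIterator behavior."""

    def __init__(self, file_obj: IO[Any], iterator: Iterator[Any]) -> None:
        self.file_obj = file_obj
        self.iterator = iterator

    def __iter__(self) -> "ClosingIterator":
        return self

    def __next__(self) -> Any:
        try:
            return next(self.iterator)
        except StopIteration:
            # close the underlying file once the stream is exhausted
            self.file_obj.close()
            raise


file_obj = open("some_file.txt", "r", encoding="utf-8")
lines = ClosingIterator(file_obj, iter(file_obj))
for line in lines:
    pass  # file_obj is closed automatically after the last line
```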
@@ -110,23 +114,27 @@ class FileClosingIterator:

 class SerializerJsonlines:
     """
-    Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of JSON objects to a
+    Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of `JSON` objects to a `.jsonl` file.

-
+    Example:
+        ```python
+        df = SerializerJsonlines.load("path/to/file.jsonl")
+        df.reset_state()

-
-
-
-    ... # is a dict
+        for dp in df:
+            ... # is a dict
+        ```
     """

     @staticmethod
     def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromIterable:
         """
-        :
-
+        Args:
+            path: a path to a .jsonl file.
+            max_datapoints: Will stop the iteration once max_datapoints have been streamed

-        :
+        Returns:
+            Dataflow to iterate from
         """
         file = open(path, "r") # pylint: disable=W1514,R1732
         iterator = Reader(file)

@@ -136,14 +144,15 @@ class SerializerJsonlines:
     @staticmethod
     def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
         """
-        Writes a dataflow iteratively to a
-        As the length of the dataflow cannot be determined in every case max_datapoint prevents
-        unexpectedly large file
+        Writes a dataflow iteratively to a `.jsonl` file. Every datapoint must be a dict where all items are
+        serializable. As the length of the dataflow cannot be determined in every case max_datapoint prevents
+        generating an unexpectedly large file

-        :
-
-
-
+        Args:
+            df: The dataflow to write from.
+            path: The path, the .jsonl file to write to.
+            file_name: name of the target file.
+            max_datapoints: maximum number of datapoint to consider writing to a file.
         """

         if not os.path.isdir(path):
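Read together, `load` streams dicts from a `.jsonl` file and `save(df, path, file_name, max_datapoints)` writes a dataflow back out. A minimal sketch of the loading side, assuming the serializer is re-exported from `deepdoctection.dataflow` and that the file path exists:

```python
from deepdoctection.dataflow import SerializerJsonlines

df = SerializerJsonlines.load("path/to/file.jsonl", max_datapoints=100)
df.reset_state()  # dataflows must be reset before the first iteration
for dp in df:
    print(dp)  # each datapoint is a plain dict parsed from one JSON line
```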
@@ -166,27 +175,30 @@ class SerializerJsonlines:
 class SerializerTabsepFiles:
     """
     Serialize a dataflow from a tab separated text file. Alternatively, save a dataflow of plain text
-    to a
+    to a `.txt` file.

-
-
-
+    Example:
+        ```python
+        df = SerializerTabsepFiles.load("path/to/file.txt")

         will yield each text line of the file.
+        ```
     """

     @staticmethod
-    def load(path: PathLikeOrStr,
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromList:
         """
-        :
-
+        Args:
+            path: a path to a .txt file.
+            max_datapoints: Will stop the iteration once max_datapoints have been streamed

-        :
+        Returns:
+            Dataflow to iterate from
         """

         with open(path, "r", encoding="UTF-8") as file:
             file_list = file.readlines()
-            return CustomDataFromList(file_list, max_datapoints=
+            return CustomDataFromList(file_list, max_datapoints=max_datapoints)

     @staticmethod
     def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:

@@ -195,10 +207,11 @@ class SerializerTabsepFiles:
         As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
         unexpectedly large file

-        :
-
-
-
+        Args:
+            df: The dataflow to write from.
+            path: The path, the .txt file to write to.
+            file_name: Name of the target file.
+            max_datapoints: Maximum number of datapoint to consider writing to a file.
         """

         if not os.path.isdir(path):
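The same pattern applies to the plain-text serializer, using the widened `load` signature from the hunk above (import path assumed as before):

```python
from deepdoctection.dataflow import SerializerTabsepFiles

df = SerializerTabsepFiles.load("path/to/file.txt", max_datapoints=50)
df.reset_state()
for line in df:
    # each datapoint is one raw line of the file; readlines() keeps the trailing newline
    print(line.rstrip("\n"))
```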
@@ -220,6 +233,13 @@ class SerializerFiles:
     """
     Serialize files from a directory and all subdirectories. Only one file type can be serialized. Once specified, all
     other types will be filtered out.
+
+    Example:
+        ```python
+        df = SerializerFiles.load("path/to/dir",file_type=".pdf")
+
+        will yield absolute paths to all `.pdf` files in the directory and all subdirectories.
+        ```
     """

     @staticmethod

@@ -234,12 +254,15 @@ class SerializerFiles:
         Generates a dataflow where a datapoint consists of a string of names of files with respect to some file type.
         If you want to load the files you need to do this in a following step by yourself.

-        :
-
-
-
-
-
+        Args:
+            path: A path to some base directory. Will inspect all subdirectories, as well
+            file_type: A file type (suffix) to look out for (single str or list of stings)
+            max_datapoints: Stop iteration after passing max_datapoints
+            shuffle: Shuffle the files, so that the order of appearance in dataflow is random.
+            sort: If set to `True` it will sort all selected files by its string
+
+        Returns:
+            Dataflow to iterate from
         """
         df: DataFlow
         df1: DataFlow
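Following the docstring example above, a sketch of walking a directory tree for one file type (import path assumed; the keyword arguments follow the new Args list):

```python
from deepdoctection.dataflow import SerializerFiles

# yields absolute paths to every .pdf below the base directory and its subdirectories
df = SerializerFiles.load("path/to/dir", file_type=".pdf")
df.reset_state()
for file_path in df:
    print(file_path)  # loading the file content itself is left to a later step
```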
@@ -286,10 +309,11 @@ class SerializerFiles:

 class CocoParser:
     """
-    A simplified version of the
+    A simplified version of the COCO helper class for reading annotations. It currently supports only
     bounding box annotations

-    :
+    Args:
+        annotation_file: Location of annotation file
     """

     def __init__(self, annotation_file: Optional[PathLikeOrStr] = None) -> None:

@@ -358,14 +382,16 @@ class CocoParser:
         is_crowd: Optional[bool] = None,
     ) -> Sequence[int]:
         """
-        Get
+        Get annotation ids that satisfy given filter conditions. default skips that filter

-        :
-
-
-
+        Args:
+            img_ids: get anns for given imgs
+            cat_ids: get anns for given cats
+            area_range: get anns for given area range (e.g. [0 inf])
+            is_crowd: get anns for given crowd label (False or True)

-        :
+        Returns:
+            ids: integer array of ann ids
         """

         if img_ids is None:

@@ -403,13 +429,15 @@ class CocoParser:
         category_ids: Optional[Union[int, Sequence[int]]] = None,
     ) -> Sequence[int]:
         """
-        Filtering parameters.
+        Filtering parameters. Default does not filter anything.

-        :
-
-
+        Args:
+            category_names: get cats for given cat names
+            super_category_names: get cats for given super category names
+            category_ids: get cats for given cat ids

-        :
+        Returns:
+            ids: integer array of cat ids
         """

         if category_names is None:

@@ -441,12 +469,14 @@ class CocoParser:
         self, img_ids: Optional[Union[int, Sequence[int]]] = None, cat_ids: Optional[Union[int, Sequence[int]]] = None
     ) -> Sequence[int]:
         """
-        Get
+        Get image ids that satisfy given filter conditions.

-        :
-
+        Args:
+            img_ids: get imgs for given ids
+            cat_ids: get imgs with all given cats

-        :
+        Returns:
+            ids: integer array of img ids
         """

         if img_ids is None:

@@ -472,9 +502,11 @@ class CocoParser:
         """
         Load anns with the specified ids.

-        :
+        Args:
+            ids: integer ids specifying anns

-        :
+        Returns:
+            anns: loaded ann objects
         """
         if ids is None:
             ids = []

@@ -486,9 +518,11 @@ class CocoParser:
         """
         Load cats with the specified ids.

-        :
+        Args:
+            ids: integer ids specifying cats

-        :
+        Returns:
+            cats: loaded cat objects
         """
         if ids is None:
             ids = []

@@ -500,9 +534,11 @@ class CocoParser:
         """
         Load anns with the specified ids.

-        :
+        Args:
+            ids: integer ids specifying img

-        :
+        Returns:
+            imgs: loaded img objects
         """
         if ids is None:
             ids = []
@@ -513,31 +549,34 @@ class CocoParser:

 class SerializerCoco:
     """
-    Class for serializing annotation files in
+    Class for serializing annotation files in COCO format. COCO comes in `JSON` format which is a priori not
     serialized. This class implements only the very basic methods to generate a dataflow. It wraps the coco class
-    from pycocotools and assembles annotations that belong to the image.
-
+    from `pycocotools` and assembles annotations that belong to the image.
+
+    Note:
+        Conversion into the core `Image` has to be done by yourself.
+
+    Example:
+        ```python
+        df = SerializerCoco.load("path/to/annotations.json")
+        df.reset_state()
+        for dp in df:
+            # {'image':{'id',...},'annotations':[{'id':…,'bbox':...}]}
+        ```
+
     """

     @staticmethod
     def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
-        Loads a
-
-        **Example:**
-
-            {'images':[img1,img2,...], 'annotations':[ann1,ann2,...],...}
-
-        it will generate a dataflow with datapoints
+        Loads a `.json` file and generates a dataflow.

+        Args:
+            max_datapoints: Will stop the iteration once max_datapoints have been streamed.
+            path: a path to a .json file.

-
-
-        for each image id. We use the type hint CocoDatapointDict to describe this dictionary
-
-        :param max_datapoints: Will stop the iteration once max_datapoints have been streamed.
-        :param path: a path to a .json file.
-        :return: dataflow to iterate from
+        Returns:
+            dataflow to iterate from
         """
         if not os.path.isfile(path):
             raise FileNotFoundError(path)
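The rewritten docstring pins down the datapoint layout, so a loading sketch looks like this (import path assumed; converting the raw dicts into the core `Image` type remains the caller's job, as the new Note states):

```python
from deepdoctection.dataflow import SerializerCoco

df = SerializerCoco.load("path/to/annotations.json", max_datapoints=10)
df.reset_state()
for dp in df:
    # one image record plus the annotations assembled for it:
    # {'image': {'id', ...}, 'annotations': [{'id': ..., 'bbox': ...}, ...]}
    print(dp["image"]["id"], len(dp["annotations"]))
```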
@@ -570,13 +609,15 @@ class SerializerPdfDoc:
     """
     Serialize a pdf document with an arbitrary number of pages.

-
-
-
+    Example:
+        ```python
+        df = SerializerPdfDoc.load("path/to/document.pdf")

         will yield datapoints:

-
+        {"path": "path/to/document.pdf", "file_name" document_page_1.pdf, "pdf_bytes": b"some-bytes"}
+        ```
+
     """

     @staticmethod

@@ -584,10 +625,13 @@ class SerializerPdfDoc:
         """
         Loads the document page wise and returns a dataflow accordingly.

-        :
-
-
-
+        Args:
+            path: Path to the pdf document.
+            max_datapoints: The maximum number of pages to stream.
+
+        Returns:
+            A dict with structure `{"path":... ,"file_name": ..., "pdf_bytes": ...}`. The file name is a
+            concatenation of the physical file name and the current page number.
         """

         file_name = os.path.split(path)[1]
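A sketch of page-wise streaming with the documented return structure (import path assumed):

```python
from deepdoctection.dataflow import SerializerPdfDoc

df = SerializerPdfDoc.load("path/to/document.pdf", max_datapoints=3)
df.reset_state()
for dp in df:
    # {"path": ..., "file_name": ..., "pdf_bytes": ...}; file_name appends the page number
    print(dp["file_name"], len(dp["pdf_bytes"]))
```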
deepdoctection/dataflow/parallel_map.py

@@ -9,7 +9,6 @@ Replaces relevant parts of the Dataflow package. Most of the functions have been

 <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/parallel.py>
 <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/parallel_map.py>
-
 """

 import atexit
@@ -163,18 +162,19 @@ class MultiThreadMapData(_ParallelMapData):
     The semantics of this class is **identical** to `MapData` except for the ordering.
     Threads run in parallel and can take different time to run the
     mapping function. Therefore, the order of datapoints won't be preserved.
-    When
-    is guaranteed to produce the exact set of data as
-    if both are iterated until
-    The behavior of strict mode is undefined if the given dataflow
-    When
-    is a
-    In other words, first pass of
-    datapoints from the second pass of
+    When `strict=True`, `MultiThreadMapData(df, ...)`
+    is guaranteed to produce the exact set of data as `MapData(df, ...)`,
+    if both are iterated until `StopIteration`. But the produced data will have different ordering.
+    The behavior of strict mode is undefined if the given dataflow `df` is infinite.
+    When `strict=False`, the data that's produced by `MultiThreadMapData(df, ...)`
+    is a re-ordering of the data produced by `RepeatedData(MapData(df, ...), -1)`.
+    In other words, first pass of `MultiThreadMapData.__iter__` may contain
+    datapoints from the second pass of `df.__iter__`.
+
     Note:
         1. You should avoid starting many threads in your main process to reduce GIL contention.
           The threads will only start in the process which calls `reset_state()`.
-          Therefore you can use
+          Therefore you can use `MultiProcessRunnerZMQ(MultiThreadMapData(...), 1)`
           to reduce GIL contention.
     """

@@ -215,12 +215,13 @@ class MultiThreadMapData(_ParallelMapData):
         strict: bool = False,
     ):
         """
-        :
-
-
-
-
-
+        Args:
+            df: the dataflow to map
+            num_thread: number of threads to use
+            map_func: datapoint -> datapoint | None. Return None to
+                discard/skip the datapoint.
+            buffer_size: number of datapoints in the buffer
+            strict: use "strict mode", see notes above.
         """
         if strict:
             # In strict mode, buffer size cannot be larger than the total number of datapoints
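Put together with the constructor Args above, a threaded mapping sketch might look as follows (imports assumed to be re-exported from `deepdoctection.dataflow`; `strict=True` so that a single pass yields exactly one mapped epoch, per the class docstring):

```python
from deepdoctection.dataflow import DataFromList, MultiThreadMapData

def to_upper(dp: str) -> str:
    return dp.upper()

df = DataFromList(["a", "b", "c", "d"], shuffle=False)
df = MultiThreadMapData(df, num_thread=2, map_func=to_upper, buffer_size=4, strict=True)
df.reset_state()  # threads only start in the process that calls reset_state()
print(sorted(df))  # sorted, because thread scheduling does not preserve order
```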
@@ -290,7 +291,7 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):

     def reset_state(self) -> Any:
         """
-        All forked dataflows should only be reset
+        All forked dataflows should only be reset once and only once in spawned processes.
        Subclasses should call this method with super.
        """
         assert not self._reset_done, "reset_state() was called twice! This violates the API of DataFlow!"

@@ -338,17 +339,17 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
     """
     Same as `MapData`, but start processes to run the mapping function,
     and communicate with ZeroMQ pipe.
-    The semantics of this class is
+    The semantics of this class is identical to `MapData` except for the ordering.
     Processes run in parallel and can take different time to run the
     mapping function. Therefore, the order of datapoints won't be preserved.
-    When
-    is guaranteed to produce the exact set of data as
-    if both are iterated until
-    The behavior of strict mode is undefined if the given dataflow
-    When
-    is a reordering of the data produced by
-    In other words, first pass of
-    datapoints from the second pass of
+    When `strict=True`, `MultiProcessMapData(df, ...)`
+    is guaranteed to produce the exact set of data as `MapData(df, ...)`,
+    if both are iterated until `StopIteration`. But the produced data will have different ordering.
+    The behavior of strict mode is undefined if the given dataflow `df` is infinite.
+    When `strict=False`, the data that's produced by `MultiProcessMapData(df, ...)`
+    is a reordering of the data produced by `RepeatedData(MapData(df, ...), -1)`.
+    In other words, first pass of `MultiProcessMapData.__iter__` may contain
+    datapoints from the second pass of `df.__iter__`.
     """

     class _Worker(mp.Process):
@@ -384,11 +385,12 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
         strict: bool = False,
     ) -> None:
         """
-        :
-
-
-
-
+        Args:
+            df: the dataflow to map
+            num_proc: number of threads to use
+            map_func: datapoint -> datapoint | None. Return None to
+            buffer_size: number of datapoints in the buffer
+            strict: use "strict mode", see notes above.
         """
         if strict:
             # In strict mode, buffer size cannot be larger than the total number of datapoints
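The process-based variant is driven the same way, with `num_proc` in place of `num_thread` (same import assumption; a module-level `map_func` and an entry-point guard keep the sketch safe for spawned worker processes):

```python
from deepdoctection.dataflow import DataFromList, MultiProcessMapData

def square(dp: int) -> int:
    return dp * dp

if __name__ == "__main__":
    df = DataFromList(list(range(8)), shuffle=False)
    df = MultiProcessMapData(df, num_proc=2, map_func=square, buffer_size=8, strict=True)
    df.reset_state()  # worker processes and the ZeroMQ pipe are set up here
    print(sorted(df))
```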
deepdoctection/dataflow/serialize.py

@@ -25,8 +25,9 @@ class DataFromList(RNGDataFlow):

     def __init__(self, lst: list[Any], shuffle: bool = True) -> None:
         """
-        :
-
+        Args:
+            lst: input list. Each element is a datapoint.
+            shuffle: shuffle data.
         """
         super().__init__()
         self.lst = lst

@@ -53,7 +54,8 @@ class DataFromIterable(DataFlow):

     def __init__(self, iterable: Iterable[Any]) -> None:
         """
-        :
+        Args:
+            iterable: an iterable object
         """
         self._itr = iterable
         self._len: Optional[int] = None
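For completeness, the two small source dataflows documented above (import path assumed):

```python
from deepdoctection.dataflow import DataFromIterable, DataFromList

df = DataFromList([{"id": 1}, {"id": 2}], shuffle=False)
df.reset_state()
print(list(df))  # [{'id': 1}, {'id': 2}]

df = DataFromIterable(range(3))  # any iterable; its length may be unknown
df.reset_state()
print(list(df))  # [0, 1, 2]
```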
@@ -86,12 +88,13 @@ class FakeData(RNGDataFlow):
         domain: tuple[Union[float, int], Union[float, int]] = (0, 1),
     ):
         """
-        :
-
-
+        Args:
+            shapes: a list of lists/tuples. Shapes of each component.
+            size: size of this DataFlow.
+            random: whether to randomly generate data every iteration.
                 Note that merely generating the data could sometimes be time-consuming!
-
-
+            dtype: data type as string, or a list of data types.
+            domain: (min, max) tuple, or a list of such tuples
         """

         super().__init__()
@@ -136,13 +139,15 @@ class PickleSerializer:
     @staticmethod
     def dumps(obj: Any) -> bytes:
         """
-        :
+        Args:
+            obj: bytes
         """
         return pickle.dumps(obj, protocol=-1)

     @staticmethod
     def loads(buf: Any) -> Any:
         """
-        :
+        Args:
+            buf: bytes
         """
         return pickle.loads(buf)
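Finally, the `PickleSerializer` helpers shown above are thin wrappers around `pickle`; a round trip looks like this (module path assumed):

```python
from deepdoctection.dataflow.serialize import PickleSerializer

payload = {"category": "table", "bbox": [10, 20, 100, 200]}
buf = PickleSerializer.dumps(payload)   # bytes, pickled with protocol=-1
restored = PickleSerializer.loads(buf)
assert restored == payload
```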