deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (124)
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +919 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +162 -108
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +205 -119
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +26 -17
  104. deepdoctection/utils/env_info.py +86 -37
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -71
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.1.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.1.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
@@ -16,7 +16,7 @@
  # limitations under the License.

  """
- Methods that convert incoming data to dataflows.
+ Classes to load data and produce dataflows
  """

  from __future__ import annotations
@@ -63,13 +63,14 @@ class FileClosingIterator:
  reading the data from it. It is used in the context of reading data from a file
  in a streaming manner, where the data is not loaded into memory all at once.

- **Example:**
-
+ Example:
+ ```python
  file = open(path, "r")
  iterator = Reader(file)
  closing_iterator = FileClosingIterator(file, iter(iterator))

- df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints) # set up a dataflow
+ df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)
+ ```

  """

@@ -77,8 +78,9 @@ class FileClosingIterator:
  """
  Initializes the FileClosingIterator with a file object and its iterator.

- :param file_obj (TextIO): The file object to read data from.
- :param iterator (Iterator): The actual iterator of the file object.
+ Args:
+ file_obj: The file object to read data from.
+ iterator: The actual iterator of the file object.
  """
  self.file_obj = file_obj
  self.iterator = iterator
@@ -87,7 +89,8 @@ class FileClosingIterator:
  """
  Returns the iterator object itself.

- :return: FileClosingIterator: The instance of the class itself.
+ Returns:
+ FileClosingIterator: The instance of the class itself.
  """
  return self

@@ -96,7 +99,8 @@ class FileClosingIterator:
  Returns the next item from the file object's iterator.
  Closes the file object if the iteration is finished.

- :return: The next item from the file object's iterator.
+ Returns:
+ The next item from the file object's iterator.

  Raises:
  StopIteration: If there are no more items to return.
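The hunks above only reformat the `FileClosingIterator` docstrings, but the behaviour they describe is worth spelling out: the wrapper forwards `__next__` to the underlying iterator and closes the file object once the stream is exhausted. A minimal, self-contained sketch of that pattern (an illustration, not the library's actual implementation):

```python
import io
from typing import IO, Any, Iterator


class ClosingIterator:
    """Illustrative stand-in for FileClosingIterator: yields from `iterator`
    and closes `file_obj` as soon as the iteration raises StopIteration."""

    def __init__(self, file_obj: IO[str], iterator: Iterator[Any]) -> None:
        self.file_obj = file_obj
        self.iterator = iterator

    def __iter__(self) -> "ClosingIterator":
        return self

    def __next__(self) -> Any:
        try:
            return next(self.iterator)
        except StopIteration:
            # Close the handle exactly when the stream is exhausted, so callers
            # can consume the file lazily without leaking file handles.
            if not self.file_obj.closed:
                self.file_obj.close()
            raise


file = io.StringIO('{"a": 1}\n{"a": 2}\n')  # stands in for open(path, "r")
for line in ClosingIterator(file, iter(file)):
    print(line.strip())
assert file.closed
```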
@@ -110,23 +114,27 @@ class FileClosingIterator:

  class SerializerJsonlines:
  """
- Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of JSON objects to a .jsonl file.
+ Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of `JSON` objects to a `.jsonl` file.

- **Example:**
+ Example:
+ ```python
+ df = SerializerJsonlines.load("path/to/file.jsonl")
+ df.reset_state()

- df = SerializerJsonlines.load("path/to/file.jsonl")
- df.reset_state()
- for dp in df:
- ... # is a dict
+ for dp in df:
+ ... # is a dict
+ ```
  """

  @staticmethod
  def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromIterable:
  """
- :param path: a path to a .jsonl file.
- :param max_datapoints: Will stop the iteration once max_datapoints have been streamed
+ Args:
+ path: a path to a .jsonl file.
+ max_datapoints: Will stop the iteration once max_datapoints have been streamed

- :return: dataflow to iterate from
+ Returns:
+ Dataflow to iterate from
  """
  file = open(path, "r") # pylint: disable=W1514,R1732
  iterator = Reader(file)
@@ -136,14 +144,15 @@ class SerializerJsonlines:
  @staticmethod
  def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
  """
- Writes a dataflow iteratively to a .jsonl file. Every datapoint must be a dict where all items are serializable.
- As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
- unexpectedly large file
+ Writes a dataflow iteratively to a `.jsonl` file. Every datapoint must be a dict where all items are
+ serializable. As the length of the dataflow cannot be determined in every case max_datapoint prevents
+ generating an unexpectedly large file

- :param df: The dataflow to write from.
- :param path: The path, the .jsonl file to write to.
- :param file_name: name of the target file.
- :param max_datapoints: maximum number of datapoint to consider writing to a file.
+ Args:
+ df: The dataflow to write from.
+ path: The path, the .jsonl file to write to.
+ file_name: name of the target file.
+ max_datapoints: maximum number of datapoint to consider writing to a file.
  """

  if not os.path.isdir(path):
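Putting the `load` and `save` signatures above together gives a simple round trip. A sketch under the assumption that the class lives in `deepdoctection.dataflow.custom_serialize` (as the file list suggests), with placeholder paths:

```python
from deepdoctection.dataflow.custom_serialize import SerializerJsonlines

# Stream at most 1000 dicts from a .jsonl file.
df = SerializerJsonlines.load("path/to/file.jsonl", max_datapoints=1000)
df.reset_state()
for dp in df:
    print(dp)  # each datapoint is a dict

# Write a dataflow back out; `path` must be an existing directory and
# max_datapoints caps the output when the dataflow length is unknown.
df = SerializerJsonlines.load("path/to/file.jsonl")
SerializerJsonlines.save(df, "path/to/out_dir", "copy.jsonl", max_datapoints=1000)
```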
@@ -166,27 +175,30 @@ class SerializerJsonlines:
  class SerializerTabsepFiles:
  """
  Serialize a dataflow from a tab separated text file. Alternatively, save a dataflow of plain text
- to a .txt file.
+ to a `.txt` file.

- **Example**:
-
- df = SerializerTabsepFiles.load("path/to/file.txt")
+ Example:
+ ```python
+ df = SerializerTabsepFiles.load("path/to/file.txt")

  will yield each text line of the file.
+ ```
  """

  @staticmethod
- def load(path: PathLikeOrStr, max_datapoins: Optional[int] = None) -> CustomDataFromList:
+ def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromList:
  """
- :param path: a path to a .txt file.
- :param max_datapoins: Will stop the iteration once max_datapoints have been streamed
+ Args:
+ path: a path to a .txt file.
+ max_datapoints: Will stop the iteration once max_datapoints have been streamed

- :return: dataflow to iterate from
+ Returns:
+ Dataflow to iterate from
  """

  with open(path, "r", encoding="UTF-8") as file:
  file_list = file.readlines()
- return CustomDataFromList(file_list, max_datapoints=max_datapoins)
+ return CustomDataFromList(file_list, max_datapoints=max_datapoints)

  @staticmethod
  def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
@@ -195,10 +207,11 @@ class SerializerTabsepFiles:
  As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
  unexpectedly large file

- :param df: The dataflow to write from.
- :param path: The path, the .txt file to write to.
- :param file_name: name of the target file.
- :param max_datapoints: maximum number of datapoint to consider writing to a file.
+ Args:
+ df: The dataflow to write from.
+ path: The path, the .txt file to write to.
+ file_name: Name of the target file.
+ max_datapoints: Maximum number of datapoint to consider writing to a file.
  """

  if not os.path.isdir(path):
@@ -220,6 +233,13 @@ class SerializerFiles:
  """
  Serialize files from a directory and all subdirectories. Only one file type can be serialized. Once specified, all
  other types will be filtered out.
+
+ Example:
+ ```python
+ df = SerializerFiles.load("path/to/dir",file_type=".pdf")
+
+ will yield absolute paths to all `.pdf` files in the directory and all subdirectories.
+ ```
  """

  @staticmethod
@@ -234,12 +254,15 @@ class SerializerFiles:
  Generates a dataflow where a datapoint consists of a string of names of files with respect to some file type.
  If you want to load the files you need to do this in a following step by yourself.

- :param path: A path to some base directory. Will inspect all subdirectories, as well
- :param file_type: A file type (suffix) to look out for (single str or list of stings)
- :param max_datapoints: Stop iteration after passing max_datapoints
- :param shuffle: Shuffle the files, so that the order of appearance in dataflow is random.
- :param sort: If set to "True" it will sort all selected files by its string
- :return: dataflow to iterate from
+ Args:
+ path: A path to some base directory. Will inspect all subdirectories, as well
+ file_type: A file type (suffix) to look out for (single str or list of stings)
+ max_datapoints: Stop iteration after passing max_datapoints
+ shuffle: Shuffle the files, so that the order of appearance in dataflow is random.
+ sort: If set to `True` it will sort all selected files by its string
+
+ Returns:
+ Dataflow to iterate from
  """
  df: DataFlow
  df1: DataFlow
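Based on the `Args` shown above, a hedged sketch of collecting every PDF path below a directory (same import-path assumption as before, placeholder directory):

```python
from deepdoctection.dataflow.custom_serialize import SerializerFiles

# Yields path strings for all .pdf files in the directory and its subdirectories,
# sorted by name instead of shuffled.
df = SerializerFiles.load("path/to/dir", file_type=".pdf", shuffle=False, sort=True)
df.reset_state()
pdf_paths = list(df)
```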
@@ -286,10 +309,11 @@ class SerializerFiles:

  class CocoParser:
  """
- A simplified version of the Microsoft COCO helper class for reading annotations. It currently supports only
+ A simplified version of the COCO helper class for reading annotations. It currently supports only
  bounding box annotations

- :param annotation_file: location of annotation file
+ Args:
+ annotation_file: Location of annotation file
  """

  def __init__(self, annotation_file: Optional[PathLikeOrStr] = None) -> None:
@@ -358,14 +382,16 @@ class CocoParser:
  is_crowd: Optional[bool] = None,
  ) -> Sequence[int]:
  """
- Get ann ids that satisfy given filter conditions. default skips that filter
+ Get annotation ids that satisfy given filter conditions. default skips that filter

- :param img_ids: get anns for given imgs
- :param cat_ids: get anns for given cats
- :param area_range: get anns for given area range (e.g. [0 inf])
- :param is_crowd: get anns for given crowd label (False or True)
+ Args:
+ img_ids: get anns for given imgs
+ cat_ids: get anns for given cats
+ area_range: get anns for given area range (e.g. [0 inf])
+ is_crowd: get anns for given crowd label (False or True)

- :return: ids: integer array of ann ids
+ Returns:
+ ids: integer array of ann ids
  """

  if img_ids is None:
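The `CocoParser` accessors mirror the `pycocotools` `COCO` API. A sketch of chaining them to pull bounding boxes for a couple of categories (placeholder file and category ids; import path assumed as above):

```python
from deepdoctection.dataflow.custom_serialize import CocoParser

parser = CocoParser("path/to/annotations.json")

# Images that contain all the requested categories ...
img_ids = parser.get_img_ids(cat_ids=[1, 2])

# ... and their box annotations, excluding crowd regions.
ann_ids = parser.get_ann_ids(img_ids=img_ids, cat_ids=[1, 2], is_crowd=False)
for ann in parser.load_anns(ann_ids):
    print(ann["bbox"])
```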
@@ -403,13 +429,15 @@ class CocoParser:
  category_ids: Optional[Union[int, Sequence[int]]] = None,
  ) -> Sequence[int]:
  """
- Filtering parameters. default skips that filter.
+ Filtering parameters. Default does not filter anything.

- :param category_names: get cats for given cat names
- :param super_category_names: get cats for given super category names
- :param category_ids: get cats for given cat ids
+ Args:
+ category_names: get cats for given cat names
+ super_category_names: get cats for given super category names
+ category_ids: get cats for given cat ids

- :return: ids: integer array of cat ids
+ Returns:
+ ids: integer array of cat ids
  """

  if category_names is None:
@@ -441,12 +469,14 @@ class CocoParser:
  self, img_ids: Optional[Union[int, Sequence[int]]] = None, cat_ids: Optional[Union[int, Sequence[int]]] = None
  ) -> Sequence[int]:
  """
- Get img ids that satisfy given filter conditions.
+ Get image ids that satisfy given filter conditions.

- :param img_ids: get imgs for given ids
- :param cat_ids: get imgs with all given cats
+ Args:
+ img_ids: get imgs for given ids
+ cat_ids: get imgs with all given cats

- :return: ids: integer array of img ids
+ Returns:
+ ids: integer array of img ids
  """

  if img_ids is None:
@@ -472,9 +502,11 @@ class CocoParser:
  """
  Load anns with the specified ids.

- :param ids: integer ids specifying anns
+ Args:
+ ids: integer ids specifying anns

- :return: anns: loaded ann objects
+ Returns:
+ anns: loaded ann objects
  """
  if ids is None:
  ids = []
@@ -486,9 +518,11 @@ class CocoParser:
  """
  Load cats with the specified ids.

- :param ids: integer ids specifying cats
+ Args:
+ ids: integer ids specifying cats

- :return: cats: loaded cat objects
+ Returns:
+ cats: loaded cat objects
  """
  if ids is None:
  ids = []
@@ -500,9 +534,11 @@ class CocoParser:
  """
  Load anns with the specified ids.

- :param ids: integer ids specifying img
+ Args:
+ ids: integer ids specifying img

- :return: imgs: loaded img objects
+ Returns:
+ imgs: loaded img objects
  """
  if ids is None:
  ids = []
@@ -513,31 +549,34 @@ class CocoParser:

  class SerializerCoco:
  """
- Class for serializing annotation files in Coco format. Coco comes in JSON format which is a priori not
+ Class for serializing annotation files in COCO format. COCO comes in `JSON` format which is a priori not
  serialized. This class implements only the very basic methods to generate a dataflow. It wraps the coco class
- from pycocotools and assembles annotations that belong to the image. Note, that the conversion into the core
- `Image` has to be done by yourself.
+ from `pycocotools` and assembles annotations that belong to the image.
+
+ Note:
+ Conversion into the core `Image` has to be done by yourself.
+
+ Example:
+ ```python
+ df = SerializerCoco.load("path/to/annotations.json")
+ df.reset_state()
+ for dp in df:
+ # {'image':{'id',...},'annotations':[{'id':…,'bbox':...}]}
+ ```
+
  """

  @staticmethod
  def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
  """
- Loads a .json file and generates a dataflow.
-
- **Example:**
-
- {'images':[img1,img2,...], 'annotations':[ann1,ann2,...],...}
-
- it will generate a dataflow with datapoints
+ Loads a `.json` file and generates a dataflow.

+ Args:
+ max_datapoints: Will stop the iteration once max_datapoints have been streamed.
+ path: a path to a .json file.

- {'image':{'id',...},'annotations':[{'id':…,'bbox':...}]}
-
- for each image id. We use the type hint CocoDatapointDict to describe this dictionary
-
- :param max_datapoints: Will stop the iteration once max_datapoints have been streamed.
- :param path: a path to a .json file.
- :return: dataflow to iterate from
+ Returns:
+ dataflow to iterate from
  """
  if not os.path.isfile(path):
  raise FileNotFoundError(path)
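The new docstring example already shows the datapoint layout; a slightly fuller sketch that groups boxes per image (same import-path assumption and placeholder path as the previous snippets):

```python
from deepdoctection.dataflow.custom_serialize import SerializerCoco

df = SerializerCoco.load("path/to/annotations.json", max_datapoints=100)
df.reset_state()
for dp in df:
    # One datapoint per image: the image record plus every annotation pointing to it.
    boxes = [ann["bbox"] for ann in dp["annotations"]]
    print(dp["image"]["id"], len(boxes))
```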
@@ -570,13 +609,15 @@ class SerializerPdfDoc:
  """
  Serialize a pdf document with an arbitrary number of pages.

- **Example:**
-
- df = SerializerPdfDoc.load("path/to/document.pdf")
+ Example:
+ ```python
+ df = SerializerPdfDoc.load("path/to/document.pdf")

  will yield datapoints:

- {"path": "path/to/document.pdf", "file_name" document_page_1.pdf, "pdf_bytes": b"some-bytes"}
+ {"path": "path/to/document.pdf", "file_name" document_page_1.pdf, "pdf_bytes": b"some-bytes"}
+ ```
+
  """

  @staticmethod
@@ -584,10 +625,13 @@ class SerializerPdfDoc:
  """
  Loads the document page wise and returns a dataflow accordingly.

- :param path: Path to the pdf document.
- :param max_datapoints: The maximum number of pages to stream.
- :return: A dict with structure {"path":... ,"file_name": ..., "pdf_bytes": ...}. The file name is a
- concatenation of the physical file name and the current page number.
+ Args:
+ path: Path to the pdf document.
+ max_datapoints: The maximum number of pages to stream.
+
+ Returns:
+ A dict with structure `{"path":... ,"file_name": ..., "pdf_bytes": ...}`. The file name is a
+ concatenation of the physical file name and the current page number.
  """

  file_name = os.path.split(path)[1]
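A short usage sketch for the page-wise PDF serializer described above (placeholder path; import path assumed as in the previous snippets):

```python
from deepdoctection.dataflow.custom_serialize import SerializerPdfDoc

# One datapoint per page: {"path": ..., "file_name": ..., "pdf_bytes": ...}
df = SerializerPdfDoc.load("path/to/document.pdf", max_datapoints=10)
df.reset_state()
for dp in df:
    print(dp["file_name"], len(dp["pdf_bytes"]))  # e.g. "document_page_1.pdf", number of bytes
```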
@@ -9,7 +9,6 @@ Replaces relevant parts of the Dataflow package. Most of the functions have been

  <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/parallel.py>
  <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/parallel_map.py>
-
  """

  import atexit
@@ -163,18 +162,19 @@ class MultiThreadMapData(_ParallelMapData):
  The semantics of this class is **identical** to `MapData` except for the ordering.
  Threads run in parallel and can take different time to run the
  mapping function. Therefore, the order of datapoints won't be preserved.
- When ``strict=True``, ``MultiThreadMapData(df, ...)``
- is guaranteed to produce the exact set of data as ``MapData(df, ...)``,
- if both are iterated until ``StopIteration``. But the produced data will have different ordering.
- The behavior of strict mode is undefined if the given dataflow ``df`` is infinite.
- When ``strict=False``, the data that's produced by ``MultiThreadMapData(df, ...)``
- is a reordering of the data produced by ``RepeatedData(MapData(df, ...), -1)``.
- In other words, first pass of ``MultiThreadMapData.__iter__`` may contain
- datapoints from the second pass of ``df.__iter__``.
+ When `strict=True`, `MultiThreadMapData(df, ...)`
+ is guaranteed to produce the exact set of data as `MapData(df, ...)`,
+ if both are iterated until `StopIteration`. But the produced data will have different ordering.
+ The behavior of strict mode is undefined if the given dataflow `df` is infinite.
+ When `strict=False`, the data that's produced by `MultiThreadMapData(df, ...)`
+ is a re-ordering of the data produced by `RepeatedData(MapData(df, ...), -1)`.
+ In other words, first pass of `MultiThreadMapData.__iter__` may contain
+ datapoints from the second pass of `df.__iter__`.
+
  Note:
  1. You should avoid starting many threads in your main process to reduce GIL contention.
  The threads will only start in the process which calls `reset_state()`.
- Therefore you can use ``MultiProcessRunnerZMQ(MultiThreadMapData(...), 1)``
+ Therefore you can use `MultiProcessRunnerZMQ(MultiThreadMapData(...), 1)`
  to reduce GIL contention.
  """

@@ -215,12 +215,13 @@ class MultiThreadMapData(_ParallelMapData):
  strict: bool = False,
  ):
  """
- :param df: the dataflow to map
- :param num_thread: number of threads to use
- :param map_func: datapoint -> datapoint | None. Return None to
- discard/skip the datapoint.
- :param buffer_size: number of datapoints in the buffer
- :param strict: use "strict mode", see notes above.
+ Args:
+ df: the dataflow to map
+ num_thread: number of threads to use
+ map_func: datapoint -> datapoint | None. Return None to
+ discard/skip the datapoint.
+ buffer_size: number of datapoints in the buffer
+ strict: use "strict mode", see notes above.
  """
  if strict:
  # In strict mode, buffer size cannot be larger than the total number of datapoints
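The constructor arguments above are enough for a small usage sketch. Assuming `DataFromList` and `MultiThreadMapData` are importable from the modules named in the file list, a thread-parallel map in strict mode looks like this:

```python
from deepdoctection.dataflow.parallel_map import MultiThreadMapData
from deepdoctection.dataflow.serialize import DataFromList


def square(dp: int) -> int:
    # Per the docstring, returning None here would discard the datapoint.
    return dp * dp


df = DataFromList(list(range(100)), shuffle=False)
# strict=True: same set of outputs as MapData, but in arbitrary order.
df = MultiThreadMapData(df, num_thread=4, map_func=square, buffer_size=20, strict=True)
df.reset_state()
results = sorted(df)
```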
@@ -290,7 +291,7 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):

  def reset_state(self) -> Any:
  """
- All forked dataflows should only be reset **once and only once** in spawned processes.
+ All forked dataflows should only be reset once and only once in spawned processes.
  Subclasses should call this method with super.
  """
  assert not self._reset_done, "reset_state() was called twice! This violates the API of DataFlow!"
@@ -338,17 +339,17 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
  """
  Same as `MapData`, but start processes to run the mapping function,
  and communicate with ZeroMQ pipe.
- The semantics of this class is **identical** to `MapData` except for the ordering.
+ The semantics of this class is identical to `MapData` except for the ordering.
  Processes run in parallel and can take different time to run the
  mapping function. Therefore, the order of datapoints won't be preserved.
- When ``strict=True``, ``MultiProcessMapData(df, ...)``
- is guaranteed to produce the exact set of data as ``MapData(df, ...)``,
- if both are iterated until ``StopIteration``. But the produced data will have different ordering.
- The behavior of strict mode is undefined if the given dataflow ``df`` is infinite.
- When ``strict=False``, the data that's produced by ``MultiProcessMapData(df, ...)``
- is a reordering of the data produced by ``RepeatedData(MapData(df, ...), -1)``.
- In other words, first pass of ``MultiProcessMapData.__iter__`` may contain
- datapoints from the second pass of ``df.__iter__``.
+ When `strict=True`, `MultiProcessMapData(df, ...)`
+ is guaranteed to produce the exact set of data as `MapData(df, ...)`,
+ if both are iterated until `StopIteration`. But the produced data will have different ordering.
+ The behavior of strict mode is undefined if the given dataflow `df` is infinite.
+ When `strict=False`, the data that's produced by `MultiProcessMapData(df, ...)`
+ is a reordering of the data produced by `RepeatedData(MapData(df, ...), -1)`.
+ In other words, first pass of `MultiProcessMapData.__iter__` may contain
+ datapoints from the second pass of `df.__iter__`.
  """

  class _Worker(mp.Process):
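The process-based variant has the same semantics; the map function runs in worker processes, so it must be picklable and the script needs a `__main__` guard. A sketch under the same import-path assumption:

```python
from deepdoctection.dataflow.parallel_map import MultiProcessMapData
from deepdoctection.dataflow.serialize import DataFromList


def heavy_transform(dp: int) -> int:
    return dp * dp  # placeholder for an expensive, picklable mapping


if __name__ == "__main__":
    df = DataFromList(list(range(1000)), shuffle=False)
    # Worker processes send results back over a ZeroMQ pipe; with strict=True
    # the output is the same set of datapoints as MapData, only reordered.
    df = MultiProcessMapData(df, num_proc=2, map_func=heavy_transform, buffer_size=100, strict=True)
    df.reset_state()
    print(sum(df))
```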
@@ -384,11 +385,12 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
  strict: bool = False,
  ) -> None:
  """
- :param df: the dataflow to map
- :param num_proc: number of threads to use
- :param map_func: datapoint -> datapoint | None. Return None to
- :param buffer_size: number of datapoints in the buffer
- :param strict: use "strict mode", see notes above.
+ Args:
+ df: the dataflow to map
+ num_proc: number of threads to use
+ map_func: datapoint -> datapoint | None. Return None to
+ buffer_size: number of datapoints in the buffer
+ strict: use "strict mode", see notes above.
  """
  if strict:
  # In strict mode, buffer size cannot be larger than the total number of datapoints
@@ -25,8 +25,9 @@ class DataFromList(RNGDataFlow):

  def __init__(self, lst: list[Any], shuffle: bool = True) -> None:
  """
- :param lst: input list. Each element is a datapoint.
- :param shuffle: shuffle data.
+ Args:
+ lst: input list. Each element is a datapoint.
+ shuffle: shuffle data.
  """
  super().__init__()
  self.lst = lst
@@ -53,7 +54,8 @@ class DataFromIterable(DataFlow):

  def __init__(self, iterable: Iterable[Any]) -> None:
  """
- :param iterable: an iterable object
+ Args:
+ iterable: an iterable object
  """
  self._itr = iterable
  self._len: Optional[int] = None
@@ -86,12 +88,13 @@ class FakeData(RNGDataFlow):
  domain: tuple[Union[float, int], Union[float, int]] = (0, 1),
  ):
  """
- :param shapes: a list of lists/tuples. Shapes of each component.
- :param size: size of this DataFlow.
- :param random: whether to randomly generate data every iteration.
+ Args:
+ shapes: a list of lists/tuples. Shapes of each component.
+ size: size of this DataFlow.
+ random: whether to randomly generate data every iteration.
  Note that merely generating the data could sometimes be time-consuming!
- :param dtype: data type as string, or a list of data types.
- :param domain: (min, max) tuple, or a list of such tuples
+ dtype: data type as string, or a list of data types.
+ domain: (min, max) tuple, or a list of such tuples
  """

  super().__init__()
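From the parameters documented above, a hedged sketch of generating fake, image-like datapoints (that the components come back as `numpy` arrays is an assumption carried over from the tensorpack original this module replaces):

```python
from deepdoctection.dataflow.serialize import FakeData

# Two components per datapoint: a fake 256x256 RGB image and a 10-dim label vector.
df = FakeData(shapes=[[256, 256, 3], [10]], size=8, random=False, dtype="uint8", domain=(0, 255))
df.reset_state()
for image, label in df:
    print(image.shape, label.shape)
```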
@@ -136,13 +139,15 @@ class PickleSerializer:
  @staticmethod
  def dumps(obj: Any) -> bytes:
  """
- :param obj: bytes
+ Args:
+ obj: bytes
  """
  return pickle.dumps(obj, protocol=-1)

  @staticmethod
  def loads(buf: Any) -> Any:
  """
- :param buf: bytes
+ Args:
+ buf: bytes
  """
  return pickle.loads(buf)
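Despite the `obj: bytes` wording in the docstring, `dumps` pickles any picklable object (it simply calls `pickle.dumps(obj, protocol=-1)`). A tiny round trip, assuming the class sits in `deepdoctection.dataflow.serialize` as the file list suggests:

```python
from deepdoctection.dataflow.serialize import PickleSerializer

payload = {"file_name": "document_page_1.pdf", "boxes": [[10, 10, 100, 40]]}
buf = PickleSerializer.dumps(payload)      # bytes, highest pickle protocol
assert PickleSerializer.loads(buf) == payload
```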