deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.0.dist-info/METADATA +0 -431
- deepdoctection-0.42.0.dist-info/RECORD +0 -148
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/dataflow/common.py

@@ -6,10 +6,9 @@


 """
-Some
-
-<https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
+Some DataFlows for transforming and processing datapoints
 """
+
 import itertools
 from copy import copy
 from typing import Any, Callable, Iterator, Union
@@ -25,9 +24,10 @@ class TestDataSpeed(ProxyDataFlow):

     def __init__(self, df: DataFlow, size: int = 5000, warmup: int = 0) -> None:
         """
-        :
-
-
+        Args:
+            df: The DataFlow to test.
+            size: Number of datapoints to fetch.
+            warmup: Warmup iterations.
         """
         super().__init__(df)
         self.test_size = int(size)
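For context, a small benchmark sketch of the `TestDataSpeed` constructor documented above. It assumes the class keeps the `start()` helper from the vendored tensorpack implementation and that both classes are re-exported from `deepdoctection.dataflow`:

```python
from deepdoctection.dataflow import DataFromList, TestDataSpeed

# Toy dataflow with 10000 single-component datapoints.
df = DataFromList([[i] for i in range(10000)], shuffle=False)

# Iterates over 5000 datapoints after 100 warmup steps and reports throughput.
TestDataSpeed(df, size=5000, warmup=100).start()  # start() assumed from tensorpack
```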
@@ -63,16 +63,16 @@ class TestDataSpeed(ProxyDataFlow):

 class FlattenData(ProxyDataFlow):
     """
-
-
-    **Example:**
-
-        dp_1 = ['a','b']
-        dp_2 = ['c','d']
+    FlattenData flattens an iterator within a datapoint. Will flatten the datapoint if it is a list or a tuple.

-
+    Example:
+        ```python
+        dp_1 = ['a','b']
+        dp_2 = ['c','d']

-
+        yields:
+        ['a'], ['b'], ['c'], ['d']
+        ```
     """

     def __iter__(self) -> Any:
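A runnable version of the `FlattenData` docstring example, assuming the class is still re-exported from `deepdoctection.dataflow` as in 0.42:

```python
from deepdoctection.dataflow import DataFromList, FlattenData

df = DataFromList([["a", "b"], ["c", "d"]], shuffle=False)
df = FlattenData(df)
df.reset_state()  # dataflows must be reset before the first iteration
print(list(df))   # [['a'], ['b'], ['c'], ['d']], per the docstring above
```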
@@ -84,23 +84,25 @@ class FlattenData(ProxyDataFlow):

 class MapData(ProxyDataFlow):
     """
-
-    Note:
-        1. Please make sure func doesn't modify its arguments in place,
-           unless you're certain it's safe.
-        2. If you discard some datapoints, `len(MapData(ds))` will be incorrect.
+    MapData applies a mapper/filter on the datapoints of a DataFlow.

-
+    Notes:
+        1. Please ensure that `func` does not modify its arguments in-place unless it is safe.
+        2. If some datapoints are discarded, `len(MapData(ds))` will be incorrect.

-
-
+    Example:
+        ```python
+        df = ...  # a DataFlow where each datapoint is [img, label]
+        ds = MapData(ds, lambda dp: [dp[0] * 255, dp[1]])
+        ```
     """

     def __init__(self, df: DataFlow, func: Callable[[Any], Any]) -> None:
         """
-        :
-
-
+        Args:
+            df: input DataFlow
+            func: takes a datapoint and returns a new
+                datapoint. Return None to discard/skip this datapoint.
         """
         super().__init__(df)
         self.func = func
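A short sketch of the documented `MapData` semantics, including the return-`None`-to-drop convention that makes `len()` unreliable (import path assumed as above):

```python
from deepdoctection.dataflow import DataFromList, MapData

df = DataFromList([[1, "odd"], [2, "even"], [3, "odd"]], shuffle=False)
# Returning None drops the datapoint entirely.
df = MapData(df, lambda dp: [dp[0] * 10, dp[1]] if dp[0] % 2 else None)
df.reset_state()
print(list(df))  # [[10, 'odd'], [30, 'odd']]
```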
@@ -114,27 +116,27 @@ class MapData(ProxyDataFlow):

 class MapDataComponent(MapData):
     """
-
-
-
-    1. This
-
-
-
-
-
-
-
-
-        ds = MapDataComponent(ds, lambda img: img * 255, 0)  # map the 0th component
+    MapDataComponent applies a mapper/filter on a component of a datapoint.
+
+    Notes:
+        1. This DataFlow itself does not modify the datapoints. Please ensure that `func` does not modify its arguments
+           in-place unless it is safe.
+        2. If some datapoints are discarded, `len(MapDataComponent(ds, ..))` will be incorrect.
+
+    Example:
+        ```python
+        df = ...  # a DataFlow where each datapoint is [img, label]
+        ds = MapDataComponent(ds, lambda img: img * 255, 0)  # maps the 0th component
+        ```
     """

     def __init__(self, df: DataFlow, func: Callable[[Any], Any], index: Union[int, str] = 0) -> None:
         """
-        :
-
+        Args:
+            df: input DataFlow which produces either list or dict.
+            func (TYPE -> TYPE|None): takes ``dp[index]``, returns a new value for ``dp[index]``.
                 Return None to discard/skip this datapoint.
-
+            index: index or key of the component.
         """
         self._index = index
         self._func = func
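A minimal sketch of `MapDataComponent` on list datapoints, again assuming the `deepdoctection.dataflow` re-exports:

```python
from deepdoctection.dataflow import DataFromList, MapDataComponent

df = DataFromList([[1, "a"], [2, "b"]], shuffle=False)
df = MapDataComponent(df, lambda x: x * 255, 0)  # maps only dp[0]
df.reset_state()
print(list(df))  # [[255, 'a'], [510, 'b']]
```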
@@ -152,16 +154,21 @@ class MapDataComponent(MapData):


 class RepeatedData(ProxyDataFlow):
-    """
-
-
+    """
+    RepeatedData takes datapoints from another DataFlow and produces them until they are exhausted for a certain number
+    of repetitions.
+
+    Example:
+        ```python
+        dp1, dp2, .... dpn, dp1, dp2, ....dpn
+        ```
     """

     def __init__(self, df: DataFlow, num: int) -> None:
         """
-        :
-
-
+        Args:
+            df: Input DataFlow.
+            num: Number of repetitions of the DataFlow. Set `-1` to repeat the DataFlow infinitely.
         """
         self.num = num
         if self.num != -1:
@@ -173,7 +180,7 @@ class RepeatedData(ProxyDataFlow):
     def __len__(self) -> int:
         """
         Raises:
-
+            ValueError: when num == -1.
         """
         if self.num == -1:
             raise NotImplementedError("__len__() is unavailable for infinite dataflow")
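A sketch of both `RepeatedData` modes documented above, with the same import-path assumption:

```python
from deepdoctection.dataflow import DataFromList, RepeatedData

df = RepeatedData(DataFromList([[1], [2]], shuffle=False), num=2)
df.reset_state()
print(list(df))  # [[1], [2], [1], [2]]

endless = RepeatedData(DataFromList([[1]], shuffle=False), num=-1)
# len(endless) raises, as documented above: an infinite dataflow has no length.
```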
@@ -190,20 +197,23 @@ class RepeatedData(ProxyDataFlow):

 class ConcatData(DataFlow):
     """
-
-
-
+    ConcatData concatenates multiple DataFlows. Produces datapoints from each DataFlow and starts the next when one
+    DataFlow is exhausted. Use this DataFlow to process multiple .pdf files in one step.
+
+    Example:
+        ```python
+        df_1 = analyzer.analyze(path="path/to/pdf_1.pdf")
+        df_2 = analyzer.analyze(path="path/to/pdf_2.pdf")
+        df = ConcatData([df_1, df_2])
+        ```

-    **Example:**

-        df_1 = analyzer.analyze(path=path/to/pdf_1.pdf")
-        df_2 = analyzer.analyze(path=path/to/pdf_2.pdf")
-        df = ConcatData([df_1,df_2])
     """

     def __init__(self, df_lists: list[DataFlow]) -> None:
         """
-        :
+        Args:
+            df_lists: A list of DataFlows.
         """
         self.df_lists = df_lists

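A self-contained sketch of `ConcatData` using in-memory dataflows instead of the analyzer (import path assumed as above):

```python
from deepdoctection.dataflow import ConcatData, DataFromList

df_1 = DataFromList([["page_1"]], shuffle=False)
df_2 = DataFromList([["page_2"], ["page_3"]], shuffle=False)
df = ConcatData([df_1, df_2])
df.reset_state()
print(list(df))  # [['page_1'], ['page_2'], ['page_3']]
```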
@@ -221,28 +231,31 @@ class ConcatData(DataFlow):

 class JoinData(DataFlow):
     """
-
-
-
-    **Example:**
+    JoinData joins the components from each DataFlow. See below for its behavior. It is not possible to join a DataFlow
+    that produces lists with one that produces dictionaries.

+    Example:
+        ```python
         df1 produces: [[c1], [c2]]
         df2 produces: [[c3], [c4]]
         joined: [[c1, c3], [c2, c4]]

-        df1 produces: {"a":c1, "b":c2}
-        df2 produces: {"c":c3}
-        joined: {"a":c1, "b":c2, "c":c3}
+        df1 produces: {"a": c1, "b": c2}
+        df2 produces: {"c": c3}
+        joined: {"a": c1, "b": c2, "c": c3}
+        ```
+
+    `JoinData` stops once the first DataFlow raises a `StopIteration`.
+

-    `JoinData` will stop once the first Dataflow throws a StopIteration
     """

     def __init__(self, df_lists: list[DataFlow]) -> None:
         """
-        :
-
-
-
+        Args:
+            df_lists: A list of DataFlows. If these DataFlows have different sizes, `JoinData` stops when one of them is
+                exhausted. The list can contain the same DataFlow instance multiple times, but note that in this
+                case `__iter__` will also be called multiple times.
         """
         self.df_lists = df_lists

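A sketch of the `JoinData` behavior for list datapoints of different sizes, per the docstring above (import path assumed):

```python
from deepdoctection.dataflow import DataFromList, JoinData

df_1 = DataFromList([["c1"], ["c2"], ["c3"]], shuffle=False)
df_2 = DataFromList([["c4"], ["c5"]], shuffle=False)
df = JoinData([df_1, df_2])
df.reset_state()
# Stops as soon as the shorter dataflow is exhausted.
print(list(df))  # [['c1', 'c4'], ['c2', 'c5']]
```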
@@ -275,18 +288,26 @@ class JoinData(DataFlow):

 class BatchData(ProxyDataFlow):
     """
-
+    BatchData stacks datapoints into batches. It produces datapoints with the same number of components as `df`, but
     each datapoint is now a list of datapoints.
+
+    Example:
+        ```python
+        df produces: [[c1], [c2], [c3], [c4]]
+        batch_size = 2
+        yields: [[c1, c2], [c3, c4]]
+        ```
+
     """

     def __init__(self, df: DataFlow, batch_size: int, remainder: bool = False) -> None:
         """
-        :
-
-
-
-
-
+        Args:
+            df: A DataFlow.
+            batch_size: Batch size.
+            remainder: If the remaining datapoints in `df` are not enough to form a batch, whether to produce the
+                remaining data as a smaller batch. If set to `False`, all produced datapoints are guaranteed to
+                have the same batch size. If set to `True`, `len(ds)` must be accurate.
         """
         super().__init__(df)
         if not remainder:
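A sketch of `BatchData` with the `remainder` flag described above; the exact nesting of the output follows the docstring example, so it is only printed here rather than asserted (import path assumed):

```python
from deepdoctection.dataflow import BatchData, DataFromList

df = DataFromList([["c1"], ["c2"], ["c3"], ["c4"], ["c5"]], shuffle=False)
df = BatchData(df, batch_size=2, remainder=True)  # remainder=True emits a final short batch
df.reset_state()
for batch in df:
    print(batch)  # two full batches of size 2, then a remainder batch containing only 'c5'
```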
deepdoctection/dataflow/custom.py

@@ -16,8 +16,7 @@
 # limitations under the License.

 """
-
-from
+Some custom dataflow classes. Some ideas have been taken from

 <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
 """
@@ -40,18 +39,22 @@ class CacheData(ProxyDataFlow):
     Completely cache the first pass of a DataFlow in memory,
     and produce from the cache thereafter.

-
-
+    Note:
+        The user should not stop the iterator before it has reached the end.
+        Otherwise, the cache may be incomplete.

-
+    Example:
+        ```python
+        df_list = CacheData(df).get_cache()  # Buffers the whole dataflow and return a list of all datapoints
+        ```

-        df_list = CacheData(df).get_cache()  # buffers the whole dataflow and return a list of all datapoints
     """

     def __init__(self, df: DataFlow, shuffle: bool = False) -> None:
         """
-        :
-
+        Args:
+            df: input DataFlow.
+            shuffle: whether to shuffle the cache before yielding from it.
         """
         self.shuffle = shuffle
         self.buffer: list[Any] = []
@@ -80,9 +83,10 @@ class CacheData(ProxyDataFlow):

     def get_cache(self) -> list[Any]:
         """
-
+        Get the cache of the whole dataflow as a list.

-        :
+        Returns:
+            list of datapoints
         """
         self.reset_state()
         with get_tqdm() as status_bar:
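A minimal sketch of `CacheData.get_cache()`, which per the hunk above resets the dataflow itself before buffering (import path assumed):

```python
from deepdoctection.dataflow import CacheData, DataFromList

df = DataFromList([[1], [2], [3]], shuffle=False)
cache = CacheData(df)
datapoints = cache.get_cache()  # one full pass over df, returned as a list
print(datapoints)  # [[1], [2], [3]]
```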
@@ -95,21 +99,22 @@ class CacheData(ProxyDataFlow):

 class CustomDataFromList(DataFromList):
     """
-    Wraps a list of datapoints to a dataflow. Compared to `Tensorpack.DataFlow.DataFromList`
-    can specify a number of datapoints after that the iteration stops.
-    filters on that list.
+    Wraps a list of datapoints to a dataflow. Compared to `Tensorpack.DataFlow.DataFromList`
+    implementation you can specify a number of datapoints after that the iteration stops.
+    You can also pass a re-balance function that filters on that list.

-
+    Example:

-
-
+        ```python
+        def filter_first(lst):
+            return lst.pop(0)

-
-
+        df = CustomDataFromList(lst=[["a","b"],["c","d"]], rebalance_func=filter_first)
+        df.reset_state()

     will yield:
-
     ["c","d"]
+        ```

     """

@@ -121,13 +126,14 @@ class CustomDataFromList(DataFromList):
         rebalance_func: Optional[Callable[[list[Any]], list[Any]]] = None,
     ):
         """
-        :
-
-
-
-
-
-
+        Args:
+            lst: The input list. Each element represents a datapoint.
+            shuffle: Whether to shuffle the list before streaming.
+            max_datapoints: The maximum number of datapoints to return before stopping the iteration.
+                If None it streams the whole dataflow.
+            rebalance_func: A func that inputs a list and outputs a list. Useful, if you want to filter the passed
+                list and re-balance the sample. Only the output list of the re-balancing function will be
+                considered.
         """
         super().__init__(lst, shuffle)
         self.max_datapoints = max_datapoints
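Complementing the `rebalance_func` example in the docstring above, a sketch of the `max_datapoints` cutoff (import path assumed):

```python
from deepdoctection.dataflow import CustomDataFromList

df = CustomDataFromList(lst=[["a"], ["b"], ["c"]], max_datapoints=2)
df.reset_state()
print(list(df))  # [['a'], ['b']]: iteration stops after max_datapoints
```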
@@ -176,9 +182,10 @@ class CustomDataFromIterable(DataFromIterable):

     def __init__(self, iterable: Iterable[Any], max_datapoints: Optional[int] = None):
         """
-        :
-
-
+        Args:
+            iterable: An iterable object
+            max_datapoints: The maximum number of datapoints to stream. If None it iterates through the whole
+                dataflow.
         """
         super().__init__(iterable)
         self.max_datapoints = max_datapoints
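A sketch of `CustomDataFromIterable` bounding an unbounded source, which is the use case the `max_datapoints` argument above enables (import path assumed):

```python
from itertools import count

from deepdoctection.dataflow import CustomDataFromIterable

# Caps an otherwise endless generator at three datapoints.
df = CustomDataFromIterable(([i] for i in count()), max_datapoints=3)
df.reset_state()
print(list(df))  # [[0], [1], [2]]
```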