datachain 0.30.5__py3-none-any.whl → 0.30.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/cli/commands/datasets.py +32 -17
- datachain/data_storage/sqlite.py +18 -15
- datachain/data_storage/warehouse.py +7 -1
- datachain/delta.py +36 -20
- datachain/lib/dc/database.py +2 -2
- datachain/lib/dc/datachain.py +36 -28
- datachain/lib/dc/datasets.py +4 -0
- datachain/lib/dc/records.py +2 -4
- datachain/lib/dc/storage.py +5 -0
- datachain/lib/settings.py +188 -85
- datachain/lib/udf.py +3 -20
- datachain/query/batch.py +2 -2
- datachain/query/dataset.py +44 -17
- datachain/query/dispatch.py +6 -0
- datachain/query/udf.py +2 -0
- datachain/utils.py +9 -10
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/METADATA +1 -1
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/RECORD +22 -22
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/top_level.txt +0 -0
datachain/lib/settings.py
CHANGED
@@ -1,111 +1,214 @@
+from typing import Any, Optional, Union
+
 from datachain.lib.utils import DataChainParamsError
-
+
+DEFAULT_CACHE = False
+DEFAULT_PREFETCH = 2
+DEFAULT_BATCH_SIZE = 2_000


 class SettingsError(DataChainParamsError):
-    def __init__(self, msg):
+    def __init__(self, msg: str) -> None:
         super().__init__(f"Dataset settings error: {msg}")


 class Settings:
-
+    """Settings for datachain."""
+
+    _cache: Optional[bool]
+    _prefetch: Optional[int]
+    _parallel: Optional[Union[bool, int]]
+    _workers: Optional[int]
+    _namespace: Optional[str]
+    _project: Optional[str]
+    _min_task_size: Optional[int]
+    _batch_size: Optional[int]
+
+    def __init__(  # noqa: C901, PLR0912
         self,
-        cache=None,
-
-
-
-
-
-
-
-    ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
+        cache: Optional[bool] = None,
+        prefetch: Optional[Union[bool, int]] = None,
+        parallel: Optional[Union[bool, int]] = None,
+        workers: Optional[int] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
+        min_task_size: Optional[int] = None,
+        batch_size: Optional[int] = None,
+    ) -> None:
+        if cache is None:
+            self._cache = None
+        else:
+            if not isinstance(cache, bool):
+                raise SettingsError(
+                    "'cache' argument must be bool"
+                    f" while {cache.__class__.__name__} was given"
+                )
+            self._cache = cache
+
+        if prefetch is None or prefetch is True:
+            self._prefetch = None
+        elif prefetch is False:
+            self._prefetch = 0  # disable prefetch (False == 0)
+        else:
+            if not isinstance(prefetch, int):
+                raise SettingsError(
+                    "'prefetch' argument must be int or bool"
+                    f" while {prefetch.__class__.__name__} was given"
+                )
+            if prefetch < 0:
+                raise SettingsError(
+                    "'prefetch' argument must be non-negative integer"
+                    f", {prefetch} was given"
+                )
+            self._prefetch = prefetch
+
+        if parallel is None or parallel is False:
+            self._parallel = None
+        elif parallel is True:
+            self._parallel = True
+        else:
+            if not isinstance(parallel, int):
+                raise SettingsError(
+                    "'parallel' argument must be int or bool"
+                    f" while {parallel.__class__.__name__} was given"
+                )
+            if parallel <= 0:
+                raise SettingsError(
+                    "'parallel' argument must be positive integer"
+                    f", {parallel} was given"
+                )
+            self._parallel = parallel
+
+        if workers is None:
+            self._workers = None
+        else:
+            if not isinstance(workers, int) or isinstance(workers, bool):
+                raise SettingsError(
+                    "'workers' argument must be int"
+                    f" while {workers.__class__.__name__} was given"
+                )
+            if workers <= 0:
+                raise SettingsError(
+                    f"'workers' argument must be positive integer, {workers} was given"
+                )
+            self._workers = workers
+
+        if namespace is None:
+            self._namespace = None
+        else:
+            if not isinstance(namespace, str):
+                raise SettingsError(
+                    "'namespace' argument must be str"
+                    f", {namespace.__class__.__name__} was given"
+                )
+            self._namespace = namespace
+
+        if project is None:
+            self._project = None
+        else:
+            if not isinstance(project, str):
+                raise SettingsError(
+                    "'project' argument must be str"
+                    f", {project.__class__.__name__} was given"
+                )
+            self._project = project
+
+        if min_task_size is None:
+            self._min_task_size = None
+        else:
+            if not isinstance(min_task_size, int) or isinstance(min_task_size, bool):
+                raise SettingsError(
+                    "'min_task_size' argument must be int"
+                    f", {min_task_size.__class__.__name__} was given"
+                )
+            if min_task_size <= 0:
+                raise SettingsError(
+                    "'min_task_size' argument must be positive integer"
+                    f", {min_task_size} was given"
+                )
+            self._min_task_size = min_task_size
+
+        if batch_size is None:
+            self._batch_size = None
+        else:
+            if not isinstance(batch_size, int) or isinstance(batch_size, bool):
+                raise SettingsError(
+                    "'batch_size' argument must be int"
+                    f", {batch_size.__class__.__name__} was given"
+                )
+            if batch_size <= 0:
+                raise SettingsError(
+                    "'batch_size' argument must be positive integer"
+                    f", {batch_size} was given"
+                )
+            self._batch_size = batch_size
+
+    @property
+    def cache(self) -> bool:
+        return self._cache if self._cache is not None else DEFAULT_CACHE
+
+    @property
+    def prefetch(self) -> Optional[int]:
+        return self._prefetch if self._prefetch is not None else DEFAULT_PREFETCH
+
+    @property
+    def parallel(self) -> Optional[Union[bool, int]]:
+        return self._parallel if self._parallel is not None else None
+
+    @property
+    def workers(self) -> Optional[int]:
+        return self._workers if self._workers is not None else None
+
+    @property
+    def namespace(self) -> Optional[str]:
+        return self._namespace if self._namespace is not None else None

     @property
-    def
-        return self.
+    def project(self) -> Optional[str]:
+        return self._project if self._project is not None else None

     @property
-    def
-        return self.
+    def min_task_size(self) -> Optional[int]:
+        return self._min_task_size if self._min_task_size is not None else None

     @property
-    def
-        return self.
+    def batch_size(self) -> int:
+        return self._batch_size if self._batch_size is not None else DEFAULT_BATCH_SIZE

-    def to_dict(self):
-        res = {}
+    def to_dict(self) -> dict[str, Any]:
+        res: dict[str, Any] = {}
         if self._cache is not None:
             res["cache"] = self.cache
-        if self.
+        if self._prefetch is not None:
+            res["prefetch"] = self.prefetch
+        if self._parallel is not None:
             res["parallel"] = self.parallel
         if self._workers is not None:
             res["workers"] = self.workers
-        if self.
+        if self._min_task_size is not None:
             res["min_task_size"] = self.min_task_size
-        if self.
+        if self._namespace is not None:
             res["namespace"] = self.namespace
-        if self.
+        if self._project is not None:
             res["project"] = self.project
-        if self.
-            res["
+        if self._batch_size is not None:
+            res["batch_size"] = self.batch_size
         return res

-    def add(self, settings: "Settings"):
-
-
-
-
-
-
-        if settings.
-            self.
-        if settings.
-            self.
+    def add(self, settings: "Settings") -> None:
+        if settings._cache is not None:
+            self._cache = settings._cache
+        if settings._prefetch is not None:
+            self._prefetch = settings._prefetch
+        if settings._parallel is not None:
+            self._parallel = settings._parallel
+        if settings._workers is not None:
+            self._workers = settings._workers
+        if settings._namespace is not None:
+            self._namespace = settings._namespace
+        if settings._project is not None:
+            self._project = settings._project
+        if settings._min_task_size is not None:
+            self._min_task_size = settings._min_task_size
+        if settings._batch_size is not None:
+            self._batch_size = settings._batch_size
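
For context, a minimal sketch of how the reworked Settings class behaves after this change, based only on the code shown in the diff above (the concrete values are illustrative):

from datachain.lib.settings import Settings, SettingsError

settings = Settings(cache=True, prefetch=4, batch_size=500)
print(settings.batch_size)   # 500
print(settings.to_dict())    # {'cache': True, 'prefetch': 4, 'batch_size': 500}

# Unset values fall back to the module-level defaults added in this release.
print(Settings().batch_size)  # 2000 (DEFAULT_BATCH_SIZE)

# Invalid values raise SettingsError with the messages defined above.
try:
    Settings(batch_size=0)
except SettingsError as exc:
    print(exc)  # Dataset settings error: 'batch_size' argument must be positive integer, 0 was given

# add() copies only the values that were explicitly set on the other Settings object.
defaults = Settings(cache=False, batch_size=1000)
defaults.add(Settings(batch_size=250))
print(defaults.to_dict())    # {'cache': False, 'batch_size': 250}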
datachain/lib/udf.py
CHANGED
@@ -54,23 +54,11 @@ UDFOutputSpec = Mapping[str, ColumnType]
 UDFResult = dict[str, Any]


-@attrs.define
-class UDFProperties:
-    udf: "UDFAdapter"
-
-    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
-        return self.udf.get_batching(use_partitioning)
-
-    @property
-    def batch_rows(self):
-        return self.udf.batch_rows
-
-
 @attrs.define(slots=False)
 class UDFAdapter:
     inner: "UDFBase"
     output: UDFOutputSpec
-
+    batch_size: Optional[int] = None
     batch: int = 1

     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
@@ -83,11 +71,6 @@ class UDFAdapter:
             return Batch(self.batch)
         raise ValueError(f"invalid batch size {self.batch}")

-    @property
-    def properties(self):
-        # For backwards compatibility.
-        return UDFProperties(self)
-
     def run(
         self,
         udf_fields: "Sequence[str]",
@@ -237,13 +220,13 @@ class UDFBase(AbstractUDF):

     def to_udf_wrapper(
         self,
-
+        batch_size: Optional[int] = None,
         batch: int = 1,
     ) -> UDFAdapter:
         return UDFAdapter(
             self,
             self.output.to_udf_spec(),
-
+            batch_size,
             batch,
         )

datachain/query/batch.py
CHANGED
@@ -81,8 +81,8 @@ class Batch(BatchingStrategy):
         # select rows in batches
         results = []

-        with contextlib.closing(execute(query, page_size=page_size)) as
-            for row in
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
+            for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
datachain/query/dataset.py
CHANGED
@@ -55,7 +55,6 @@ from datachain.query.udf import UdfInfo
 from datachain.sql.functions.random import rand
 from datachain.sql.types import SQLType
 from datachain.utils import (
-    batched,
     determine_processes,
     determine_workers,
     filtered_cloudpickle_dumps,
@@ -334,10 +333,10 @@ def process_udf_outputs(
     udf_results: Iterator[Iterable["UDFResult"]],
     udf: "UDFAdapter",
     cb: Callback = DEFAULT_CALLBACK,
+    batch_size: int = INSERT_BATCH_SIZE,
 ) -> None:
     # Optimization: Compute row types once, rather than for every row.
     udf_col_types = get_col_types(warehouse, udf.output)
-    batch_rows = udf.batch_rows or INSERT_BATCH_SIZE

     def _insert_rows():
         for udf_output in udf_results:
@@ -349,9 +348,7 @@ def process_udf_outputs(
             cb.relative_update()
             yield adjust_outputs(warehouse, row, udf_col_types)

-
-        warehouse.insert_rows(udf_table, row_chunk)
-
+    warehouse.insert_rows(udf_table, _insert_rows(), batch_size=batch_size)
     warehouse.insert_rows_done(udf_table)


@@ -388,12 +385,13 @@ class UDFStep(Step, ABC):
     udf: "UDFAdapter"
     catalog: "Catalog"
     partition_by: Optional[PartitionByType] = None
+    is_generator = False
+    # Parameters from Settings
+    cache: bool = False
     parallel: Optional[int] = None
     workers: Union[bool, int] = False
     min_task_size: Optional[int] = None
-
-    cache: bool = False
-    batch_rows: Optional[int] = None
+    batch_size: Optional[int] = None

     @abstractmethod
     def create_udf_table(self, query: Select) -> "Table":
@@ -450,6 +448,7 @@ class UDFStep(Step, ABC):
                 use_cache=self.cache,
                 is_generator=self.is_generator,
                 min_task_size=self.min_task_size,
+                batch_size=self.batch_size,
             )
             udf_distributor()
             return
@@ -486,6 +485,7 @@ class UDFStep(Step, ABC):
                 is_generator=self.is_generator,
                 cache=self.cache,
                 rows_total=rows_total,
+                batch_size=self.batch_size or INSERT_BATCH_SIZE,
             )

             # Run the UDFDispatcher in another process to avoid needing
@@ -534,6 +534,7 @@ class UDFStep(Step, ABC):
                     udf_results,
                     self.udf,
                     cb=generated_cb,
+                    batch_size=self.batch_size or INSERT_BATCH_SIZE,
                 )
             finally:
                 download_cb.close()
@@ -595,7 +596,7 @@ class UDFStep(Step, ABC):
             parallel=self.parallel,
             workers=self.workers,
             min_task_size=self.min_task_size,
-
+            batch_size=self.batch_size,
         )
         return self.__class__(self.udf, self.catalog)

@@ -641,7 +642,16 @@

 @frozen
 class UDFSignal(UDFStep):
+    udf: "UDFAdapter"
+    catalog: "Catalog"
+    partition_by: Optional[PartitionByType] = None
     is_generator = False
+    # Parameters from Settings
+    cache: bool = False
+    parallel: Optional[int] = None
+    workers: Union[bool, int] = False
+    min_task_size: Optional[int] = None
+    batch_size: Optional[int] = None

     def create_udf_table(self, query: Select) -> "Table":
         udf_output_columns: list[sqlalchemy.Column[Any]] = [
@@ -711,7 +721,16 @@ class UDFSignal(UDFStep):
 class RowGenerator(UDFStep):
     """Extend dataset with new rows."""

+    udf: "UDFAdapter"
+    catalog: "Catalog"
+    partition_by: Optional[PartitionByType] = None
     is_generator = True
+    # Parameters from Settings
+    cache: bool = False
+    parallel: Optional[int] = None
+    workers: Union[bool, int] = False
+    min_task_size: Optional[int] = None
+    batch_size: Optional[int] = None

     def create_udf_table(self, query: Select) -> "Table":
         warehouse = self.catalog.warehouse
@@ -1626,12 +1645,17 @@ class DatasetQuery:
     def add_signals(
         self,
         udf: "UDFAdapter",
+        partition_by: Optional[PartitionByType] = None,
+        # Parameters from Settings
+        cache: bool = False,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
-
-
-
+        batch_size: Optional[int] = None,
+        # Parameters are unused, kept only to match the signature of Settings.to_dict
+        prefetch: Optional[int] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """
         Adds one or more signals based on the results from the provided UDF.
@@ -1657,7 +1681,7 @@ class DatasetQuery:
                 workers=workers,
                 min_task_size=min_task_size,
                 cache=cache,
-
+                batch_size=batch_size,
             )
         )
         return query
@@ -1672,14 +1696,17 @@ class DatasetQuery:
     def generate(
         self,
         udf: "UDFAdapter",
+        partition_by: Optional[PartitionByType] = None,
+        # Parameters from Settings
+        cache: bool = False,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
-
+        batch_size: Optional[int] = None,
+        # Parameters are unused, kept only to match the signature of Settings.to_dict:
+        prefetch: Optional[int] = None,
         namespace: Optional[str] = None,
         project: Optional[str] = None,
-        cache: bool = False,
-        batch_rows: Optional[int] = None,
     ) -> "Self":
         query = self.clone()
         steps = query.steps
@@ -1692,7 +1719,7 @@ class DatasetQuery:
                 workers=workers,
                 min_task_size=min_task_size,
                 cache=cache,
-
+                batch_size=batch_size,
             )
         )
         return query
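
Taken together with the settings.py changes, the net effect here is that a per-step batch_size (falling back to INSERT_BATCH_SIZE when unset) now controls how many UDF output rows are buffered per warehouse insert, replacing the old batch_rows field. A hypothetical end-to-end sketch, assuming the new setting is exposed through DataChain.settings() alongside the existing cache/parallel/workers options (the bucket path, column name, and dataset name are illustrative, not taken from this diff):

import datachain as dc

chain = (
    dc.read_storage("s3://example-bucket/images/")  # illustrative source
    # assumption: batch_size is accepted here like the other settings
    .settings(parallel=4, batch_size=500)
    .map(size=lambda file: file.size, output=int)
)
chain.save("image_sizes")  # UDF output rows would then be inserted in chunks of 500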
datachain/query/dispatch.py
CHANGED
@@ -114,6 +114,7 @@ class UDFDispatcher:
         self.is_batching = udf_info["batching"].is_batching
         self.processes = udf_info["processes"]
         self.rows_total = udf_info["rows_total"]
+        self.batch_size = udf_info["batch_size"]
         self.buffer_size = buffer_size
         self.task_queue = None
         self.done_queue = None
@@ -142,6 +143,7 @@ class UDFDispatcher:
             self.table,
             self.cache,
             self.is_batching,
+            self.batch_size,
             self.udf_fields,
         )

@@ -232,6 +234,7 @@ class UDFDispatcher:
                 udf_results,
                 udf,
                 cb=generated_cb,
+                batch_size=self.batch_size,
             )

     def input_batch_size(self, n_workers: int) -> int:
@@ -385,6 +388,7 @@ class UDFWorker:
         table: "Table",
         cache: bool,
         is_batching: bool,
+        batch_size: int,
         udf_fields: Sequence[str],
     ) -> None:
         self.catalog = catalog
@@ -395,6 +399,7 @@ class UDFWorker:
         self.table = table
         self.cache = cache
         self.is_batching = is_batching
+        self.batch_size = batch_size
         self.udf_fields = udf_fields

         self.download_cb = DownloadCallback(self.done_queue)
@@ -420,6 +425,7 @@ class UDFWorker:
                 self.notify_and_process(udf_results),
                 self.udf,
                 cb=self.generated_cb,
+                batch_size=self.batch_size,
             )
             put_into_queue(self.done_queue, {"status": FINISHED_STATUS})

datachain/query/udf.py
CHANGED
@@ -21,6 +21,7 @@ class UdfInfo(TypedDict):
     is_generator: bool
     cache: bool
     rows_total: int
+    batch_size: int


 class AbstractUDFDistributor(ABC):
@@ -39,6 +40,7 @@ class AbstractUDFDistributor(ABC):
         use_cache: bool,
         is_generator: bool = False,
         min_task_size: Optional[Union[str, int]] = None,
+        batch_size: Optional[int] = None,
     ) -> None: ...

     @abstractmethod
datachain/utils.py
CHANGED
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
     from typing_extensions import Self


-
+DEFAULT_BATCH_SIZE = 2000

 logger = logging.getLogger("datachain")

@@ -228,7 +228,7 @@ _T_co = TypeVar("_T_co", covariant=True)

 def _dynamic_batched_core(
     iterable: Iterable[_T_co],
-
+    batch_size: int,
 ) -> Iterator[list[_T_co]]:
     """Core batching logic that yields lists."""

@@ -236,7 +236,7 @@ def _dynamic_batched_core(

     for item in iterable:
         # Check if adding this item would exceed limits
-        if len(batch) >=
+        if len(batch) >= batch_size and batch:  # Yield current batch if we have one
             yield batch
             batch = []

@@ -247,23 +247,22 @@
         yield batch


-def batched(iterable: Iterable[_T_co],
+def batched(iterable: Iterable[_T_co], batch_size: int) -> Iterator[tuple[_T_co, ...]]:
     """
-    Batch data into tuples of length
+    Batch data into tuples of length batch_size.
     The last batch may be shorter.
     """
-    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable,
+    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_size))


 def batched_it(
     iterable: Iterable[_T_co],
-
+    batch_size: int = DEFAULT_BATCH_SIZE,
 ) -> Iterator[Iterator[_T_co]]:
     """
-    Batch data into iterators with dynamic sizing
-    based on row count and memory usage.
+    Batch data into iterators with dynamic sizing based on row count and memory usage.
     """
-    yield from (iter(batch) for batch in _dynamic_batched_core(iterable,
+    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_size))


 def flatten(items):
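
The renamed batch_size parameter keeps the documented behaviour of these helpers. A small usage sketch based on the signatures and docstrings shown above (the printed results assume the memory-based cutoff hinted at in the docstring does not trigger for such tiny inputs):

from datachain.utils import batched, batched_it

# batched() yields tuples of at most batch_size items; the last one may be shorter.
print(list(batched(range(5), batch_size=2)))
# [(0, 1), (2, 3), (4,)]

# batched_it() yields iterators instead of tuples and defaults to DEFAULT_BATCH_SIZE (2000).
print([list(chunk) for chunk in batched_it(range(5), batch_size=2)])
# [[0, 1], [2, 3], [4]]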
|