datachain 0.30.5__py3-none-any.whl → 0.30.7__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.



datachain/lib/settings.py CHANGED
@@ -1,111 +1,214 @@
+from typing import Any, Optional, Union
+
 from datachain.lib.utils import DataChainParamsError
-from datachain.utils import DEFAULT_CHUNK_ROWS
+
+DEFAULT_CACHE = False
+DEFAULT_PREFETCH = 2
+DEFAULT_BATCH_SIZE = 2_000


 class SettingsError(DataChainParamsError):
-    def __init__(self, msg):
+    def __init__(self, msg: str) -> None:
         super().__init__(f"Dataset settings error: {msg}")


 class Settings:
-    def __init__(
+    """Settings for datachain."""
+
+    _cache: Optional[bool]
+    _prefetch: Optional[int]
+    _parallel: Optional[Union[bool, int]]
+    _workers: Optional[int]
+    _namespace: Optional[str]
+    _project: Optional[str]
+    _min_task_size: Optional[int]
+    _batch_size: Optional[int]
+
+    def __init__(  # noqa: C901, PLR0912
         self,
-        cache=None,
-        parallel=None,
-        workers=None,
-        min_task_size=None,
-        prefetch=None,
-        namespace=None,
-        project=None,
-        batch_rows=None,
-    ):
-        self._cache = cache
-        self.parallel = parallel
-        self._workers = workers
-        self.min_task_size = min_task_size
-        self.prefetch = prefetch
-        self.namespace = namespace
-        self.project = project
-        self._chunk_rows = batch_rows
-
-        if not isinstance(cache, bool) and cache is not None:
-            raise SettingsError(
-                "'cache' argument must be bool"
-                f" while {cache.__class__.__name__} was given"
-            )
-
-        if not isinstance(parallel, int) and parallel is not None:
-            raise SettingsError(
-                "'parallel' argument must be int or None"
-                f" while {parallel.__class__.__name__} was given"
-            )
-
-        if (
-            not isinstance(workers, bool)
-            and not isinstance(workers, int)
-            and workers is not None
-        ):
-            raise SettingsError(
-                "'workers' argument must be int or bool"
-                f" while {workers.__class__.__name__} was given"
-            )
-
-        if min_task_size is not None and not isinstance(min_task_size, int):
-            raise SettingsError(
-                "'min_task_size' argument must be int or None"
-                f", {min_task_size.__class__.__name__} was given"
-            )
-
-        if batch_rows is not None and not isinstance(batch_rows, int):
-            raise SettingsError(
-                "'batch_rows' argument must be int or None"
-                f", {batch_rows.__class__.__name__} was given"
-            )
-
-        if batch_rows is not None and batch_rows <= 0:
-            raise SettingsError(
-                "'batch_rows' argument must be positive integer"
-                f", {batch_rows} was given"
-            )
+        cache: Optional[bool] = None,
+        prefetch: Optional[Union[bool, int]] = None,
+        parallel: Optional[Union[bool, int]] = None,
+        workers: Optional[int] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
+        min_task_size: Optional[int] = None,
+        batch_size: Optional[int] = None,
+    ) -> None:
+        if cache is None:
+            self._cache = None
+        else:
+            if not isinstance(cache, bool):
+                raise SettingsError(
+                    "'cache' argument must be bool"
+                    f" while {cache.__class__.__name__} was given"
+                )
+            self._cache = cache
+
+        if prefetch is None or prefetch is True:
+            self._prefetch = None
+        elif prefetch is False:
+            self._prefetch = 0  # disable prefetch (False == 0)
+        else:
+            if not isinstance(prefetch, int):
+                raise SettingsError(
+                    "'prefetch' argument must be int or bool"
+                    f" while {prefetch.__class__.__name__} was given"
+                )
+            if prefetch < 0:
+                raise SettingsError(
+                    "'prefetch' argument must be non-negative integer"
+                    f", {prefetch} was given"
+                )
+            self._prefetch = prefetch
+
+        if parallel is None or parallel is False:
+            self._parallel = None
+        elif parallel is True:
+            self._parallel = True
+        else:
+            if not isinstance(parallel, int):
+                raise SettingsError(
+                    "'parallel' argument must be int or bool"
+                    f" while {parallel.__class__.__name__} was given"
+                )
+            if parallel <= 0:
+                raise SettingsError(
+                    "'parallel' argument must be positive integer"
+                    f", {parallel} was given"
+                )
+            self._parallel = parallel
+
+        if workers is None:
+            self._workers = None
+        else:
+            if not isinstance(workers, int) or isinstance(workers, bool):
+                raise SettingsError(
+                    "'workers' argument must be int"
+                    f" while {workers.__class__.__name__} was given"
+                )
+            if workers <= 0:
+                raise SettingsError(
+                    f"'workers' argument must be positive integer, {workers} was given"
+                )
+            self._workers = workers
+
+        if namespace is None:
+            self._namespace = None
+        else:
+            if not isinstance(namespace, str):
+                raise SettingsError(
+                    "'namespace' argument must be str"
+                    f", {namespace.__class__.__name__} was given"
+                )
+            self._namespace = namespace
+
+        if project is None:
+            self._project = None
+        else:
+            if not isinstance(project, str):
+                raise SettingsError(
+                    "'project' argument must be str"
+                    f", {project.__class__.__name__} was given"
+                )
+            self._project = project
+
+        if min_task_size is None:
+            self._min_task_size = None
+        else:
+            if not isinstance(min_task_size, int) or isinstance(min_task_size, bool):
+                raise SettingsError(
+                    "'min_task_size' argument must be int"
+                    f", {min_task_size.__class__.__name__} was given"
+                )
+            if min_task_size <= 0:
+                raise SettingsError(
+                    "'min_task_size' argument must be positive integer"
+                    f", {min_task_size} was given"
+                )
+            self._min_task_size = min_task_size
+
+        if batch_size is None:
+            self._batch_size = None
+        else:
+            if not isinstance(batch_size, int) or isinstance(batch_size, bool):
+                raise SettingsError(
+                    "'batch_size' argument must be int"
+                    f", {batch_size.__class__.__name__} was given"
+                )
+            if batch_size <= 0:
+                raise SettingsError(
+                    "'batch_size' argument must be positive integer"
+                    f", {batch_size} was given"
+                )
+            self._batch_size = batch_size
+
+    @property
+    def cache(self) -> bool:
+        return self._cache if self._cache is not None else DEFAULT_CACHE
+
+    @property
+    def prefetch(self) -> Optional[int]:
+        return self._prefetch if self._prefetch is not None else DEFAULT_PREFETCH
+
+    @property
+    def parallel(self) -> Optional[Union[bool, int]]:
+        return self._parallel if self._parallel is not None else None
+
+    @property
+    def workers(self) -> Optional[int]:
+        return self._workers if self._workers is not None else None
+
+    @property
+    def namespace(self) -> Optional[str]:
+        return self._namespace if self._namespace is not None else None

     @property
-    def cache(self):
-        return self._cache if self._cache is not None else False
+    def project(self) -> Optional[str]:
+        return self._project if self._project is not None else None

     @property
-    def workers(self):
-        return self._workers if self._workers is not None else False
+    def min_task_size(self) -> Optional[int]:
+        return self._min_task_size if self._min_task_size is not None else None

     @property
-    def batch_rows(self):
-        return self._chunk_rows if self._chunk_rows is not None else DEFAULT_CHUNK_ROWS
+    def batch_size(self) -> int:
+        return self._batch_size if self._batch_size is not None else DEFAULT_BATCH_SIZE

-    def to_dict(self):
-        res = {}
+    def to_dict(self) -> dict[str, Any]:
+        res: dict[str, Any] = {}
         if self._cache is not None:
             res["cache"] = self.cache
-        if self.parallel is not None:
+        if self._prefetch is not None:
+            res["prefetch"] = self.prefetch
+        if self._parallel is not None:
             res["parallel"] = self.parallel
         if self._workers is not None:
             res["workers"] = self.workers
-        if self.min_task_size is not None:
+        if self._min_task_size is not None:
             res["min_task_size"] = self.min_task_size
-        if self.namespace is not None:
+        if self._namespace is not None:
             res["namespace"] = self.namespace
-        if self.project is not None:
+        if self._project is not None:
             res["project"] = self.project
-        if self._chunk_rows is not None:
-            res["batch_rows"] = self._chunk_rows
+        if self._batch_size is not None:
+            res["batch_size"] = self.batch_size
         return res

-    def add(self, settings: "Settings"):
-        self._cache = settings._cache or self._cache
-        self.parallel = settings.parallel or self.parallel
-        self._workers = settings._workers or self._workers
-        self.min_task_size = settings.min_task_size or self.min_task_size
-        self.namespace = settings.namespace or self.namespace
-        self.project = settings.project or self.project
-        if settings.prefetch is not None:
-            self.prefetch = settings.prefetch
-        if settings._chunk_rows is not None:
-            self._chunk_rows = settings._chunk_rows
+    def add(self, settings: "Settings") -> None:
+        if settings._cache is not None:
+            self._cache = settings._cache
+        if settings._prefetch is not None:
+            self._prefetch = settings._prefetch
+        if settings._parallel is not None:
+            self._parallel = settings._parallel
+        if settings._workers is not None:
+            self._workers = settings._workers
+        if settings._namespace is not None:
+            self._namespace = settings._namespace
+        if settings._project is not None:
+            self._project = settings._project
+        if settings._min_task_size is not None:
+            self._min_task_size = settings._min_task_size
+        if settings._batch_size is not None:
+            self._batch_size = settings._batch_size
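Net effect of the settings.py rewrite: every argument is validated eagerly in __init__, unset fields stay None internally, the public properties fall back to the new module-level defaults, and add() now merges per field instead of using `or`, so a falsy-but-set value such as cache=False is no longer lost. A minimal sketch of the resulting behavior, inferred from the hunk above:

from datachain.lib.settings import Settings, SettingsError

s = Settings()                 # nothing set explicitly
assert s.cache is False        # DEFAULT_CACHE
assert s.prefetch == 2         # DEFAULT_PREFETCH
assert s.batch_size == 2_000   # DEFAULT_BATCH_SIZE

assert Settings(prefetch=False).prefetch == 0   # False now means "disable prefetch"

base = Settings(cache=True, batch_size=500)
base.add(Settings(prefetch=4, cache=False))     # add() copies only fields the other side set
assert base.cache is False and base.prefetch == 4 and base.batch_size == 500

try:
    Settings(batch_size=0)     # validated at construction time
except SettingsError as e:
    print(e)  # Dataset settings error: 'batch_size' argument must be positive integer, 0 was given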
datachain/lib/udf.py CHANGED
@@ -54,23 +54,11 @@ UDFOutputSpec = Mapping[str, ColumnType]
 UDFResult = dict[str, Any]


-@attrs.define
-class UDFProperties:
-    udf: "UDFAdapter"
-
-    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
-        return self.udf.get_batching(use_partitioning)
-
-    @property
-    def batch_rows(self):
-        return self.udf.batch_rows
-
-
 @attrs.define(slots=False)
 class UDFAdapter:
     inner: "UDFBase"
     output: UDFOutputSpec
-    batch_rows: Optional[int] = None
+    batch_size: Optional[int] = None
     batch: int = 1

     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
@@ -83,11 +71,6 @@ class UDFAdapter:
             return Batch(self.batch)
         raise ValueError(f"invalid batch size {self.batch}")

-    @property
-    def properties(self):
-        # For backwards compatibility.
-        return UDFProperties(self)
-
     def run(
         self,
         udf_fields: "Sequence[str]",
@@ -237,13 +220,13 @@ class UDFBase(AbstractUDF):

     def to_udf_wrapper(
         self,
-        batch_rows: Optional[int] = None,
+        batch_size: Optional[int] = None,
         batch: int = 1,
     ) -> UDFAdapter:
         return UDFAdapter(
             self,
             self.output.to_udf_spec(),
-            batch_rows,
+            batch_size,
             batch,
         )
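For downstream code, the removal of the UDFProperties shim and the batch_rows to batch_size rename change call sites roughly as follows (a hedged migration sketch; `my_udf` stands for any concrete UDFBase instance and is hypothetical here):

# Before this release (hypothetical `my_udf`, a UDFBase instance):
#   adapter = my_udf.to_udf_wrapper(batch_rows=1_000)
#   strategy = adapter.properties.get_batching()   # via the UDFProperties shim
#
# From this release on:
#   adapter = my_udf.to_udf_wrapper(batch_size=1_000)
#   strategy = adapter.get_batching()              # shim removed, call the adapter directly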
datachain/query/batch.py CHANGED
@@ -81,8 +81,8 @@ class Batch(BatchingStrategy):
         # select rows in batches
         results = []

-        with contextlib.closing(execute(query, page_size=page_size)) as batch_rows:
-            for row in batch_rows:
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
+            for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
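The hunk above is only a variable rename, but the accumulate-and-slice pattern it touches is easy to miss in diff form. A standalone illustration of the same idea (not datachain's exact code):

def fixed_batches(rows, count):
    """Yield lists of `count` rows; the final batch may be shorter."""
    results = []
    for row in rows:
        results.append(row)
        if len(results) >= count:
            batch, results = results[:count], results[count:]
            yield batch
    if results:  # leftover tail
        yield results

assert list(fixed_batches(range(5), 2)) == [[0, 1], [2, 3], [4]]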
@@ -55,7 +55,6 @@ from datachain.query.udf import UdfInfo
 from datachain.sql.functions.random import rand
 from datachain.sql.types import SQLType
 from datachain.utils import (
-    batched,
     determine_processes,
     determine_workers,
     filtered_cloudpickle_dumps,
@@ -334,10 +333,10 @@ def process_udf_outputs(
     udf_results: Iterator[Iterable["UDFResult"]],
     udf: "UDFAdapter",
     cb: Callback = DEFAULT_CALLBACK,
+    batch_size: int = INSERT_BATCH_SIZE,
 ) -> None:
     # Optimization: Compute row types once, rather than for every row.
     udf_col_types = get_col_types(warehouse, udf.output)
-    batch_rows = udf.batch_rows or INSERT_BATCH_SIZE

     def _insert_rows():
         for udf_output in udf_results:
@@ -349,9 +348,7 @@
             cb.relative_update()
             yield adjust_outputs(warehouse, row, udf_col_types)

-    for row_chunk in batched(_insert_rows(), batch_rows):
-        warehouse.insert_rows(udf_table, row_chunk)
-
+    warehouse.insert_rows(udf_table, _insert_rows(), batch_size=batch_size)
     warehouse.insert_rows_done(udf_table)
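Batching responsibility moves out of process_udf_outputs: instead of pre-chunking the generator with batched() and issuing one insert_rows call per chunk, the whole iterator is handed to the warehouse together with a batch_size. The real Warehouse.insert_rows is not part of this diff; a hypothetical sink sketching the shape of that contract:

from collections.abc import Iterable, Iterator
from itertools import islice

def insert_rows(table: str, rows: Iterable[dict], batch_size: int = 10_000) -> None:
    """Consume `rows` lazily, writing one bulk insert per `batch_size` chunk."""
    it: Iterator[dict] = iter(rows)
    while chunk := list(islice(it, batch_size)):
        print(f"writing {len(chunk)} rows to {table}")  # stand-in for the real bulk write

Pushing the chunking into the writer means a single knob, Settings.batch_size, can control every insert path, including the distributed workers patched below.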
@@ -388,12 +385,13 @@ class UDFStep(Step, ABC):
     udf: "UDFAdapter"
     catalog: "Catalog"
     partition_by: Optional[PartitionByType] = None
+    is_generator = False
+    # Parameters from Settings
+    cache: bool = False
     parallel: Optional[int] = None
     workers: Union[bool, int] = False
     min_task_size: Optional[int] = None
-    is_generator = False
-    cache: bool = False
-    batch_rows: Optional[int] = None
+    batch_size: Optional[int] = None

     @abstractmethod
     def create_udf_table(self, query: Select) -> "Table":
@@ -450,6 +448,7 @@
                 use_cache=self.cache,
                 is_generator=self.is_generator,
                 min_task_size=self.min_task_size,
+                batch_size=self.batch_size,
             )
             udf_distributor()
             return
@@ -486,6 +485,7 @@
             is_generator=self.is_generator,
             cache=self.cache,
             rows_total=rows_total,
+            batch_size=self.batch_size or INSERT_BATCH_SIZE,
         )

         # Run the UDFDispatcher in another process to avoid needing
@@ -534,6 +534,7 @@
                     udf_results,
                     self.udf,
                     cb=generated_cb,
+                    batch_size=self.batch_size or INSERT_BATCH_SIZE,
                 )
             finally:
                 download_cb.close()
@@ -595,7 +596,7 @@
             parallel=self.parallel,
             workers=self.workers,
             min_task_size=self.min_task_size,
-            batch_rows=self.batch_rows,
+            batch_size=self.batch_size,
         )
         return self.__class__(self.udf, self.catalog)

@@ -641,7 +642,16 @@

 @frozen
 class UDFSignal(UDFStep):
+    udf: "UDFAdapter"
+    catalog: "Catalog"
+    partition_by: Optional[PartitionByType] = None
     is_generator = False
+    # Parameters from Settings
+    cache: bool = False
+    parallel: Optional[int] = None
+    workers: Union[bool, int] = False
+    min_task_size: Optional[int] = None
+    batch_size: Optional[int] = None

     def create_udf_table(self, query: Select) -> "Table":
         udf_output_columns: list[sqlalchemy.Column[Any]] = [
@@ -711,7 +721,16 @@ class UDFSignal(UDFStep):
 class RowGenerator(UDFStep):
     """Extend dataset with new rows."""

+    udf: "UDFAdapter"
+    catalog: "Catalog"
+    partition_by: Optional[PartitionByType] = None
     is_generator = True
+    # Parameters from Settings
+    cache: bool = False
+    parallel: Optional[int] = None
+    workers: Union[bool, int] = False
+    min_task_size: Optional[int] = None
+    batch_size: Optional[int] = None

     def create_udf_table(self, query: Select) -> "Table":
         warehouse = self.catalog.warehouse
@@ -1626,12 +1645,17 @@ class DatasetQuery:
     def add_signals(
         self,
         udf: "UDFAdapter",
+        partition_by: Optional[PartitionByType] = None,
+        # Parameters from Settings
+        cache: bool = False,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
-        partition_by: Optional[PartitionByType] = None,
-        cache: bool = False,
-        batch_rows: Optional[int] = None,
+        batch_size: Optional[int] = None,
+        # Parameters are unused, kept only to match the signature of Settings.to_dict
+        prefetch: Optional[int] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """
         Adds one or more signals based on the results from the provided UDF.
@@ -1657,7 +1681,7 @@
                 workers=workers,
                 min_task_size=min_task_size,
                 cache=cache,
-                batch_rows=batch_rows,
+                batch_size=batch_size,
             )
         )
         return query
@@ -1672,14 +1696,17 @@
     def generate(
         self,
         udf: "UDFAdapter",
+        partition_by: Optional[PartitionByType] = None,
+        # Parameters from Settings
+        cache: bool = False,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
-        partition_by: Optional[PartitionByType] = None,
+        batch_size: Optional[int] = None,
+        # Parameters are unused, kept only to match the signature of Settings.to_dict:
+        prefetch: Optional[int] = None,
         namespace: Optional[str] = None,
         project: Optional[str] = None,
-        cache: bool = False,
-        batch_rows: Optional[int] = None,
     ) -> "Self":
         query = self.clone()
         steps = query.steps
@@ -1692,7 +1719,7 @@
                 workers=workers,
                 min_task_size=min_task_size,
                 cache=cache,
-                batch_rows=batch_rows,
+                batch_size=batch_size,
             )
         )
         return query
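add_signals() and generate() now accept exactly the keys that Settings.to_dict() can produce (prefetch, namespace, and project are accepted but unused), so a settings dict can be splatted straight into either call. A hedged sketch, where `query` (a DatasetQuery) and the wrapped `udf` adapter are assumed to already exist:

from datachain.lib.settings import Settings

settings = Settings(cache=True, batch_size=500, prefetch=4)

# Every key to_dict() emits is a keyword of add_signals/generate:
query = query.add_signals(udf, **settings.to_dict())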
@@ -114,6 +114,7 @@ class UDFDispatcher:
         self.is_batching = udf_info["batching"].is_batching
         self.processes = udf_info["processes"]
         self.rows_total = udf_info["rows_total"]
+        self.batch_size = udf_info["batch_size"]
         self.buffer_size = buffer_size
         self.task_queue = None
         self.done_queue = None
@@ -142,6 +143,7 @@
             self.table,
             self.cache,
             self.is_batching,
+            self.batch_size,
             self.udf_fields,
         )

@@ -232,6 +234,7 @@
             udf_results,
             udf,
             cb=generated_cb,
+            batch_size=self.batch_size,
         )

     def input_batch_size(self, n_workers: int) -> int:
@@ -385,6 +388,7 @@ class UDFWorker:
         table: "Table",
         cache: bool,
         is_batching: bool,
+        batch_size: int,
         udf_fields: Sequence[str],
     ) -> None:
         self.catalog = catalog
@@ -395,6 +399,7 @@
         self.table = table
         self.cache = cache
         self.is_batching = is_batching
+        self.batch_size = batch_size
         self.udf_fields = udf_fields

         self.download_cb = DownloadCallback(self.done_queue)
@@ -420,6 +425,7 @@
             self.notify_and_process(udf_results),
             self.udf,
             cb=self.generated_cb,
+            batch_size=self.batch_size,
         )
         put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
datachain/query/udf.py CHANGED
@@ -21,6 +21,7 @@ class UdfInfo(TypedDict):
     is_generator: bool
     cache: bool
     rows_total: int
+    batch_size: int


 class AbstractUDFDistributor(ABC):
@@ -39,6 +40,7 @@ class AbstractUDFDistributor(ABC):
         use_cache: bool,
         is_generator: bool = False,
         min_task_size: Optional[Union[str, int]] = None,
+        batch_size: Optional[int] = None,
     ) -> None: ...

     @abstractmethod
datachain/utils.py CHANGED
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
     from typing_extensions import Self


-DEFAULT_CHUNK_ROWS = 2000
+DEFAULT_BATCH_SIZE = 2000

 logger = logging.getLogger("datachain")

@@ -228,7 +228,7 @@ _T_co = TypeVar("_T_co", covariant=True)

 def _dynamic_batched_core(
     iterable: Iterable[_T_co],
-    batch_rows: int,
+    batch_size: int,
 ) -> Iterator[list[_T_co]]:
     """Core batching logic that yields lists."""

@@ -236,7 +236,7 @@ def _dynamic_batched_core(

     for item in iterable:
         # Check if adding this item would exceed limits
-        if len(batch) >= batch_rows and batch:  # Yield current batch if we have one
+        if len(batch) >= batch_size and batch:  # Yield current batch if we have one
             yield batch
             batch = []

@@ -247,23 +247,22 @@
         yield batch


-def batched(iterable: Iterable[_T_co], batch_rows: int) -> Iterator[tuple[_T_co, ...]]:
+def batched(iterable: Iterable[_T_co], batch_size: int) -> Iterator[tuple[_T_co, ...]]:
     """
-    Batch data into tuples of length batch_rows .
+    Batch data into tuples of length batch_size.
     The last batch may be shorter.
     """
-    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_rows))
+    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_size))


 def batched_it(
     iterable: Iterable[_T_co],
-    batch_rows: int = DEFAULT_CHUNK_ROWS,
+    batch_size: int = DEFAULT_BATCH_SIZE,
 ) -> Iterator[Iterator[_T_co]]:
     """
-    Batch data into iterators with dynamic sizing
-    based on row count and memory usage.
+    Batch data into iterators with dynamic sizing based on row count and memory usage.
     """
-    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_rows))
+    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_size))


 def flatten(items):
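The helpers keep their behavior; only the parameter and constant names change. A quick check of the renamed API, based on the docstrings in the hunk above:

from datachain.utils import batched, batched_it

# Tuples of length batch_size; the last one may be shorter.
assert list(batched("abcde", batch_size=2)) == [("a", "b"), ("c", "d"), ("e",)]

# batched_it yields iterators instead of tuples.
for chunk in batched_it(range(5), batch_size=2):
    print(list(chunk))  # [0, 1], then [2, 3], then [4]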
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.30.5
+Version: 0.30.7
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0