datachain 0.30.6__py3-none-any.whl → 0.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


datachain/data_storage/sqlite.py CHANGED
@@ -37,6 +37,7 @@ from datachain import semver
  from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
  from datachain.data_storage.db_engine import DatabaseEngine
  from datachain.data_storage.schema import DefaultSchema
+ from datachain.data_storage.warehouse import INSERT_BATCH_SIZE
  from datachain.dataset import DatasetRecord, StorageURI
  from datachain.error import DataChainError, OutdatedDatabaseSchemaError
  from datachain.namespace import Namespace
@@ -44,7 +45,7 @@ from datachain.project import Project
  from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
  from datachain.sql.sqlite.base import load_usearch_extension
  from datachain.sql.types import SQLType
- from datachain.utils import DataChainDir, batched_it
+ from datachain.utils import DataChainDir, batched, batched_it

  if TYPE_CHECKING:
      from sqlalchemy.dialects.sqlite import Insert
@@ -712,19 +713,21 @@ class SQLiteWarehouse(AbstractWarehouse):
      def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
          return (e.model_dump() for e in entries)

-     def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
-         rows = list(rows)
-         if not rows:
-             return
-
-         with self.db.transaction() as conn:
-             # transactions speeds up inserts significantly as there is no separate
-             # transaction created for each insert row
-             self.db.executemany(
-                 table.insert().values({f: bindparam(f) for f in rows[0]}),
-                 rows,
-                 conn=conn,
-             )
+     def insert_rows(
+         self,
+         table: Table,
+         rows: Iterable[dict[str, Any]],
+         batch_size: int = INSERT_BATCH_SIZE,
+     ) -> None:
+         for row_chunk in batched(rows, batch_size):
+             with self.db.transaction() as conn:
+                 # transactions speeds up inserts significantly as there is no separate
+                 # transaction created for each insert row
+                 self.db.executemany(
+                     table.insert().values({f: bindparam(f) for f in row_chunk[0]}),
+                     row_chunk,
+                     conn=conn,
+                 )

      def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
          dr = self.dataset_rows(dataset, version)
@@ -797,7 +800,7 @@ class SQLiteWarehouse(AbstractWarehouse):
              .limit(None)
          )

-         for batch in batched_it(ids, 10_000):
+         for batch in batched_it(ids, INSERT_BATCH_SIZE):
              batch_ids = [row[0] for row in batch]
              select_q._where_criteria = (col_id.in_(batch_ids),)
              q = table.insert().from_select(list(select_q.selected_columns), select_q)
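The mechanical rename aside, the behavioral shift here is that `SQLiteWarehouse.insert_rows` no longer materializes the whole `rows` iterable up front; it consumes it in `batch_size` chunks and opens one transaction per chunk. A minimal standalone sketch of the same pattern using plain `sqlite3` (the table name and the local `batched` helper are illustrative stand-ins for datachain's internals, not the package's own classes):

```py
import sqlite3
from itertools import islice
from typing import Any, Iterable, Iterator


def batched(rows: Iterable[dict[str, Any]], batch_size: int) -> Iterator[tuple[dict[str, Any], ...]]:
    # Same contract as datachain.utils.batched: tuples of up to batch_size items.
    it = iter(rows)
    while chunk := tuple(islice(it, batch_size)):
        yield chunk


def insert_rows(conn: sqlite3.Connection, rows: Iterable[dict[str, Any]], batch_size: int = 10_000) -> None:
    for chunk in batched(rows, batch_size):
        # One transaction per chunk: far fewer commits than one per row,
        # without buffering the entire (possibly huge) iterable in memory.
        with conn:  # commits on success, rolls back on error
            keys = list(chunk[0])
            placeholders = ", ".join(f":{k}" for k in keys)
            conn.executemany(
                f"INSERT INTO items ({', '.join(keys)}) VALUES ({placeholders})",
                chunk,
            )


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE items (id INTEGER, name TEXT)")
insert_rows(conn, ({"id": i, "name": f"n{i}"} for i in range(25_000)), batch_size=10_000)
print(conn.execute("SELECT count(*) FROM items").fetchone())  # (25000,)
```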
datachain/data_storage/warehouse.py CHANGED
@@ -43,6 +43,7 @@ if TYPE_CHECKING:
  logger = logging.getLogger("datachain")

  SELECT_BATCH_SIZE = 100_000  # number of rows to fetch at a time
+ INSERT_BATCH_SIZE = 10_000  # number of rows to insert at a time


  class AbstractWarehouse(ABC, Serializable):
@@ -415,7 +416,12 @@ class AbstractWarehouse(ABC, Serializable):
          """Convert File entries so they can be passed on to `insert_rows()`"""

      @abstractmethod
-     def insert_rows(self, table: sa.Table, rows: Iterable[dict[str, Any]]) -> None:
+     def insert_rows(
+         self,
+         table: sa.Table,
+         rows: Iterable[dict[str, Any]],
+         batch_size: int = INSERT_BATCH_SIZE,
+     ) -> None:
          """Does batch inserts of any kind of rows into table"""

      def insert_rows_done(self, table: sa.Table) -> None:
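Since the abstract signature now carries `batch_size`, every concrete warehouse backend has to accept the argument, even if it chooses to ignore it. A toy sketch of a conforming override, using a hypothetical in-memory backend rather than any real datachain class:

```py
from abc import ABC, abstractmethod
from typing import Any, Iterable

INSERT_BATCH_SIZE = 10_000  # mirrors the new module constant above


class WarehouseSketch(ABC):
    @abstractmethod
    def insert_rows(
        self,
        table: str,
        rows: Iterable[dict[str, Any]],
        batch_size: int = INSERT_BATCH_SIZE,
    ) -> None:
        """Batch-insert rows into a table."""


class InMemoryWarehouse(WarehouseSketch):
    def __init__(self) -> None:
        self.tables: dict[str, list[dict[str, Any]]] = {}

    def insert_rows(self, table, rows, batch_size=INSERT_BATCH_SIZE):
        # A toy backend has no transactions, so batching is effectively a no-op here.
        self.tables.setdefault(table, []).extend(rows)


w = InMemoryWarehouse()
w.insert_rows("items", ({"id": i} for i in range(3)))
print(len(w.tables["items"]))  # 3
```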
datachain/lib/dc/database.py CHANGED
@@ -73,7 +73,7 @@ def to_database(
      table_name: str,
      connection: "ConnectionType",
      *,
-     batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
+     batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
      on_conflict: Optional[str] = None,
      conflict_columns: Optional[list[str]] = None,
      column_mapping: Optional[dict[str, Optional[str]]] = None,
@@ -124,7 +124,7 @@ def to_database(
      table.create(conn, checkfirst=True)

      rows_iter = chain._leaf_values()
-     for batch in batched(rows_iter, batch_rows):
+     for batch in batched(rows_iter, batch_size):
          rows_affected = _process_batch(
              conn,
              table,
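For callers, the only visible change in `to_database` is the keyword rename. A hedged usage sketch (the dataset name, table name, and connection string are placeholders; the keyword follows the 0.31.0 hunk above):

```py
import datachain as dc

chain = dc.read_dataset("my_dataset")  # placeholder dataset name

# 0.30.x: chain.to_database("items", "sqlite:///export.db", batch_rows=5_000)
# 0.31.0:
chain.to_database("items", "sqlite:///export.db", batch_size=5_000)
```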
datachain/lib/dc/datachain.py CHANGED
@@ -342,15 +342,15 @@ class DataChain:

      def settings(
          self,
-         cache=None,
-         parallel=None,
-         workers=None,
-         min_task_size=None,
-         prefetch: Optional[int] = None,
-         sys: Optional[bool] = None,
+         cache: Optional[bool] = None,
+         prefetch: Optional[Union[bool, int]] = None,
+         parallel: Optional[Union[bool, int]] = None,
+         workers: Optional[int] = None,
          namespace: Optional[str] = None,
          project: Optional[str] = None,
-         batch_rows: Optional[int] = None,
+         min_task_size: Optional[int] = None,
+         batch_size: Optional[int] = None,
+         sys: Optional[bool] = None,
      ) -> "Self":
          """Change settings for chain.

@@ -359,23 +359,23 @@ class DataChain:

          Parameters:
              cache : data caching. (default=False)
+             prefetch : number of workers to use for downloading files in advance.
+                 This is enabled by default and uses 2 workers.
+                 To disable prefetching, set it to 0 or False.
              parallel : number of thread for processors. True is a special value to
                  enable all available CPUs. (default=1)
              workers : number of distributed workers. Only for Studio mode. (default=1)
-             min_task_size : minimum number of tasks. (default=1)
-             prefetch : number of workers to use for downloading files in advance.
-                 This is enabled by default and uses 2 workers.
-                 To disable prefetching, set it to 0.
              namespace : namespace name.
              project : project name.
-             batch_rows : row limit per insert to balance speed and memory usage.
+             min_task_size : minimum number of tasks. (default=1)
+             batch_size : row limit per insert to balance speed and memory usage.
                  (default=2000)

          Example:
              ```py
              chain = (
                  chain
-                 .settings(cache=True, parallel=8, batch_rows=300)
+                 .settings(cache=True, parallel=8, batch_size=300)
                  .map(laion=process_webdataset(spec=WDSLaion), params="file")
              )
              ```
@@ -385,14 +385,14 @@ class DataChain:
          settings = copy.copy(self._settings)
          settings.add(
              Settings(
-                 cache,
-                 parallel,
-                 workers,
-                 min_task_size,
-                 prefetch,
-                 namespace,
-                 project,
-                 batch_rows,
+                 cache=cache,
+                 prefetch=prefetch,
+                 parallel=parallel,
+                 workers=workers,
+                 namespace=namespace,
+                 project=project,
+                 min_task_size=min_task_size,
+                 batch_size=batch_size,
              )
          )
          return self._evolve(settings=settings, _sys=sys)
@@ -745,7 +745,7 @@ class DataChain:

          return self._evolve(
              query=self._query.add_signals(
-                 udf_obj.to_udf_wrapper(self._settings.batch_rows),
+                 udf_obj.to_udf_wrapper(self._settings.batch_size),
                  **self._settings.to_dict(),
              ),
              signal_schema=self.signals_schema | udf_obj.output,
@@ -783,7 +783,7 @@ class DataChain:
          udf_obj.prefetch = prefetch
          return self._evolve(
              query=self._query.generate(
-                 udf_obj.to_udf_wrapper(self._settings.batch_rows),
+                 udf_obj.to_udf_wrapper(self._settings.batch_size),
                  **self._settings.to_dict(),
              ),
              signal_schema=udf_obj.output,
@@ -919,7 +919,7 @@ class DataChain:
          udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
          return self._evolve(
              query=self._query.generate(
-                 udf_obj.to_udf_wrapper(self._settings.batch_rows),
+                 udf_obj.to_udf_wrapper(self._settings.batch_size),
                  partition_by=processed_partition_by,
                  **self._settings.to_dict(),
              ),
@@ -968,7 +968,7 @@ class DataChain:

          return self._evolve(
              query=self._query.add_signals(
-                 udf_obj.to_udf_wrapper(self._settings.batch_rows, batch=batch),
+                 udf_obj.to_udf_wrapper(self._settings.batch_size, batch=batch),
                  **self._settings.to_dict(),
              ),
              signal_schema=self.signals_schema | udf_obj.output,
@@ -2314,7 +2314,7 @@ class DataChain:
          table_name: str,
          connection: "ConnectionType",
          *,
-         batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
+         batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
          on_conflict: Optional[str] = None,
          conflict_columns: Optional[list[str]] = None,
          column_mapping: Optional[dict[str, Optional[str]]] = None,
@@ -2336,7 +2336,7 @@ class DataChain:
                  library. If a DBAPI2 object, only sqlite3 is supported. The user is
                  responsible for engine disposal and connection closure for the
                  SQLAlchemy connectable; str connections are closed automatically.
-             batch_rows: Number of rows to insert per batch for optimal performance.
+             batch_size: Number of rows to insert per batch for optimal performance.
                  Larger batches are faster but use more memory. Default: 10,000.
              on_conflict: Strategy for handling duplicate rows (requires table
                  constraints):
@@ -2417,7 +2417,7 @@ class DataChain:
              self,
              table_name,
              connection,
-             batch_rows=batch_rows,
+             batch_size=batch_size,
              on_conflict=on_conflict,
              conflict_columns=conflict_columns,
              column_mapping=column_mapping,
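The `settings()` rework is the most user-visible part of this release: the arguments are now typed, `prefetch` accepts `False` to disable prefetching, and `batch_rows` is renamed to `batch_size` with no alias kept for the old name. A before/after sketch based on the docstring and example above (the dataset name is a placeholder):

```py
import datachain as dc

chain = dc.read_dataset("my_dataset")  # placeholder

# 0.30.x
# chain = chain.settings(cache=True, parallel=8, batch_rows=300)

# 0.31.0
chain = chain.settings(
    cache=True,
    parallel=8,
    prefetch=False,  # False now explicitly disables prefetching (same as 0)
    batch_size=300,  # renamed from batch_rows
)
```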
datachain/lib/dc/records.py CHANGED
@@ -31,7 +31,7 @@ def read_records(

      Parameters:
          to_insert : records (or a single record) to insert. Each record is
-             a dictionary of signals and theirs values.
+             a dictionary of signals and their values.
          schema : describes chain signals and their corresponding types

      Example:
@@ -45,7 +45,6 @@ def read_records(
      """
      from datachain.query.dataset import adjust_outputs, get_col_types
      from datachain.sql.types import SQLType
-     from datachain.utils import batched

      from .datasets import read_dataset

@@ -96,7 +95,6 @@ def read_records(
          {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
      )
      records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
-     for chunk in batched(records, READ_RECORDS_BATCH_SIZE):
-         warehouse.insert_rows(table, chunk)
+     warehouse.insert_rows(table, records, batch_size=READ_RECORDS_BATCH_SIZE)
      warehouse.insert_rows_done(table)
      return read_dataset(name=dsr.full_name, session=session, settings=settings)
datachain/lib/settings.py CHANGED
@@ -1,111 +1,214 @@
+ from typing import Any, Optional, Union
+
  from datachain.lib.utils import DataChainParamsError
- from datachain.utils import DEFAULT_CHUNK_ROWS
+
+ DEFAULT_CACHE = False
+ DEFAULT_PREFETCH = 2
+ DEFAULT_BATCH_SIZE = 2_000


  class SettingsError(DataChainParamsError):
-     def __init__(self, msg):
+     def __init__(self, msg: str) -> None:
          super().__init__(f"Dataset settings error: {msg}")


  class Settings:
-     def __init__(
+     """Settings for datachain."""
+
+     _cache: Optional[bool]
+     _prefetch: Optional[int]
+     _parallel: Optional[Union[bool, int]]
+     _workers: Optional[int]
+     _namespace: Optional[str]
+     _project: Optional[str]
+     _min_task_size: Optional[int]
+     _batch_size: Optional[int]
+
+     def __init__(  # noqa: C901, PLR0912
          self,
-         cache=None,
-         parallel=None,
-         workers=None,
-         min_task_size=None,
-         prefetch=None,
-         namespace=None,
-         project=None,
-         batch_rows=None,
-     ):
-         self._cache = cache
-         self.parallel = parallel
-         self._workers = workers
-         self.min_task_size = min_task_size
-         self.prefetch = prefetch
-         self.namespace = namespace
-         self.project = project
-         self._chunk_rows = batch_rows
-
-         if not isinstance(cache, bool) and cache is not None:
-             raise SettingsError(
-                 "'cache' argument must be bool"
-                 f" while {cache.__class__.__name__} was given"
-             )
-
-         if not isinstance(parallel, int) and parallel is not None:
-             raise SettingsError(
-                 "'parallel' argument must be int or None"
-                 f" while {parallel.__class__.__name__} was given"
-             )
-
-         if (
-             not isinstance(workers, bool)
-             and not isinstance(workers, int)
-             and workers is not None
-         ):
-             raise SettingsError(
-                 "'workers' argument must be int or bool"
-                 f" while {workers.__class__.__name__} was given"
-             )
-
-         if min_task_size is not None and not isinstance(min_task_size, int):
-             raise SettingsError(
-                 "'min_task_size' argument must be int or None"
-                 f", {min_task_size.__class__.__name__} was given"
-             )
-
-         if batch_rows is not None and not isinstance(batch_rows, int):
-             raise SettingsError(
-                 "'batch_rows' argument must be int or None"
-                 f", {batch_rows.__class__.__name__} was given"
-             )
-
-         if batch_rows is not None and batch_rows <= 0:
-             raise SettingsError(
-                 "'batch_rows' argument must be positive integer"
-                 f", {batch_rows} was given"
-             )
+         cache: Optional[bool] = None,
+         prefetch: Optional[Union[bool, int]] = None,
+         parallel: Optional[Union[bool, int]] = None,
+         workers: Optional[int] = None,
+         namespace: Optional[str] = None,
+         project: Optional[str] = None,
+         min_task_size: Optional[int] = None,
+         batch_size: Optional[int] = None,
+     ) -> None:
+         if cache is None:
+             self._cache = None
+         else:
+             if not isinstance(cache, bool):
+                 raise SettingsError(
+                     "'cache' argument must be bool"
+                     f" while {cache.__class__.__name__} was given"
+                 )
+             self._cache = cache
+
+         if prefetch is None or prefetch is True:
+             self._prefetch = None
+         elif prefetch is False:
+             self._prefetch = 0  # disable prefetch (False == 0)
+         else:
+             if not isinstance(prefetch, int):
+                 raise SettingsError(
+                     "'prefetch' argument must be int or bool"
+                     f" while {prefetch.__class__.__name__} was given"
+                 )
+             if prefetch < 0:
+                 raise SettingsError(
+                     "'prefetch' argument must be non-negative integer"
+                     f", {prefetch} was given"
+                 )
+             self._prefetch = prefetch
+
+         if parallel is None or parallel is False:
+             self._parallel = None
+         elif parallel is True:
+             self._parallel = True
+         else:
+             if not isinstance(parallel, int):
+                 raise SettingsError(
+                     "'parallel' argument must be int or bool"
+                     f" while {parallel.__class__.__name__} was given"
+                 )
+             if parallel <= 0:
+                 raise SettingsError(
+                     "'parallel' argument must be positive integer"
+                     f", {parallel} was given"
+                 )
+             self._parallel = parallel
+
+         if workers is None:
+             self._workers = None
+         else:
+             if not isinstance(workers, int) or isinstance(workers, bool):
+                 raise SettingsError(
+                     "'workers' argument must be int"
+                     f" while {workers.__class__.__name__} was given"
+                 )
+             if workers <= 0:
+                 raise SettingsError(
+                     f"'workers' argument must be positive integer, {workers} was given"
+                 )
+             self._workers = workers
+
+         if namespace is None:
+             self._namespace = None
+         else:
+             if not isinstance(namespace, str):
+                 raise SettingsError(
+                     "'namespace' argument must be str"
+                     f", {namespace.__class__.__name__} was given"
+                 )
+             self._namespace = namespace
+
+         if project is None:
+             self._project = None
+         else:
+             if not isinstance(project, str):
+                 raise SettingsError(
+                     "'project' argument must be str"
+                     f", {project.__class__.__name__} was given"
+                 )
+             self._project = project
+
+         if min_task_size is None:
+             self._min_task_size = None
+         else:
+             if not isinstance(min_task_size, int) or isinstance(min_task_size, bool):
+                 raise SettingsError(
+                     "'min_task_size' argument must be int"
+                     f", {min_task_size.__class__.__name__} was given"
+                 )
+             if min_task_size <= 0:
+                 raise SettingsError(
+                     "'min_task_size' argument must be positive integer"
+                     f", {min_task_size} was given"
+                 )
+             self._min_task_size = min_task_size
+
+         if batch_size is None:
+             self._batch_size = None
+         else:
+             if not isinstance(batch_size, int) or isinstance(batch_size, bool):
+                 raise SettingsError(
+                     "'batch_size' argument must be int"
+                     f", {batch_size.__class__.__name__} was given"
+                 )
+             if batch_size <= 0:
+                 raise SettingsError(
+                     "'batch_size' argument must be positive integer"
+                     f", {batch_size} was given"
+                 )
+             self._batch_size = batch_size
+
+     @property
+     def cache(self) -> bool:
+         return self._cache if self._cache is not None else DEFAULT_CACHE
+
+     @property
+     def prefetch(self) -> Optional[int]:
+         return self._prefetch if self._prefetch is not None else DEFAULT_PREFETCH
+
+     @property
+     def parallel(self) -> Optional[Union[bool, int]]:
+         return self._parallel if self._parallel is not None else None
+
+     @property
+     def workers(self) -> Optional[int]:
+         return self._workers if self._workers is not None else None
+
+     @property
+     def namespace(self) -> Optional[str]:
+         return self._namespace if self._namespace is not None else None

      @property
-     def cache(self):
-         return self._cache if self._cache is not None else False
+     def project(self) -> Optional[str]:
+         return self._project if self._project is not None else None

      @property
-     def workers(self):
-         return self._workers if self._workers is not None else False
+     def min_task_size(self) -> Optional[int]:
+         return self._min_task_size if self._min_task_size is not None else None

      @property
-     def batch_rows(self):
-         return self._chunk_rows if self._chunk_rows is not None else DEFAULT_CHUNK_ROWS
+     def batch_size(self) -> int:
+         return self._batch_size if self._batch_size is not None else DEFAULT_BATCH_SIZE

-     def to_dict(self):
-         res = {}
+     def to_dict(self) -> dict[str, Any]:
+         res: dict[str, Any] = {}
          if self._cache is not None:
              res["cache"] = self.cache
-         if self.parallel is not None:
+         if self._prefetch is not None:
+             res["prefetch"] = self.prefetch
+         if self._parallel is not None:
              res["parallel"] = self.parallel
          if self._workers is not None:
              res["workers"] = self.workers
-         if self.min_task_size is not None:
+         if self._min_task_size is not None:
              res["min_task_size"] = self.min_task_size
-         if self.namespace is not None:
+         if self._namespace is not None:
              res["namespace"] = self.namespace
-         if self.project is not None:
+         if self._project is not None:
              res["project"] = self.project
-         if self._chunk_rows is not None:
-             res["batch_rows"] = self._chunk_rows
+         if self._batch_size is not None:
+             res["batch_size"] = self.batch_size
          return res

-     def add(self, settings: "Settings"):
-         self._cache = settings._cache or self._cache
-         self.parallel = settings.parallel or self.parallel
-         self._workers = settings._workers or self._workers
-         self.min_task_size = settings.min_task_size or self.min_task_size
-         self.namespace = settings.namespace or self.namespace
-         self.project = settings.project or self.project
-         if settings.prefetch is not None:
-             self.prefetch = settings.prefetch
-         if settings._chunk_rows is not None:
-             self._chunk_rows = settings._chunk_rows
+     def add(self, settings: "Settings") -> None:
+         if settings._cache is not None:
+             self._cache = settings._cache
+         if settings._prefetch is not None:
+             self._prefetch = settings._prefetch
+         if settings._parallel is not None:
+             self._parallel = settings._parallel
+         if settings._workers is not None:
+             self._workers = settings._workers
+         if settings._namespace is not None:
+             self._namespace = settings._namespace
+         if settings._project is not None:
+             self._project = settings._project
+         if settings._min_task_size is not None:
+             self._min_task_size = settings._min_task_size
+         if settings._batch_size is not None:
+             self._batch_size = settings._batch_size
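The rewritten `Settings` class validates every argument eagerly and keeps `None` for "not set", so `add()` can merge chained `settings()` calls by overriding only the values that were explicitly provided. A small sketch of the behavior this diff implies, importing from `datachain.lib.settings` as shown above:

```py
from datachain.lib.settings import Settings, SettingsError

base = Settings(cache=True, batch_size=500)
override = Settings(prefetch=False)   # stored as 0, i.e. prefetch disabled

base.add(override)                    # only explicitly-set fields are overridden
print(base.to_dict())                 # {'cache': True, 'prefetch': 0, 'batch_size': 500}

try:
    Settings(batch_size=0)            # must be a positive integer
except SettingsError as exc:
    print(exc)
```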
datachain/lib/udf.py CHANGED
@@ -54,23 +54,11 @@ UDFOutputSpec = Mapping[str, ColumnType]
  UDFResult = dict[str, Any]


- @attrs.define
- class UDFProperties:
-     udf: "UDFAdapter"
-
-     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
-         return self.udf.get_batching(use_partitioning)
-
-     @property
-     def batch_rows(self):
-         return self.udf.batch_rows
-
-
  @attrs.define(slots=False)
  class UDFAdapter:
      inner: "UDFBase"
      output: UDFOutputSpec
-     batch_rows: Optional[int] = None
+     batch_size: Optional[int] = None
      batch: int = 1

      def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
@@ -83,11 +71,6 @@ class UDFAdapter:
              return Batch(self.batch)
          raise ValueError(f"invalid batch size {self.batch}")

-     @property
-     def properties(self):
-         # For backwards compatibility.
-         return UDFProperties(self)
-
      def run(
          self,
          udf_fields: "Sequence[str]",
@@ -237,13 +220,13 @@ class UDFBase(AbstractUDF):

      def to_udf_wrapper(
          self,
-         batch_rows: Optional[int] = None,
+         batch_size: Optional[int] = None,
          batch: int = 1,
      ) -> UDFAdapter:
          return UDFAdapter(
              self,
              self.output.to_udf_spec(),
-             batch_rows,
+             batch_size,
              batch,
          )
datachain/query/batch.py CHANGED
@@ -81,8 +81,8 @@ class Batch(BatchingStrategy):
          # select rows in batches
          results = []

-         with contextlib.closing(execute(query, page_size=page_size)) as batch_rows:
-             for row in batch_rows:
+         with contextlib.closing(execute(query, page_size=page_size)) as rows:
+             for row in rows:
                  results.append(row)
                  if len(results) >= self.count:
                      batch, results = results[: self.count], results[self.count :]
datachain/query/dataset.py CHANGED
@@ -55,7 +55,6 @@ from datachain.query.udf import UdfInfo
  from datachain.sql.functions.random import rand
  from datachain.sql.types import SQLType
  from datachain.utils import (
-     batched,
      determine_processes,
      determine_workers,
      filtered_cloudpickle_dumps,
@@ -334,10 +333,10 @@ def process_udf_outputs(
      udf_results: Iterator[Iterable["UDFResult"]],
      udf: "UDFAdapter",
      cb: Callback = DEFAULT_CALLBACK,
+     batch_size: int = INSERT_BATCH_SIZE,
  ) -> None:
      # Optimization: Compute row types once, rather than for every row.
      udf_col_types = get_col_types(warehouse, udf.output)
-     batch_rows = udf.batch_rows or INSERT_BATCH_SIZE

      def _insert_rows():
          for udf_output in udf_results:
@@ -349,9 +348,7 @@ def process_udf_outputs(
              cb.relative_update()
              yield adjust_outputs(warehouse, row, udf_col_types)

-     for row_chunk in batched(_insert_rows(), batch_rows):
-         warehouse.insert_rows(udf_table, row_chunk)
-
+     warehouse.insert_rows(udf_table, _insert_rows(), batch_size=batch_size)
      warehouse.insert_rows_done(udf_table)


@@ -388,12 +385,13 @@ class UDFStep(Step, ABC):
      udf: "UDFAdapter"
      catalog: "Catalog"
      partition_by: Optional[PartitionByType] = None
+     is_generator = False
+     # Parameters from Settings
+     cache: bool = False
      parallel: Optional[int] = None
      workers: Union[bool, int] = False
      min_task_size: Optional[int] = None
-     is_generator = False
-     cache: bool = False
-     batch_rows: Optional[int] = None
+     batch_size: Optional[int] = None

      @abstractmethod
      def create_udf_table(self, query: Select) -> "Table":
@@ -450,6 +448,7 @@ class UDFStep(Step, ABC):
                  use_cache=self.cache,
                  is_generator=self.is_generator,
                  min_task_size=self.min_task_size,
+                 batch_size=self.batch_size,
              )
              udf_distributor()
              return
@@ -486,6 +485,7 @@ class UDFStep(Step, ABC):
                  is_generator=self.is_generator,
                  cache=self.cache,
                  rows_total=rows_total,
+                 batch_size=self.batch_size or INSERT_BATCH_SIZE,
              )

              # Run the UDFDispatcher in another process to avoid needing
@@ -534,6 +534,7 @@ class UDFStep(Step, ABC):
                      udf_results,
                      self.udf,
                      cb=generated_cb,
+                     batch_size=self.batch_size or INSERT_BATCH_SIZE,
                  )
              finally:
                  download_cb.close()
@@ -595,7 +596,7 @@ class UDFStep(Step, ABC):
              parallel=self.parallel,
              workers=self.workers,
              min_task_size=self.min_task_size,
-             batch_rows=self.batch_rows,
+             batch_size=self.batch_size,
          )
          return self.__class__(self.udf, self.catalog)

@@ -641,7 +642,16 @@

  @frozen
  class UDFSignal(UDFStep):
+     udf: "UDFAdapter"
+     catalog: "Catalog"
+     partition_by: Optional[PartitionByType] = None
      is_generator = False
+     # Parameters from Settings
+     cache: bool = False
+     parallel: Optional[int] = None
+     workers: Union[bool, int] = False
+     min_task_size: Optional[int] = None
+     batch_size: Optional[int] = None

      def create_udf_table(self, query: Select) -> "Table":
          udf_output_columns: list[sqlalchemy.Column[Any]] = [
@@ -711,7 +721,16 @@ class UDFSignal(UDFStep):
  class RowGenerator(UDFStep):
      """Extend dataset with new rows."""

+     udf: "UDFAdapter"
+     catalog: "Catalog"
+     partition_by: Optional[PartitionByType] = None
      is_generator = True
+     # Parameters from Settings
+     cache: bool = False
+     parallel: Optional[int] = None
+     workers: Union[bool, int] = False
+     min_task_size: Optional[int] = None
+     batch_size: Optional[int] = None

      def create_udf_table(self, query: Select) -> "Table":
          warehouse = self.catalog.warehouse
@@ -1626,12 +1645,17 @@ class DatasetQuery:
      def add_signals(
          self,
          udf: "UDFAdapter",
+         partition_by: Optional[PartitionByType] = None,
+         # Parameters from Settings
+         cache: bool = False,
          parallel: Optional[int] = None,
          workers: Union[bool, int] = False,
          min_task_size: Optional[int] = None,
-         partition_by: Optional[PartitionByType] = None,
-         cache: bool = False,
-         batch_rows: Optional[int] = None,
+         batch_size: Optional[int] = None,
+         # Parameters are unused, kept only to match the signature of Settings.to_dict
+         prefetch: Optional[int] = None,
+         namespace: Optional[str] = None,
+         project: Optional[str] = None,
      ) -> "Self":
          """
          Adds one or more signals based on the results from the provided UDF.
@@ -1657,7 +1681,7 @@ class DatasetQuery:
                  workers=workers,
                  min_task_size=min_task_size,
                  cache=cache,
-                 batch_rows=batch_rows,
+                 batch_size=batch_size,
              )
          )
          return query
@@ -1672,14 +1696,17 @@ class DatasetQuery:
      def generate(
          self,
          udf: "UDFAdapter",
+         partition_by: Optional[PartitionByType] = None,
+         # Parameters from Settings
+         cache: bool = False,
          parallel: Optional[int] = None,
          workers: Union[bool, int] = False,
          min_task_size: Optional[int] = None,
-         partition_by: Optional[PartitionByType] = None,
+         batch_size: Optional[int] = None,
+         # Parameters are unused, kept only to match the signature of Settings.to_dict:
+         prefetch: Optional[int] = None,
          namespace: Optional[str] = None,
          project: Optional[str] = None,
-         cache: bool = False,
-         batch_rows: Optional[int] = None,
      ) -> "Self":
          query = self.clone()
          steps = query.steps
@@ -1692,7 +1719,7 @@ class DatasetQuery:
                  workers=workers,
                  min_task_size=min_task_size,
                  cache=cache,
-                 batch_rows=batch_rows,
+                 batch_size=batch_size,
              )
          )
          return query
datachain/query/dispatch.py CHANGED
@@ -114,6 +114,7 @@ class UDFDispatcher:
          self.is_batching = udf_info["batching"].is_batching
          self.processes = udf_info["processes"]
          self.rows_total = udf_info["rows_total"]
+         self.batch_size = udf_info["batch_size"]
          self.buffer_size = buffer_size
          self.task_queue = None
          self.done_queue = None
@@ -142,6 +143,7 @@ class UDFDispatcher:
              self.table,
              self.cache,
              self.is_batching,
+             self.batch_size,
              self.udf_fields,
          )

@@ -232,6 +234,7 @@ class UDFDispatcher:
              udf_results,
              udf,
              cb=generated_cb,
+             batch_size=self.batch_size,
          )

      def input_batch_size(self, n_workers: int) -> int:
@@ -385,6 +388,7 @@ class UDFWorker:
          table: "Table",
          cache: bool,
          is_batching: bool,
+         batch_size: int,
          udf_fields: Sequence[str],
      ) -> None:
          self.catalog = catalog
@@ -395,6 +399,7 @@ class UDFWorker:
          self.table = table
          self.cache = cache
          self.is_batching = is_batching
+         self.batch_size = batch_size
          self.udf_fields = udf_fields

          self.download_cb = DownloadCallback(self.done_queue)
@@ -420,6 +425,7 @@ class UDFWorker:
              self.notify_and_process(udf_results),
              self.udf,
              cb=self.generated_cb,
+             batch_size=self.batch_size,
          )
          put_into_queue(self.done_queue, {"status": FINISHED_STATUS})

datachain/query/udf.py CHANGED
@@ -21,6 +21,7 @@ class UdfInfo(TypedDict):
      is_generator: bool
      cache: bool
      rows_total: int
+     batch_size: int


  class AbstractUDFDistributor(ABC):
@@ -39,6 +40,7 @@ class AbstractUDFDistributor(ABC):
          use_cache: bool,
          is_generator: bool = False,
          min_task_size: Optional[Union[str, int]] = None,
+         batch_size: Optional[int] = None,
      ) -> None: ...

      @abstractmethod
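`UdfInfo` is the payload handed to the dispatcher and worker processes, so adding `batch_size` here is what lets `UDFDispatcher` and `UDFWorker` (hunks above) thread the value through to `process_udf_outputs`. A minimal sketch of the shape, restricted to the fields relevant to this change (the real TypedDict has more keys):

```py
from typing import TypedDict


class UdfInfoSketch(TypedDict):
    # Subset of datachain.query.udf.UdfInfo that matters for this release.
    is_generator: bool
    cache: bool
    rows_total: int
    batch_size: int  # new in 0.31.0


info: UdfInfoSketch = {
    "is_generator": False,
    "cache": False,
    "rows_total": 1_000,
    "batch_size": 10_000,  # falls back to INSERT_BATCH_SIZE when settings leave it unset
}
print(info["batch_size"])
```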
datachain/remote/studio.py CHANGED
@@ -1,4 +1,3 @@
- import base64
  import json
  import logging
  import os
@@ -7,6 +6,7 @@ from datetime import datetime, timedelta, timezone
  from struct import unpack
  from typing import (
      Any,
+     BinaryIO,
      Generic,
      Optional,
      TypeVar,
@@ -30,8 +30,9 @@ DatasetExportStatus = Optional[dict[str, Any]]
  DatasetExportSignedUrls = Optional[list[str]]
  FileUploadData = Optional[dict[str, Any]]
  JobData = Optional[dict[str, Any]]
- JobListData = dict[str, Any]
- ClusterListData = dict[str, Any]
+ JobListData = list[dict[str, Any]]
+ ClusterListData = list[dict[str, Any]]
+
  logger = logging.getLogger("datachain")

  DATASET_ROWS_CHUNK_SIZE = 8192
@@ -239,6 +240,45 @@ class StudioClient:

          return Response(data, ok, message, response.status_code)

+     def _send_multipart_request(
+         self, route: str, files: dict[str, Any], params: Optional[dict[str, Any]] = None
+     ) -> Response[Any]:
+         """
+         Function that communicates with Studio API using multipart/form-data.
+         It will raise an exception, and try to retry, if 5xx status code is
+         returned, or if Timeout exceptions is thrown from the requests lib
+         """
+         import requests
+
+         # Add team_name to params
+         request_params = {**(params or {}), "team_name": self.team}
+
+         response = requests.post(
+             url=f"{self.url}/{route}",
+             files=files,
+             params=request_params,
+             headers={
+                 "Authorization": f"token {self.token}",
+             },
+             timeout=self.timeout,
+         )
+
+         ok = response.ok
+         try:
+             data = json.loads(response.content.decode("utf-8"))
+         except json.decoder.JSONDecodeError:
+             data = {}
+
+         if not ok:
+             if response.status_code == 403:
+                 message = f"Not authorized for the team {self.team}"
+             else:
+                 message = data.get("message", "")
+         else:
+             message = ""
+
+         return Response(data, ok, message, response.status_code)
+
      @staticmethod
      def _unpacker_hook(code, data):
          import msgpack
@@ -409,12 +449,13 @@ class StudioClient:
              method="GET",
          )

-     def upload_file(self, content: bytes, file_name: str) -> Response[FileUploadData]:
-         data = {
-             "file_content": base64.b64encode(content).decode("utf-8"),
-             "file_name": file_name,
-         }
-         return self._send_request("datachain/upload-file", data)
+     def upload_file(
+         self, file_obj: BinaryIO, file_name: str
+     ) -> Response[FileUploadData]:
+         # Prepare multipart form data
+         files = {"file": (file_name, file_obj, "application/octet-stream")}
+
+         return self._send_multipart_request("datachain/jobs/files", files)

      def create_job(
          self,
@@ -449,25 +490,27 @@ class StudioClient:
              "cron_expression": cron,
              "credentials_name": credentials_name,
          }
-         return self._send_request("datachain/job", data)
+         return self._send_request("datachain/jobs/", data)

      def get_jobs(
          self,
          status: Optional[str] = None,
          limit: int = 20,
+         job_id: Optional[str] = None,
      ) -> Response[JobListData]:
-         return self._send_request(
-             "datachain/jobs",
-             {"status": status, "limit": limit} if status else {"limit": limit},
-             method="GET",
-         )
+         params: dict[str, Any] = {"limit": limit}
+         if status is not None:
+             params["status"] = status
+         if job_id is not None:
+             params["job_id"] = job_id
+         return self._send_request("datachain/jobs/", params, method="GET")

      def cancel_job(
          self,
          job_id: str,
      ) -> Response[JobData]:
-         url = f"datachain/job/{job_id}/cancel"
+         url = f"datachain/jobs/{job_id}/cancel"
          return self._send_request(url, data={}, method="POST")

      def get_clusters(self) -> Response[ClusterListData]:
-         return self._send_request("datachain/clusters", {}, method="GET")
+         return self._send_request("datachain/clusters/", {}, method="GET")
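The upload path switches from a base64-encoded JSON payload to streaming multipart/form-data against the new `datachain/jobs/files` route. A hedged sketch of the underlying `requests` call that `_send_multipart_request` issues, outside of datachain (URL, team name, and token are placeholders):

```py
import requests

url = "https://studio.example.com/api/datachain/jobs/files"  # placeholder Studio URL
with open("train.py", "rb") as f:
    response = requests.post(
        url,
        files={"file": ("train.py", f, "application/octet-stream")},  # streamed, not base64
        params={"team_name": "my-team"},              # placeholder team
        headers={"Authorization": "token <token>"},   # placeholder token
        timeout=30,
    )
print(response.status_code)
```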
datachain/studio.py CHANGED
@@ -403,14 +403,14 @@ def create_job(
      if not response.data:
          raise DataChainError("Failed to create job")

-     job_id = response.data.get("job", {}).get("id")
+     job_id = response.data.get("id")

      if parsed_start_time or cron:
          print(f"Job {job_id} is scheduled as a task in Studio.")
          return 0

      print(f"Job {job_id} created")
-     print("Open the job in Studio at", response.data.get("job", {}).get("url"))
+     print("Open the job in Studio at", response.data.get("url"))
      print("=" * 40)

      return 0 if no_wait else show_logs_from_client(client, job_id)
@@ -421,16 +421,14 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
      for file in files:
          file_name = os.path.basename(file)
          with open(file, "rb") as f:
-             file_content = f.read()
-             response = client.upload_file(file_content, file_name)
+             response = client.upload_file(f, file_name)
          if not response.ok:
              raise DataChainError(response.message)

          if not response.data:
              raise DataChainError(f"Failed to upload file {file_name}")

-         file_id = response.data.get("blob", {}).get("id")
-         if file_id:
+         if file_id := response.data.get("id"):
              file_ids.append(str(file_id))
      return file_ids

@@ -456,7 +454,7 @@ def list_jobs(status: Optional[str], team_name: Optional[str], limit: int):
      if not response.ok:
          raise DataChainError(response.message)

-     jobs = response.data.get("jobs", [])
+     jobs = response.data or []
      if not jobs:
          print("No jobs found")
          return
@@ -492,7 +490,7 @@ def list_clusters(team_name: Optional[str]):
      if not response.ok:
          raise DataChainError(response.message)

-     clusters = response.data.get("clusters", [])
+     clusters = response.data or []
      if not clusters:
          print("No clusters found")
          return
@@ -505,6 +503,7 @@ def list_clusters(team_name: Optional[str]):
              "Cloud Provider": cluster.get("cloud_provider"),
              "Cloud Credentials": cluster.get("cloud_credentials"),
              "Is Active": cluster.get("is_active"),
+             "Is Default": cluster.get("default"),
              "Max Workers": cluster.get("max_workers"),
          }
          for cluster in clusters
datachain/utils.py CHANGED
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
      from typing_extensions import Self


- DEFAULT_CHUNK_ROWS = 2000
+ DEFAULT_BATCH_SIZE = 2000

  logger = logging.getLogger("datachain")

@@ -228,7 +228,7 @@ _T_co = TypeVar("_T_co", covariant=True)

  def _dynamic_batched_core(
      iterable: Iterable[_T_co],
-     batch_rows: int,
+     batch_size: int,
  ) -> Iterator[list[_T_co]]:
      """Core batching logic that yields lists."""

@@ -236,7 +236,7 @@ def _dynamic_batched_core(

      for item in iterable:
          # Check if adding this item would exceed limits
-         if len(batch) >= batch_rows and batch:  # Yield current batch if we have one
+         if len(batch) >= batch_size and batch:  # Yield current batch if we have one
              yield batch
              batch = []

@@ -247,23 +247,22 @@ def _dynamic_batched_core(
          yield batch


- def batched(iterable: Iterable[_T_co], batch_rows: int) -> Iterator[tuple[_T_co, ...]]:
+ def batched(iterable: Iterable[_T_co], batch_size: int) -> Iterator[tuple[_T_co, ...]]:
      """
-     Batch data into tuples of length batch_rows .
+     Batch data into tuples of length batch_size.
      The last batch may be shorter.
      """
-     yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_rows))
+     yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_size))


  def batched_it(
      iterable: Iterable[_T_co],
-     batch_rows: int = DEFAULT_CHUNK_ROWS,
+     batch_size: int = DEFAULT_BATCH_SIZE,
  ) -> Iterator[Iterator[_T_co]]:
      """
-     Batch data into iterators with dynamic sizing
-     based on row count and memory usage.
+     Batch data into iterators with dynamic sizing based on row count and memory usage.
      """
-     yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_rows))
+     yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_size))


  def flatten(items):
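`batched` and `batched_it` keep their behavior; only the parameter name (`batch_rows` → `batch_size`) and the module constant (`DEFAULT_CHUNK_ROWS` → `DEFAULT_BATCH_SIZE`) change. A quick usage sketch of the two helpers as defined above (expected output shown in comments):

```py
from datachain.utils import batched, batched_it

data = range(7)

print([chunk for chunk in batched(data, batch_size=3)])
# [(0, 1, 2), (3, 4, 5), (6,)]

print([list(chunk) for chunk in batched_it(data, batch_size=3)])
# [[0, 1, 2], [3, 4, 5], [6]]
```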
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datachain
- Version: 0.30.6
+ Version: 0.31.0
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
  License-Expression: Apache-2.0
@@ -17,9 +17,9 @@ datachain/project.py,sha256=90D4GpJSA3t0fayYZbzrL3sk4U7EJhQo8psnWvdI7_o,2280
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
  datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
- datachain/studio.py,sha256=27750qCSNxIChEzhV02damIFreLMfr7UdiWqMFyk8AA,15361
+ datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
- datachain/utils.py,sha256=RKe1-VuC9juQSIbIpMnELJ7QrsKQggj8l7Q8_FiCZHE,15664
+ datachain/utils.py,sha256=5ehFeqXau7MFmGUQRsjRyPfDMPoOF1ojpfVciYUo5fE,15659
  datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
  datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
@@ -52,8 +52,8 @@ datachain/data_storage/job.py,sha256=ZkeXCNUj_VCkoKYx29hqB4AcfVUielnRjY-GYUcUxt4
  datachain/data_storage/metastore.py,sha256=aSeTRh43hmrOhULi9YD2VlgCj8B4bjE3jqCOvnb_HQs,53851
  datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
- datachain/data_storage/sqlite.py,sha256=edcTegzEoAEdEp62Rg9oERvHWXDcpg8d4onrD-P2xKM,30159
- datachain/data_storage/warehouse.py,sha256=sEbNiWKdB7yuLt88FuIfRur7U7WiOZrcHWhnBS_eMAg,32642
+ datachain/data_storage/sqlite.py,sha256=1fIeIhmB3O8oQVzP8dDKap0KUIgI0n2TdBQSyv0R8J4,30345
+ datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
  datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -85,11 +85,11 @@ datachain/lib/model_store.py,sha256=A0pSVQ7uaZ9RvANapzirF8Cqq9N6ysosPpMSkzdRPkU,
  datachain/lib/namespaces.py,sha256=I6gLC4ZzgyatFtHL85MWR4ml7-yuQOzxHE7IQNbt_ac,2107
  datachain/lib/projects.py,sha256=VJgmzHzKjmNPZD1tm0a1RNHmUQwn6WLWCLpKyc4UrSk,2605
  datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
- datachain/lib/settings.py,sha256=n0YYhCVdgCdMkCSLY7kscJF9mUhlQ0a4ENWBsJFynkw,3809
+ datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
  datachain/lib/signal_schema.py,sha256=YMMcc9gHIzBz88zfsreGa1nOoO_56HBtZlT6jf3V1WE,39224
  datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
- datachain/lib/udf.py,sha256=IB1IKF5KyA-NiyfhVzmBPpF_aITPS3zSlrt24f_Ofjo,17956
+ datachain/lib/udf.py,sha256=08ia5T3gClen5ZQfIgop-swNnys2G-RIZpszqDnbc0w,17570
  datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
  datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
  datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
@@ -103,15 +103,15 @@ datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sD
  datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
  datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
  datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
- datachain/lib/dc/database.py,sha256=F6EOjPKwSdp26kJsOKGq49D9OxqyKEalINHEwLQav2s,14716
- datachain/lib/dc/datachain.py,sha256=2UtDhtBzx5VejkDE0UTS3t1517jCGr7YEKvO5wqNU-Q,99709
+ datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
+ datachain/lib/dc/datachain.py,sha256=AtsvBndqMyKrfW4yH8V0Nf__hfR0LN-NpA2munzfiPM,99888
  datachain/lib/dc/datasets.py,sha256=-Bvyyu4XXDXLiWa-bOnsp0Q11RSYXRO0j5DaX8ShaFs,15355
  datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
  datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
  datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
  datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
  datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
- datachain/lib/dc/records.py,sha256=4N1Fq-j5r4GK-PR5jIO-9B2u_zTNX9l-6SmcRhQDAsw,3136
+ datachain/lib/dc/records.py,sha256=IKf5MArify-cI1P4NgbIvrAi0UQ5cvofTI3u6_zKBP8,3069
  datachain/lib/dc/storage.py,sha256=OMJE-9ob9Ku5le8W6O8J1W-XJ0pwHt2PsO-ZCcee1ZA,7950
  datachain/lib/dc/utils.py,sha256=9OMiFu2kXIbtMqzJTEr1qbCoCBGpOmTnkWImVgFTKgo,4112
  datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
@@ -125,18 +125,18 @@ datachain/model/ultralytics/bbox.py,sha256=C-aDiBhVa_ML2oERWvksRkyMU1XuYSpb6eItH
  datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigFYNZWUA,3392
  datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
- datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
- datachain/query/dataset.py,sha256=OaGRBNSWYNaRbYn6avij0fiFN5DT-nwdM-wJ4yTfaYs,63317
- datachain/query/dispatch.py,sha256=f8IIvuLBJaCEwSRv7bWPMy1uXyc28W0LGqrBffjYf98,15831
+ datachain/query/batch.py,sha256=ocPeNgrJM6Y_6SYCx3O2cwlCFAhNMfoYgB99GP6A1Bg,4294
+ datachain/query/dataset.py,sha256=1eg5EE4vKI7c_Ng04or6zzKmFcOoEubMCoOaYmYPavE,64499
+ datachain/query/dispatch.py,sha256=pygp7xg3lUDKlYHhecKxW5fB3zOSX1fPJfZBU4dfijk,16067
  datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
  datachain/query/queue.py,sha256=v0UeK4ilmdiRoJ5OdjB5qpnHTYDxRP4vhVp5Iw_toaI,3512
  datachain/query/schema.py,sha256=qLpEyvnzKlNCOrThQiTNpUKTUEsVIHT9trt-0UMt6ko,6704
  datachain/query/session.py,sha256=gKblltJAVQAVSTswAgWGDgGbpmFlFzFVkIQojDCjgXM,6809
- datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
+ datachain/query/udf.py,sha256=jqutTpvkT6eHl96ZEgYiiTMAhI7vmTQA6JH9y4WCibI,1405
  datachain/query/utils.py,sha256=a2PTBZ3qsG6XlUcp9XsoGiQfKkca4Q3m-VzFgiGQPAc,1230
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datachain/remote/studio.py,sha256=pDThxvEEpIKVGfa9rmtz_zeqHwrgzh0Lv-Pd4wzDx5k,15448
+ datachain/remote/studio.py,sha256=amjcV0B8qumsVBnxPQnt8oSrnfMK2vAdOurVMA9L_zA,16868
  datachain/sql/__init__.py,sha256=8D2omsBiATt8bjLjGo6jBEtaKEkOlnlNFWhVryHMDv0,388
  datachain/sql/postgresql_dialect.py,sha256=pDTfH8xaXz5xZsq8O1aQUvWLRIv_ogYeAqtmKlPp3Rw,280
  datachain/sql/postgresql_types.py,sha256=ryb_0lzuA9UOJ_B6nW9Yb8nJjzeSmEItAL_Ceue65lc,627
@@ -160,9 +160,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
- datachain-0.30.6.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
- datachain-0.30.6.dist-info/METADATA,sha256=ZyXo8wdTrN08k--Soy3UHpCu_Jni_6ocO3_PbjCswCE,13898
- datachain-0.30.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- datachain-0.30.6.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
- datachain-0.30.6.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
- datachain-0.30.6.dist-info/RECORD,,
+ datachain-0.31.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+ datachain-0.31.0.dist-info/METADATA,sha256=hY_KVFdUHZmZcxRiy5e-GY6CXI-sY0oKAtrvNakApdY,13898
+ datachain-0.31.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ datachain-0.31.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+ datachain-0.31.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+ datachain-0.31.0.dist-info/RECORD,,