datachain 0.30.5__py3-none-any.whl → 0.30.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/cli/commands/datasets.py +32 -17
- datachain/data_storage/sqlite.py +18 -15
- datachain/data_storage/warehouse.py +7 -1
- datachain/delta.py +36 -20
- datachain/lib/dc/database.py +2 -2
- datachain/lib/dc/datachain.py +36 -28
- datachain/lib/dc/datasets.py +4 -0
- datachain/lib/dc/records.py +2 -4
- datachain/lib/dc/storage.py +5 -0
- datachain/lib/settings.py +188 -85
- datachain/lib/udf.py +3 -20
- datachain/query/batch.py +2 -2
- datachain/query/dataset.py +44 -17
- datachain/query/dispatch.py +6 -0
- datachain/query/udf.py +2 -0
- datachain/utils.py +9 -10
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/METADATA +1 -1
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/RECORD +22 -22
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/top_level.txt +0 -0
datachain/cli/commands/datasets.py CHANGED

@@ -1,30 +1,41 @@
 import sys
-from
+from collections.abc import Iterable, Iterator
+from typing import TYPE_CHECKING, Optional, Union
 
 from tabulate import tabulate
 
-
-from datachain.catalog import Catalog
-
+from datachain import semver
 from datachain.catalog import is_namespace_local
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
 from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio
 
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
+
+def group_dataset_versions(
+    datasets: Iterable[tuple[str, str]], latest_only=True
+) -> dict[str, Union[str, list[str]]]:
+    grouped: dict[str, list[tuple[int, int, int]]] = {}
 
-def group_dataset_versions(datasets, latest_only=True):
-    grouped = {}
     # Sort to ensure groupby works as expected
     # (groupby expects consecutive items with the same key)
     for name, version in sorted(datasets):
-        grouped.setdefault(name, []).append(version)
+        grouped.setdefault(name, []).append(semver.parse(version))
 
     if latest_only:
         # For each dataset name, pick the highest version.
-        return {
+        return {
+            name: semver.create(*(max(versions))) for name, versions in grouped.items()
+        }
+
     # For each dataset name, return a sorted list of unique versions.
-    return {
+    return {
+        name: [semver.create(*v) for v in sorted(set(versions))]
+        for name, versions in grouped.items()
+    }
 
 
 def list_datasets(
@@ -35,7 +46,7 @@ def list_datasets(
     team: Optional[str] = None,
     latest_only: bool = True,
     name: Optional[str] = None,
-):
+) -> None:
     token = Config().read().get("studio", {}).get("token")
     all, local, studio = determine_flavors(studio, local, all, token)
     if name:
@@ -95,27 +106,31 @@ def list_datasets(
         print(tabulate(rows, headers="keys"))
 
 
-def list_datasets_local(
+def list_datasets_local(
+    catalog: "Catalog", name: Optional[str] = None
+) -> Iterator[tuple[str, str]]:
     if name:
         yield from list_datasets_local_versions(catalog, name)
         return
 
     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield
+            yield d.full_name, v.version
 
 
-def list_datasets_local_versions(
+def list_datasets_local_versions(
+    catalog: "Catalog", name: str
+) -> Iterator[tuple[str, str]]:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)
 
     ds = catalog.get_dataset(
         name, namespace_name=namespace_name, project_name=project_name
     )
     for v in ds.versions:
-        yield
+        yield name, v.version
 
 
-def _datasets_tabulate_row(name, both, local_version, studio_version):
+def _datasets_tabulate_row(name, both, local_version, studio_version) -> dict[str, str]:
     row = {
         "Name": name,
     }
@@ -136,7 +151,7 @@ def rm_dataset(
     force: Optional[bool] = False,
     studio: Optional[bool] = False,
     team: Optional[str] = None,
-):
+) -> None:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)
 
     if studio:
@@ -166,7 +181,7 @@ def edit_dataset(
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
     team: Optional[str] = None,
-):
+) -> None:
     from datachain.lib.dc.utils import is_studio
 
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)
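
The substantive change in `group_dataset_versions` is that versions are now compared numerically through `semver.parse`/`semver.create` instead of as raw strings. A self-contained sketch of why that matters, with a local `parse` helper standing in for datachain's `semver` module:

```python
def parse(version: str) -> tuple[int, int, int]:
    # stand-in for datachain's semver.parse
    major, minor, patch = (int(part) for part in version.split("."))
    return major, minor, patch

versions = ["1.9.0", "1.10.0"]
print(max(versions))                    # '1.9.0'  -- lexicographic order, wrong
print(max(parse(v) for v in versions))  # (1, 10, 0) -- numeric order, correct
```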

datachain/data_storage/sqlite.py CHANGED

@@ -37,6 +37,7 @@ from datachain import semver
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
+from datachain.data_storage.warehouse import INSERT_BATCH_SIZE
 from datachain.dataset import DatasetRecord, StorageURI
 from datachain.error import DataChainError, OutdatedDatabaseSchemaError
 from datachain.namespace import Namespace
@@ -44,7 +45,7 @@ from datachain.project import Project
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
-from datachain.utils import DataChainDir, batched_it
+from datachain.utils import DataChainDir, batched, batched_it
 
 if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
@@ -712,19 +713,21 @@ class SQLiteWarehouse(AbstractWarehouse):
     def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
         return (e.model_dump() for e in entries)
 
-    def insert_rows(
-
-
-
-
-
-
-
-
-
-
-
-
+    def insert_rows(
+        self,
+        table: Table,
+        rows: Iterable[dict[str, Any]],
+        batch_size: int = INSERT_BATCH_SIZE,
+    ) -> None:
+        for row_chunk in batched(rows, batch_size):
+            with self.db.transaction() as conn:
+                # transactions speeds up inserts significantly as there is no separate
+                # transaction created for each insert row
+                self.db.executemany(
+                    table.insert().values({f: bindparam(f) for f in row_chunk[0]}),
+                    row_chunk,
+                    conn=conn,
+                )
 
     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
         dr = self.dataset_rows(dataset, version)
@@ -797,7 +800,7 @@ class SQLiteWarehouse(AbstractWarehouse):
             .limit(None)
         )
 
-        for batch in batched_it(ids,
+        for batch in batched_it(ids, INSERT_BATCH_SIZE):
            batch_ids = [row[0] for row in batch]
             select_q._where_criteria = (col_id.in_(batch_ids),)
             q = table.insert().from_select(list(select_q.selected_columns), select_q)
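
The new `insert_rows` wraps each chunk in a single transaction, as its inline comment notes. The same pattern, reduced to a runnable stdlib-only sketch (the table and column names are illustrative):

```python
import sqlite3
from itertools import islice

def batched(iterable, n):
    # minimal stand-in for datachain.utils.batched: yield lists of size <= n
    it = iter(iterable)
    while chunk := list(islice(it, n)):
        yield chunk

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE rows (id INTEGER, name TEXT)")

rows = ({"id": i, "name": f"row-{i}"} for i in range(25_000))
for chunk in batched(rows, 10_000):  # mirrors INSERT_BATCH_SIZE
    with conn:  # one transaction per chunk, not one per row
        conn.executemany("INSERT INTO rows (id, name) VALUES (:id, :name)", chunk)

print(conn.execute("SELECT count(*) FROM rows").fetchone())  # (25000,)
```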

datachain/data_storage/warehouse.py CHANGED

@@ -43,6 +43,7 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")
 
 SELECT_BATCH_SIZE = 100_000  # number of rows to fetch at a time
+INSERT_BATCH_SIZE = 10_000  # number of rows to insert at a time
 
 
 class AbstractWarehouse(ABC, Serializable):
@@ -415,7 +416,12 @@ class AbstractWarehouse(ABC, Serializable):
         """Convert File entries so they can be passed on to `insert_rows()`"""
 
     @abstractmethod
-    def insert_rows(
+    def insert_rows(
+        self,
+        table: sa.Table,
+        rows: Iterable[dict[str, Any]],
+        batch_size: int = INSERT_BATCH_SIZE,
+    ) -> None:
         """Does batch inserts of any kind of rows into table"""
 
     def insert_rows_done(self, table: sa.Table) -> None:
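
The hunk above pins down the batched-insert contract in the abstract base so every backend shares the same `batch_size` default. A generic sketch of that shape (the class names are illustrative, not datachain's):

```python
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import Any

INSERT_BATCH_SIZE = 10_000  # shared default, as in the diff

class Warehouse(ABC):
    @abstractmethod
    def insert_rows(
        self,
        table: str,
        rows: Iterable[dict[str, Any]],
        batch_size: int = INSERT_BATCH_SIZE,
    ) -> None:
        """Does batch inserts of any kind of rows into table."""

class ListWarehouse(Warehouse):
    """Toy backend: a real one would chunk `rows` by batch_size."""

    def __init__(self) -> None:
        self.tables: dict[str, list[dict[str, Any]]] = {}

    def insert_rows(self, table, rows, batch_size=INSERT_BATCH_SIZE):
        self.tables.setdefault(table, []).extend(rows)

wh = ListWarehouse()
wh.insert_rows("files", [{"id": 1}, {"id": 2}])
```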

datachain/delta.py CHANGED

@@ -4,7 +4,7 @@ from functools import wraps
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
 
 import datachain
-from datachain.dataset import DatasetDependency
+from datachain.dataset import DatasetDependency, DatasetRecord
 from datachain.error import DatasetNotFoundError
 from datachain.project import Project
 
@@ -30,9 +30,10 @@ def delta_disabled(
 
     @wraps(method)
     def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
-        if self.delta:
+        if self.delta and not self._delta_unsafe:
             raise NotImplementedError(
-                f"
+                f"Cannot use {method.__name__} with delta datasets - may cause"
+                " inconsistency. Use delta_unsafe flag to allow this operation."
             )
         return method(self, *args, **kwargs)
 
@@ -124,10 +125,19 @@ def _get_retry_chain(
     # Subtract also diff chain since some items might be picked
     # up by `delta=True` itself (e.g. records got modified AND are missing in the
     # result dataset atm)
-
+    on = [on] if isinstance(on, str) else on
+
+    return (
+        retry_chain.diff(
+            diff_chain, on=on, added=True, same=True, modified=False, deleted=False
+        ).distinct(*on)
+        if retry_chain
+        else None
+    )
 
 
 def _get_source_info(
+    source_ds: DatasetRecord,
     name: str,
     namespace_name: str,
     project_name: str,
@@ -154,25 +164,23 @@ def _get_source_info(
         indirect=False,
     )
 
-
-    if not
+    source_ds_dep = next((d for d in dependencies if d.name == source_ds.name), None)
+    if not source_ds_dep:
         # Starting dataset was removed, back off to normal dataset creation
         return None, None, None, None, None
 
-
-
-
-
-
-
-        project_name=source_ds_project.name,
-    ).latest_version
+    # Refresh starting dataset to have new versions if they are created
+    source_ds = catalog.get_dataset(
+        source_ds.name,
+        namespace_name=source_ds.project.namespace.name,
+        project_name=source_ds.project.name,
+    )
 
     return (
-
-
-
-
+        source_ds.name,
+        source_ds.project,
+        source_ds_dep.version,
+        source_ds.latest_version,
         dependencies,
     )
 
@@ -244,7 +252,14 @@ def delta_retry_update(
         source_ds_version,
         source_ds_latest_version,
         dependencies,
-    ) = _get_source_info(
+    ) = _get_source_info(
+        dc._query.starting_step.dataset,  # type: ignore[union-attr]
+        name,
+        namespace_name,
+        project_name,
+        latest_version,
+        catalog,
+    )
 
     # If source_ds_name is None, starting dataset was removed
     if source_ds_name is None:
@@ -267,8 +282,9 @@ def delta_retry_update(
     if dependencies:
         dependencies = copy(dependencies)
         dependencies = [d for d in dependencies if d is not None]
+        source_ds_dep = next(d for d in dependencies if d.name == source_ds_name)
         # Update to latest version
-
+        source_ds_dep.version = source_ds_latest_version  # type: ignore[union-attr]
 
     # Handle retry functionality if enabled
     if delta_retry:
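
The guard itself is small enough to read in isolation. A runnable sketch of the `delta_disabled` pattern as changed above, with a toy `Chain` standing in for `DataChain`:

```python
from functools import wraps

def delta_disabled(method):
    """Block a method on delta chains unless delta_unsafe was requested."""
    @wraps(method)
    def _inner(self, *args, **kwargs):
        if self.delta and not self._delta_unsafe:
            raise NotImplementedError(
                f"Cannot use {method.__name__} with delta datasets - may cause"
                " inconsistency. Use delta_unsafe flag to allow this operation."
            )
        return method(self, *args, **kwargs)
    return _inner

class Chain:
    def __init__(self, delta=False, delta_unsafe=False):
        self.delta = delta
        self._delta_unsafe = delta_unsafe

    @delta_disabled
    def distinct(self):
        return self

Chain(delta=True, delta_unsafe=True).distinct()  # allowed: caller opted in
try:
    Chain(delta=True).distinct()
except NotImplementedError as exc:
    print(exc)  # blocked: guard fires in plain delta mode
```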

datachain/lib/dc/database.py CHANGED

@@ -73,7 +73,7 @@ def to_database(
     table_name: str,
     connection: "ConnectionType",
     *,
-
+    batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
     on_conflict: Optional[str] = None,
     conflict_columns: Optional[list[str]] = None,
     column_mapping: Optional[dict[str, Optional[str]]] = None,
@@ -124,7 +124,7 @@ def to_database(
         table.create(conn, checkfirst=True)
 
     rows_iter = chain._leaf_values()
-    for batch in batched(rows_iter,
+    for batch in batched(rows_iter, batch_size):
         rows_affected = _process_batch(
             conn,
             table,
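
With `batch_size` now exposed on `to_database`, callers can trade memory for insert throughput instead of relying on the hard-coded batching. A usage sketch (the values and table name are illustrative; `read_values` is assumed from datachain's public API):

```python
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])
chain.to_database(
    "numbers",            # target table, created if missing
    "sqlite:///data.db",  # SQLAlchemy-style connection string
    batch_size=1_000,     # rows per INSERT (default 10,000 per the docstring)
)
```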

datachain/lib/dc/datachain.py CHANGED

@@ -193,6 +193,7 @@ class DataChain:
         self._setup: dict = setup or {}
         self._sys = _sys
         self._delta = False
+        self._delta_unsafe = False
         self._delta_on: Optional[Union[str, Sequence[str]]] = None
         self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
         self._delta_compare: Optional[Union[str, Sequence[str]]] = None
@@ -216,6 +217,7 @@ class DataChain:
         right_on: Optional[Union[str, Sequence[str]]] = None,
         compare: Optional[Union[str, Sequence[str]]] = None,
         delta_retry: Optional[Union[bool, str]] = None,
+        delta_unsafe: bool = False,
     ) -> "Self":
         """Marks this chain as delta, which means special delta process will be
         called on saving dataset for optimization"""
@@ -226,6 +228,7 @@ class DataChain:
         self._delta_result_on = right_on
         self._delta_compare = compare
         self._delta_retry = delta_retry
+        self._delta_unsafe = delta_unsafe
         return self
 
     @property
@@ -238,6 +241,10 @@ class DataChain:
         """Returns True if this chain is ran in "delta" update mode"""
         return self._delta
 
+    @property
+    def delta_unsafe(self) -> bool:
+        return self._delta_unsafe
+
     @property
     def schema(self) -> dict[str, DataType]:
         """Get schema of the chain."""
@@ -328,21 +335,22 @@ class DataChain:
             right_on=self._delta_result_on,
             compare=self._delta_compare,
             delta_retry=self._delta_retry,
+            delta_unsafe=self._delta_unsafe,
         )
 
         return chain
 
     def settings(
         self,
-        cache=None,
-
-
-
-        prefetch: Optional[int] = None,
-        sys: Optional[bool] = None,
+        cache: Optional[bool] = None,
+        prefetch: Optional[Union[bool, int]] = None,
+        parallel: Optional[Union[bool, int]] = None,
+        workers: Optional[int] = None,
         namespace: Optional[str] = None,
         project: Optional[str] = None,
-
+        min_task_size: Optional[int] = None,
+        batch_size: Optional[int] = None,
+        sys: Optional[bool] = None,
     ) -> "Self":
         """Change settings for chain.
 
@@ -351,23 +359,23 @@ class DataChain:
 
         Parameters:
             cache : data caching. (default=False)
+            prefetch : number of workers to use for downloading files in advance.
+                This is enabled by default and uses 2 workers.
+                To disable prefetching, set it to 0 or False.
             parallel : number of thread for processors. True is a special value to
                 enable all available CPUs. (default=1)
             workers : number of distributed workers. Only for Studio mode. (default=1)
-            min_task_size : minimum number of tasks. (default=1)
-            prefetch : number of workers to use for downloading files in advance.
-                This is enabled by default and uses 2 workers.
-                To disable prefetching, set it to 0.
             namespace : namespace name.
             project : project name.
-
+            min_task_size : minimum number of tasks. (default=1)
+            batch_size : row limit per insert to balance speed and memory usage.
                 (default=2000)
 
         Example:
             ```py
             chain = (
                 chain
-                .settings(cache=True, parallel=8,
+                .settings(cache=True, parallel=8, batch_size=300)
                 .map(laion=process_webdataset(spec=WDSLaion), params="file")
             )
             ```
@@ -377,14 +385,14 @@ class DataChain:
         settings = copy.copy(self._settings)
         settings.add(
             Settings(
-                cache,
-
-
-
-
-
-
-
+                cache=cache,
+                prefetch=prefetch,
+                parallel=parallel,
+                workers=workers,
+                namespace=namespace,
+                project=project,
+                min_task_size=min_task_size,
+                batch_size=batch_size,
             )
         )
         return self._evolve(settings=settings, _sys=sys)
@@ -737,7 +745,7 @@ class DataChain:
 
         return self._evolve(
             query=self._query.add_signals(
-                udf_obj.to_udf_wrapper(self._settings.
+                udf_obj.to_udf_wrapper(self._settings.batch_size),
                 **self._settings.to_dict(),
             ),
             signal_schema=self.signals_schema | udf_obj.output,
@@ -775,7 +783,7 @@ class DataChain:
         udf_obj.prefetch = prefetch
         return self._evolve(
             query=self._query.generate(
-                udf_obj.to_udf_wrapper(self._settings.
+                udf_obj.to_udf_wrapper(self._settings.batch_size),
                 **self._settings.to_dict(),
             ),
             signal_schema=udf_obj.output,
@@ -911,7 +919,7 @@ class DataChain:
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
             query=self._query.generate(
-                udf_obj.to_udf_wrapper(self._settings.
+                udf_obj.to_udf_wrapper(self._settings.batch_size),
                 partition_by=processed_partition_by,
                 **self._settings.to_dict(),
             ),
@@ -960,7 +968,7 @@ class DataChain:
 
         return self._evolve(
             query=self._query.add_signals(
-                udf_obj.to_udf_wrapper(self._settings.
+                udf_obj.to_udf_wrapper(self._settings.batch_size, batch=batch),
                 **self._settings.to_dict(),
             ),
             signal_schema=self.signals_schema | udf_obj.output,
@@ -2306,7 +2314,7 @@ class DataChain:
         table_name: str,
         connection: "ConnectionType",
         *,
-
+        batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
         on_conflict: Optional[str] = None,
         conflict_columns: Optional[list[str]] = None,
         column_mapping: Optional[dict[str, Optional[str]]] = None,
@@ -2328,7 +2336,7 @@ class DataChain:
             library. If a DBAPI2 object, only sqlite3 is supported. The user is
             responsible for engine disposal and connection closure for the
             SQLAlchemy connectable; str connections are closed automatically.
-
+            batch_size: Number of rows to insert per batch for optimal performance.
                 Larger batches are faster but use more memory. Default: 10,000.
             on_conflict: Strategy for handling duplicate rows (requires table
                 constraints):
@@ -2409,7 +2417,7 @@ class DataChain:
             self,
             table_name,
             connection,
-
+            batch_size=batch_size,
             on_conflict=on_conflict,
             conflict_columns=conflict_columns,
             column_mapping=column_mapping,
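
The reworked `settings()` now forwards every knob to `Settings` by keyword, and the new `batch_size` flows through `to_udf_wrapper` into each UDF step. A usage sketch (the chain contents and the UDF are illustrative):

```python
import datachain as dc

chain = (
    dc.read_values(num=list(range(10_000)))
    .settings(batch_size=500)  # flush UDF output every 500 rows (default 2000)
    .map(squared=lambda num: num * num, output=int)
)
chain.save("squared-numbers")
```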

datachain/lib/dc/datasets.py CHANGED

@@ -40,6 +40,7 @@ def read_dataset(
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
+    delta_unsafe: bool = False,
     update: bool = False,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
@@ -80,6 +81,8 @@ def read_dataset(
         update: If True always checks for newer versions available on Studio, even if
             some version of the dataset exists locally already. If False (default), it
             will only fetch the dataset from Studio if it is not found locally.
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct.
 
 
     Example:
@@ -205,6 +208,7 @@ def read_dataset(
             right_on=delta_result_on,
             compare=delta_compare,
             delta_retry=delta_retry,
+            delta_unsafe=delta_unsafe,
         )
 
     return chain

datachain/lib/dc/records.py CHANGED

@@ -31,7 +31,7 @@ def read_records(
 
     Parameters:
         to_insert : records (or a single record) to insert. Each record is
-            a dictionary of signals and
+            a dictionary of signals and their values.
         schema : describes chain signals and their corresponding types
 
     Example:
@@ -45,7 +45,6 @@ def read_records(
     """
     from datachain.query.dataset import adjust_outputs, get_col_types
     from datachain.sql.types import SQLType
-    from datachain.utils import batched
 
     from .datasets import read_dataset
 
@@ -96,7 +95,6 @@ def read_records(
         {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
     )
     records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
-
-        warehouse.insert_rows(table, chunk)
+    warehouse.insert_rows(table, records, batch_size=READ_RECORDS_BATCH_SIZE)
     warehouse.insert_rows_done(table)
     return read_dataset(name=dsr.full_name, session=session, settings=settings)

datachain/lib/dc/storage.py CHANGED

@@ -43,6 +43,7 @@ def read_storage(
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
+    delta_unsafe: bool = False,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
@@ -77,6 +78,9 @@ def read_storage(
             (error mode)
             - True: Reprocess records missing from the result dataset (missing mode)
             - None: No retry processing (default)
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct. Caller must ensure datasets are consistent and not partially
+            updated.
 
     Returns:
         DataChain: A DataChain object containing the file information.
@@ -218,6 +222,7 @@ def read_storage(
             right_on=delta_result_on,
             compare=delta_compare,
             delta_retry=delta_retry,
+            delta_unsafe=delta_unsafe,
         )
 
     return storage_chain
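
Putting the new flag together: `delta_unsafe` propagates from `read_storage` into the chain, where it lifts the `delta_disabled` guard shown in `datachain/delta.py`. A usage sketch (the bucket URI and dataset name are illustrative; `delta` and `delta_on` are assumed from the surrounding parameters not shown in this hunk):

```python
import datachain as dc

chain = (
    dc.read_storage(
        "s3://my-bucket/images/",
        delta=True,
        delta_on="file.path",
        delta_unsafe=True,  # permit merge/agg/union/group_by/distinct in delta mode
    )
    .distinct("file.path")  # would raise NotImplementedError without the flag
)
chain.save("images")
```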