datachain 0.30.5__py3-none-any.whl → 0.30.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -1,30 +1,41 @@
 import sys
-from typing import TYPE_CHECKING, Optional
+from collections.abc import Iterable, Iterator
+from typing import TYPE_CHECKING, Optional, Union

 from tabulate import tabulate

-if TYPE_CHECKING:
-    from datachain.catalog import Catalog
-
+from datachain import semver
 from datachain.catalog import is_namespace_local
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
 from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio

+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
+
+def group_dataset_versions(
+    datasets: Iterable[tuple[str, str]], latest_only=True
+) -> dict[str, Union[str, list[str]]]:
+    grouped: dict[str, list[tuple[int, int, int]]] = {}

-def group_dataset_versions(datasets, latest_only=True):
-    grouped = {}
     # Sort to ensure groupby works as expected
     # (groupby expects consecutive items with the same key)
     for name, version in sorted(datasets):
-        grouped.setdefault(name, []).append(version)
+        grouped.setdefault(name, []).append(semver.parse(version))

     if latest_only:
         # For each dataset name, pick the highest version.
-        return {name: max(versions) for name, versions in grouped.items()}
+        return {
+            name: semver.create(*(max(versions))) for name, versions in grouped.items()
+        }
+
     # For each dataset name, return a sorted list of unique versions.
-    return {name: sorted(set(versions)) for name, versions in grouped.items()}
+    return {
+        name: [semver.create(*v) for v in sorted(set(versions))]
+        for name, versions in grouped.items()
+    }


 def list_datasets(
@@ -35,7 +46,7 @@ def list_datasets(
     team: Optional[str] = None,
     latest_only: bool = True,
     name: Optional[str] = None,
-):
+) -> None:
     token = Config().read().get("studio", {}).get("token")
     all, local, studio = determine_flavors(studio, local, all, token)
     if name:
@@ -95,27 +106,31 @@ def list_datasets(
     print(tabulate(rows, headers="keys"))


-def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
+def list_datasets_local(
+    catalog: "Catalog", name: Optional[str] = None
+) -> Iterator[tuple[str, str]]:
     if name:
         yield from list_datasets_local_versions(catalog, name)
         return

     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield (d.full_name, v.version)
+            yield d.full_name, v.version


-def list_datasets_local_versions(catalog: "Catalog", name: str):
+def list_datasets_local_versions(
+    catalog: "Catalog", name: str
+) -> Iterator[tuple[str, str]]:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     ds = catalog.get_dataset(
         name, namespace_name=namespace_name, project_name=project_name
     )
     for v in ds.versions:
-        yield (name, v.version)
+        yield name, v.version


-def _datasets_tabulate_row(name, both, local_version, studio_version):
+def _datasets_tabulate_row(name, both, local_version, studio_version) -> dict[str, str]:
     row = {
         "Name": name,
     }
@@ -136,7 +151,7 @@ def rm_dataset(
     force: Optional[bool] = False,
     studio: Optional[bool] = False,
     team: Optional[str] = None,
-):
+) -> None:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     if studio:
@@ -166,7 +181,7 @@ def edit_dataset(
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
     team: Optional[str] = None,
-):
+) -> None:
     from datachain.lib.dc.utils import is_studio

     namespace_name, project_name, name = catalog.get_full_dataset_name(name)
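
The `group_dataset_versions` rewrite above switches from comparing raw version strings to comparing parsed semver tuples. A minimal sketch of the new behavior, assuming (as the diff implies) that `semver.parse` returns a comparable `(major, minor, patch)` tuple and `semver.create` rebuilds the `"X.Y.Z"` string; the dataset names and versions are made up:

```py
from datachain import semver

# Hypothetical input: (dataset name, version string) pairs
datasets = [("cats", "1.2.0"), ("cats", "1.10.0"), ("dogs", "2.0.1")]

grouped: dict[str, list[tuple[int, int, int]]] = {}
for name, version in sorted(datasets):
    grouped.setdefault(name, []).append(semver.parse(version))

# With parsed tuples, max() picks "1.10.0" for "cats"; the old string-based
# max() would have picked "1.2.0", because "2" > "1" lexicographically.
latest = {name: semver.create(*max(versions)) for name, versions in grouped.items()}
print(latest)  # expected: {'cats': '1.10.0', 'dogs': '2.0.1'}
```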
@@ -37,6 +37,7 @@ from datachain import semver
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
+from datachain.data_storage.warehouse import INSERT_BATCH_SIZE
 from datachain.dataset import DatasetRecord, StorageURI
 from datachain.error import DataChainError, OutdatedDatabaseSchemaError
 from datachain.namespace import Namespace
@@ -44,7 +45,7 @@ from datachain.project import Project
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
-from datachain.utils import DataChainDir, batched_it
+from datachain.utils import DataChainDir, batched, batched_it

 if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
@@ -712,19 +713,21 @@ class SQLiteWarehouse(AbstractWarehouse):
     def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
         return (e.model_dump() for e in entries)

-    def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
-        rows = list(rows)
-        if not rows:
-            return
-
-        with self.db.transaction() as conn:
-            # transactions speeds up inserts significantly as there is no separate
-            # transaction created for each insert row
-            self.db.executemany(
-                table.insert().values({f: bindparam(f) for f in rows[0]}),
-                rows,
-                conn=conn,
-            )
+    def insert_rows(
+        self,
+        table: Table,
+        rows: Iterable[dict[str, Any]],
+        batch_size: int = INSERT_BATCH_SIZE,
+    ) -> None:
+        for row_chunk in batched(rows, batch_size):
+            with self.db.transaction() as conn:
+                # transactions speeds up inserts significantly as there is no separate
+                # transaction created for each insert row
+                self.db.executemany(
+                    table.insert().values({f: bindparam(f) for f in row_chunk[0]}),
+                    row_chunk,
+                    conn=conn,
+                )

     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
         dr = self.dataset_rows(dataset, version)
@@ -797,7 +800,7 @@ class SQLiteWarehouse(AbstractWarehouse):
             .limit(None)
         )

-        for batch in batched_it(ids, 10_000):
+        for batch in batched_it(ids, INSERT_BATCH_SIZE):
             batch_ids = [row[0] for row in batch]
             select_q._where_criteria = (col_id.in_(batch_ids),)
             q = table.insert().from_select(list(select_q.selected_columns), select_q)
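
The new `insert_rows` above no longer materializes the whole `rows` iterable with `list(rows)`; it consumes the rows in fixed-size chunks, each inserted via one `executemany()` call inside its own transaction. A rough sketch of the chunking pattern, with `chunked` as a stand-in for `datachain.utils.batched` (whose exact implementation is not shown in this diff):

```py
from collections.abc import Iterable, Iterator
from itertools import islice
from typing import Any


def chunked(
    rows: Iterable[dict[str, Any]], batch_size: int
) -> Iterator[list[dict[str, Any]]]:
    # Yield lists of at most batch_size rows without loading everything into memory.
    it = iter(rows)
    while chunk := list(islice(it, batch_size)):
        yield chunk


# Each chunk maps to one executemany() call in one transaction, so peak memory is
# bounded by batch_size (INSERT_BATCH_SIZE = 10_000 by default) rather than by the
# total number of rows.
```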
@@ -43,6 +43,7 @@ if TYPE_CHECKING:
 logger = logging.getLogger("datachain")

 SELECT_BATCH_SIZE = 100_000  # number of rows to fetch at a time
+INSERT_BATCH_SIZE = 10_000  # number of rows to insert at a time


 class AbstractWarehouse(ABC, Serializable):
@@ -415,7 +416,12 @@ class AbstractWarehouse(ABC, Serializable):
         """Convert File entries so they can be passed on to `insert_rows()`"""

     @abstractmethod
-    def insert_rows(self, table: sa.Table, rows: Iterable[dict[str, Any]]) -> None:
+    def insert_rows(
+        self,
+        table: sa.Table,
+        rows: Iterable[dict[str, Any]],
+        batch_size: int = INSERT_BATCH_SIZE,
+    ) -> None:
         """Does batch inserts of any kind of rows into table"""

     def insert_rows_done(self, table: sa.Table) -> None:
datachain/delta.py CHANGED
@@ -4,7 +4,7 @@ from functools import wraps
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union

 import datachain
-from datachain.dataset import DatasetDependency
+from datachain.dataset import DatasetDependency, DatasetRecord
 from datachain.error import DatasetNotFoundError
 from datachain.project import Project

@@ -30,9 +30,10 @@ def delta_disabled(

     @wraps(method)
     def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
-        if self.delta:
+        if self.delta and not self._delta_unsafe:
             raise NotImplementedError(
-                f"Delta update cannot be used with {method.__name__}"
+                f"Cannot use {method.__name__} with delta datasets - may cause"
+                " inconsistency. Use delta_unsafe flag to allow this operation."
             )
         return method(self, *args, **kwargs)

@@ -124,10 +125,19 @@ def _get_retry_chain(
     # Subtract also diff chain since some items might be picked
     # up by `delta=True` itself (e.g. records got modified AND are missing in the
     # result dataset atm)
-    return retry_chain.subtract(diff_chain, on=on) if retry_chain else None
+    on = [on] if isinstance(on, str) else on
+
+    return (
+        retry_chain.diff(
+            diff_chain, on=on, added=True, same=True, modified=False, deleted=False
+        ).distinct(*on)
+        if retry_chain
+        else None
+    )


 def _get_source_info(
+    source_ds: DatasetRecord,
     name: str,
     namespace_name: str,
     project_name: str,
@@ -154,25 +164,23 @@ def _get_source_info(
         indirect=False,
     )

-    dep = dependencies[0]
-    if not dep:
+    source_ds_dep = next((d for d in dependencies if d.name == source_ds.name), None)
+    if not source_ds_dep:
         # Starting dataset was removed, back off to normal dataset creation
         return None, None, None, None, None

-    source_ds_project = catalog.metastore.get_project(dep.project, dep.namespace)
-    source_ds_name = dep.name
-    source_ds_version = dep.version
-    source_ds_latest_version = catalog.get_dataset(
-        source_ds_name,
-        namespace_name=source_ds_project.namespace.name,
-        project_name=source_ds_project.name,
-    ).latest_version
+    # Refresh starting dataset to have new versions if they are created
+    source_ds = catalog.get_dataset(
+        source_ds.name,
+        namespace_name=source_ds.project.namespace.name,
+        project_name=source_ds.project.name,
+    )

     return (
-        source_ds_name,
-        source_ds_project,
-        source_ds_version,
-        source_ds_latest_version,
+        source_ds.name,
+        source_ds.project,
+        source_ds_dep.version,
+        source_ds.latest_version,
         dependencies,
     )

@@ -244,7 +252,14 @@ def delta_retry_update(
         source_ds_version,
         source_ds_latest_version,
         dependencies,
-    ) = _get_source_info(name, namespace_name, project_name, latest_version, catalog)
+    ) = _get_source_info(
+        dc._query.starting_step.dataset,  # type: ignore[union-attr]
+        name,
+        namespace_name,
+        project_name,
+        latest_version,
+        catalog,
+    )

     # If source_ds_name is None, starting dataset was removed
     if source_ds_name is None:
@@ -267,8 +282,9 @@ def delta_retry_update(
     if dependencies:
         dependencies = copy(dependencies)
         dependencies = [d for d in dependencies if d is not None]
+        source_ds_dep = next(d for d in dependencies if d.name == source_ds_name)
         # Update to latest version
-        dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+        source_ds_dep.version = source_ds_latest_version  # type: ignore[union-attr]

     # Handle retry functionality if enabled
     if delta_retry:
@@ -73,7 +73,7 @@ def to_database(
     table_name: str,
     connection: "ConnectionType",
     *,
-    batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
+    batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
     on_conflict: Optional[str] = None,
     conflict_columns: Optional[list[str]] = None,
     column_mapping: Optional[dict[str, Optional[str]]] = None,
@@ -124,7 +124,7 @@ def to_database(
         table.create(conn, checkfirst=True)

     rows_iter = chain._leaf_values()
-    for batch in batched(rows_iter, batch_rows):
+    for batch in batched(rows_iter, batch_size):
         rows_affected = _process_batch(
             conn,
             table,
@@ -193,6 +193,7 @@ class DataChain:
         self._setup: dict = setup or {}
         self._sys = _sys
         self._delta = False
+        self._delta_unsafe = False
         self._delta_on: Optional[Union[str, Sequence[str]]] = None
         self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
         self._delta_compare: Optional[Union[str, Sequence[str]]] = None
@@ -216,6 +217,7 @@ class DataChain:
         right_on: Optional[Union[str, Sequence[str]]] = None,
         compare: Optional[Union[str, Sequence[str]]] = None,
         delta_retry: Optional[Union[bool, str]] = None,
+        delta_unsafe: bool = False,
     ) -> "Self":
         """Marks this chain as delta, which means special delta process will be
         called on saving dataset for optimization"""
@@ -226,6 +228,7 @@ class DataChain:
         self._delta_result_on = right_on
         self._delta_compare = compare
         self._delta_retry = delta_retry
+        self._delta_unsafe = delta_unsafe
         return self

     @property
@@ -238,6 +241,10 @@ class DataChain:
         """Returns True if this chain is ran in "delta" update mode"""
         return self._delta

+    @property
+    def delta_unsafe(self) -> bool:
+        return self._delta_unsafe
+
     @property
     def schema(self) -> dict[str, DataType]:
         """Get schema of the chain."""
@@ -328,21 +335,22 @@ class DataChain:
                 right_on=self._delta_result_on,
                 compare=self._delta_compare,
                 delta_retry=self._delta_retry,
+                delta_unsafe=self._delta_unsafe,
             )

         return chain

     def settings(
         self,
-        cache=None,
-        parallel=None,
-        workers=None,
-        min_task_size=None,
-        prefetch: Optional[int] = None,
-        sys: Optional[bool] = None,
+        cache: Optional[bool] = None,
+        prefetch: Optional[Union[bool, int]] = None,
+        parallel: Optional[Union[bool, int]] = None,
+        workers: Optional[int] = None,
         namespace: Optional[str] = None,
         project: Optional[str] = None,
-        batch_rows: Optional[int] = None,
+        min_task_size: Optional[int] = None,
+        batch_size: Optional[int] = None,
+        sys: Optional[bool] = None,
     ) -> "Self":
         """Change settings for chain.

@@ -351,23 +359,23 @@ class DataChain:

         Parameters:
             cache : data caching. (default=False)
+            prefetch : number of workers to use for downloading files in advance.
+                This is enabled by default and uses 2 workers.
+                To disable prefetching, set it to 0 or False.
             parallel : number of thread for processors. True is a special value to
                 enable all available CPUs. (default=1)
             workers : number of distributed workers. Only for Studio mode. (default=1)
-            min_task_size : minimum number of tasks. (default=1)
-            prefetch : number of workers to use for downloading files in advance.
-                This is enabled by default and uses 2 workers.
-                To disable prefetching, set it to 0.
             namespace : namespace name.
             project : project name.
-            batch_rows : row limit per insert to balance speed and memory usage.
+            min_task_size : minimum number of tasks. (default=1)
+            batch_size : row limit per insert to balance speed and memory usage.
                 (default=2000)

         Example:
             ```py
             chain = (
                 chain
-                .settings(cache=True, parallel=8, batch_rows=300)
+                .settings(cache=True, parallel=8, batch_size=300)
                 .map(laion=process_webdataset(spec=WDSLaion), params="file")
             )
             ```
@@ -377,14 +385,14 @@ class DataChain:
         settings = copy.copy(self._settings)
         settings.add(
             Settings(
-                cache,
-                parallel,
-                workers,
-                min_task_size,
-                prefetch,
-                namespace,
-                project,
-                batch_rows,
+                cache=cache,
+                prefetch=prefetch,
+                parallel=parallel,
+                workers=workers,
+                namespace=namespace,
+                project=project,
+                min_task_size=min_task_size,
+                batch_size=batch_size,
             )
         )
         return self._evolve(settings=settings, _sys=sys)
@@ -737,7 +745,7 @@ class DataChain:

         return self._evolve(
             query=self._query.add_signals(
-                udf_obj.to_udf_wrapper(self._settings.batch_rows),
+                udf_obj.to_udf_wrapper(self._settings.batch_size),
                 **self._settings.to_dict(),
             ),
             signal_schema=self.signals_schema | udf_obj.output,
@@ -775,7 +783,7 @@ class DataChain:
         udf_obj.prefetch = prefetch
         return self._evolve(
             query=self._query.generate(
-                udf_obj.to_udf_wrapper(self._settings.batch_rows),
+                udf_obj.to_udf_wrapper(self._settings.batch_size),
                 **self._settings.to_dict(),
             ),
             signal_schema=udf_obj.output,
@@ -911,7 +919,7 @@ class DataChain:
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
             query=self._query.generate(
-                udf_obj.to_udf_wrapper(self._settings.batch_rows),
+                udf_obj.to_udf_wrapper(self._settings.batch_size),
                 partition_by=processed_partition_by,
                 **self._settings.to_dict(),
             ),
@@ -960,7 +968,7 @@ class DataChain:

         return self._evolve(
             query=self._query.add_signals(
-                udf_obj.to_udf_wrapper(self._settings.batch_rows, batch=batch),
+                udf_obj.to_udf_wrapper(self._settings.batch_size, batch=batch),
                 **self._settings.to_dict(),
             ),
             signal_schema=self.signals_schema | udf_obj.output,
@@ -2306,7 +2314,7 @@ class DataChain:
         table_name: str,
         connection: "ConnectionType",
         *,
-        batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
+        batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
         on_conflict: Optional[str] = None,
         conflict_columns: Optional[list[str]] = None,
         column_mapping: Optional[dict[str, Optional[str]]] = None,
@@ -2328,7 +2336,7 @@ class DataChain:
                 library. If a DBAPI2 object, only sqlite3 is supported. The user is
                 responsible for engine disposal and connection closure for the
                 SQLAlchemy connectable; str connections are closed automatically.
-            batch_rows: Number of rows to insert per batch for optimal performance.
+            batch_size: Number of rows to insert per batch for optimal performance.
                 Larger batches are faster but use more memory. Default: 10,000.
             on_conflict: Strategy for handling duplicate rows (requires table
                 constraints):
@@ -2409,7 +2417,7 @@ class DataChain:
             self,
             table_name,
             connection,
-            batch_rows=batch_rows,
+            batch_size=batch_size,
             on_conflict=on_conflict,
             conflict_columns=conflict_columns,
             column_mapping=column_mapping,
@@ -40,6 +40,7 @@ def read_dataset(
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
+    delta_unsafe: bool = False,
     update: bool = False,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
@@ -80,6 +81,8 @@ def read_dataset(
         update: If True always checks for newer versions available on Studio, even if
             some version of the dataset exists locally already. If False (default), it
             will only fetch the dataset from Studio if it is not found locally.
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct.


     Example:
@@ -205,6 +208,7 @@ def read_dataset(
             right_on=delta_result_on,
             compare=delta_compare,
             delta_retry=delta_retry,
+            delta_unsafe=delta_unsafe,
         )

     return chain
@@ -31,7 +31,7 @@ def read_records(

     Parameters:
         to_insert : records (or a single record) to insert. Each record is
-            a dictionary of signals and theirs values.
+            a dictionary of signals and their values.
         schema : describes chain signals and their corresponding types

     Example:
@@ -45,7 +45,6 @@ def read_records(
     """
     from datachain.query.dataset import adjust_outputs, get_col_types
     from datachain.sql.types import SQLType
-    from datachain.utils import batched

     from .datasets import read_dataset

@@ -96,7 +95,6 @@ def read_records(
         {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
     )
     records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
-    for chunk in batched(records, READ_RECORDS_BATCH_SIZE):
-        warehouse.insert_rows(table, chunk)
+    warehouse.insert_rows(table, records, batch_size=READ_RECORDS_BATCH_SIZE)
     warehouse.insert_rows_done(table)
     return read_dataset(name=dsr.full_name, session=session, settings=settings)
@@ -43,6 +43,7 @@ def read_storage(
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
+    delta_unsafe: bool = False,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
@@ -77,6 +78,9 @@ def read_storage(
             (error mode)
             - True: Reprocess records missing from the result dataset (missing mode)
             - None: No retry processing (default)
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct. Caller must ensure datasets are consistent and not partially
+            updated.

     Returns:
         DataChain: A DataChain object containing the file information.
@@ -218,6 +222,7 @@ def read_storage(
             right_on=delta_result_on,
             compare=delta_compare,
             delta_retry=delta_retry,
+            delta_unsafe=delta_unsafe,
         )

     return storage_chain
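
The `delta_unsafe` flag threaded through `read_storage`, `read_dataset`, and `_as_delta` above relaxes the `delta_disabled` guard. A hedged usage sketch: the bucket URI is hypothetical, and the `delta`/`delta_on` parameters are assumed to exist as in the current `read_storage` signature; only `delta_unsafe` is new in this release.

```py
import datachain as dc

chain = dc.read_storage(
    "s3://example-bucket/images/",  # hypothetical source
    delta=True,
    delta_on="file.path",
    delta_unsafe=True,  # caller accepts the consistency risk
)

# distinct() is one of the operations the delta_disabled guard blocks; without
# delta_unsafe=True this would raise NotImplementedError on a delta chain.
deduped = chain.distinct("file.path")
```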