datachain 0.20.2__py3-none-any.whl → 0.20.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -26,6 +26,7 @@ from tqdm import tqdm
  from datachain import semver
  from datachain.dataset import DatasetRecord, parse_dataset_name
  from datachain.delta import delta_disabled
+ from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
  from datachain.func import literal
  from datachain.func.base import Function
  from datachain.func.func import Func
@@ -439,10 +440,10 @@ class DataChain:

  from datachain.lib.arrow import schema_to_output

- json_values = list(self.limit(schema_sample_size).collect(col))
+ json_values = self.limit(schema_sample_size).to_list(col)
  json_dicts = [
  json.loads(json_value) if isinstance(json_value, str) else json_value
- for json_value in json_values
+ for (json_value,) in json_values
  ]

  if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -581,7 +582,15 @@ class DataChain:
  or self.session.catalog.metastore.default_project_name
  )

- project = get_project(project_name, namespace_name, session=self.session)
+ try:
+ project = self.session.catalog.metastore.get_project(
+ project_name,
+ namespace_name,
+ create=self.session.catalog.metastore.project_allowed_to_create,
+ )
+ except ProjectNotFoundError as e:
+ # not being able to create it as creation is not allowed
+ raise ProjectCreateNotAllowedError("Creating project is not allowed") from e

  schema = self.signals_schema.clone_without_sys_signals().serialize()

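With this hunk, saving a chain into a project that does not yet exist fails with `ProjectCreateNotAllowedError` whenever the metastore forbids implicit project creation. A minimal, hypothetical sketch of handling the new error from calling code (the chain contents and dataset names are illustrative, not taken from this diff):

```py
import datachain as dc
from datachain.error import ProjectCreateNotAllowedError

chain = dc.read_values(num=[1, 2, 3])

try:
    # "team.reports" is a hypothetical namespace.project prefix
    chain.save("team.reports.numbers")
except ProjectCreateNotAllowedError:
    # implicit project creation is disabled on this metastore;
    # fall back to the default project instead
    chain.save("numbers")
```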
@@ -893,7 +902,7 @@ class DataChain:
  Order is not guaranteed when steps are added after an `order_by` statement.
  I.e. when using `read_dataset` an `order_by` statement should be used if
  the order of the records in the chain is important.
- Using `order_by` directly before `limit`, `collect` and `collect_flatten`
+ Using `order_by` directly before `limit`, `to_list` and similar methods
  will give expected results.
  See https://github.com/iterative/datachain/issues/477 for further details.
  """
@@ -1098,32 +1107,32 @@ class DataChain:

  @property
  def _effective_signals_schema(self) -> "SignalSchema":
- """Effective schema used for user-facing API like collect, to_pandas, etc."""
+ """Effective schema used for user-facing API like to_list, to_pandas, etc."""
  signals_schema = self.signals_schema
  if not self._sys:
  return signals_schema.clone_without_sys_signals()
  return signals_schema

  @overload
- def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
+ def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...

  @overload
- def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
+ def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...

  @overload
- def collect_flatten(
+ def _leaf_values(
  self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
  ) -> Iterator[_T]: ...

  @overload
- def collect_flatten(
+ def _leaf_values(
  self,
  *,
  row_factory: Callable[[list[str], tuple[Any, ...]], _T],
  include_hidden: bool,
  ) -> Iterator[_T]: ...

- def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
+ def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
  """Yields flattened rows of values as a tuple.

  Args:
@@ -1151,7 +1160,7 @@ class DataChain:
  headers, _ = self._effective_signals_schema.get_headers_with_length()
  column_names = [".".join(filter(None, header)) for header in headers]

- results_iter = self.collect_flatten()
+ results_iter = self._leaf_values()

  def column_chunks() -> Iterator[list[list[Any]]]:
  for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1184,9 +1193,9 @@ class DataChain:

  def results(self, *, row_factory=None, include_hidden=True):
  if row_factory is None:
- return list(self.collect_flatten(include_hidden=include_hidden))
+ return list(self._leaf_values(include_hidden=include_hidden))
  return list(
- self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
+ self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
  )

  def to_records(self) -> list[dict[str, Any]]:
@@ -1197,42 +1206,38 @@ class DataChain:

  return self.results(row_factory=to_dict)

- @overload
- def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
- @overload
- def collect(self, col: str) -> Iterator[DataValue]: ...
-
- @overload
- def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
- def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]: # type: ignore[overload-overlap,misc]
+ def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
  """Yields rows of values, optionally limited to the specified columns.

  Args:
  *cols: Limit to the specified columns. By default, all columns are selected.

  Yields:
- (DataType): Yields a single item if a column is selected.
- (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
- selected.
+ (tuple[DataType, ...]): Yields a tuple of items for each row.

  Example:
  Iterating over all rows:
  ```py
- for row in dc.collect():
+ for row in ds.to_iter():
+ print(row)
+ ```
+
+ DataChain is iterable and can be used in a for loop directly which is
+ equivalent to `ds.to_iter()`:
+ ```py
+ for row in ds:
  print(row)
  ```

  Iterating over all rows with selected columns:
  ```py
- for name, size in dc.collect("file.path", "file.size"):
+ for name, size in ds.to_iter("file.path", "file.size"):
  print(name, size)
  ```

  Iterating over a single column:
  ```py
- for file in dc.collect("file.path"):
+ for (file,) in ds.to_iter("file.path"):
  print(file)
  ```
  """
@@ -1244,7 +1249,31 @@ class DataChain:
  ret = signals_schema.row_to_features(
  row, catalog=chain.session.catalog, cache=chain._settings.cache
  )
- yield ret[0] if len(cols) == 1 else tuple(ret)
+ yield tuple(ret)
+
+ @overload
+ def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+ @overload
+ def collect(self, col: str) -> Iterator[DataValue]: ...
+
+ @overload
+ def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+ def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]: # type: ignore[overload-overlap,misc]
+ """
+ Deprecated. Use `to_iter` method instead.
+ """
+ warnings.warn(
+ "Method `collect` is deprecated. Use `to_iter` method instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+ if len(cols) == 1:
+ yield from [item[0] for item in self.to_iter(*cols)]
+ else:
+ yield from self.to_iter(*cols)

  def to_pytorch(
  self,
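Since `collect` now only delegates to `to_iter` and emits a `DeprecationWarning`, callers migrate by switching to `to_iter` (tuples per row) or `to_values` (flat single column). A hedged migration sketch with illustrative data:

```py
import datachain as dc

chain = dc.read_values(path=["a.jpg", "b.jpg"], size=[10, 20])

# before (deprecated):
#     for path in chain.collect("path"): ...

# after: to_iter always yields tuples, even for a single column
for (path,) in chain.to_iter("path"):
    print(path)

# or get a flat list of one column with to_values
print(chain.to_values("size"))  # e.g. [10, 20]
```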
@@ -1479,7 +1508,7 @@ class DataChain:
  )
  return self._evolve(query=self._query.subtract(other._query, signals)) # type: ignore[arg-type]

- def compare(
+ def diff(
  self,
  other: "DataChain",
  on: Union[str, Sequence[str]],
@@ -1492,41 +1521,33 @@ class DataChain:
  same: bool = False,
  status_col: Optional[str] = None,
  ) -> "DataChain":
- """Comparing two chains by identifying rows that are added, deleted, modified
- or same. Result is the new chain that has additional column with possible
- values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
- rows respectively. Note that if only one "status" is asked, by setting proper
- flags, this additional column is not created as it would have only one value
- for all rows. Beside additional diff column, new chain has schema of the chain
- on which method was called.
+ """Calculate differences between two chains.
+
+ This method identifies records that are added, deleted, modified, or unchanged
+ between two chains. It adds a status column with values: A=added, D=deleted,
+ M=modified, S=same.

  Parameters:
- other: Chain to calculate diff from.
- on: Column or list of columns to match on. If both chains have the
- same columns then this column is enough for the match. Otherwise,
- `right_on` parameter has to specify the columns for the other chain.
- This value is used to find corresponding row in other dataset. If not
- found there, row is considered as added (or removed if vice versa), and
- if found then row can be either modified or same.
- right_on: Optional column or list of columns
- for the `other` to match.
- compare: Column or list of columns to compare on. If both chains have
- the same columns then this column is enough for the compare. Otherwise,
- `right_compare` parameter has to specify the columns for the other
- chain. This value is used to see if row is modified or same. If
- not set, all columns will be used for comparison
- right_compare: Optional column or list of columns
- for the `other` to compare to.
- added (bool): Whether to return added rows in resulting chain.
- deleted (bool): Whether to return deleted rows in resulting chain.
- modified (bool): Whether to return modified rows in resulting chain.
- same (bool): Whether to return unchanged rows in resulting chain.
- status_col (str): Name of the new column that is created in resulting chain
- representing diff status.
+ other: Chain to compare against.
+ on: Column(s) to match records between chains.
+ right_on: Column(s) in the other chain to match against. Defaults to `on`.
+ compare: Column(s) to check for changes.
+ If not specified, all columns are used.
+ right_compare: Column(s) in the other chain to compare against.
+ Defaults to values of `compare`.
+ added (bool): Include records that exist in this chain but not in the other.
+ deleted (bool): Include records that exist only in the other chain.
+ modified (bool): Include records that exist in both
+ but have different values.
+ same (bool): Include records that are identical in both chains.
+ status_col (str): Name for the status column showing differences.
+
+ Default behavior: By default, shows added, deleted, and modified records,
+ but excludes unchanged records (same=False). Status column is not created.

  Example:
  ```py
- res = persons.compare(
+ res = persons.diff(
  new_persons,
  on=["id"],
  right_on=["other_id"],
@@ -1555,7 +1576,7 @@ class DataChain:
  status_col=status_col,
  )

- def diff(
+ def file_diff(
  self,
  other: "DataChain",
  on: str = "file",
@@ -1566,31 +1587,29 @@ class DataChain:
  same: bool = False,
  status_col: Optional[str] = None,
  ) -> "DataChain":
- """Similar to `.compare()`, which is more generic method to calculate difference
- between two chains. Unlike `.compare()`, this method works only on those chains
- that have `File` object, or it's derivatives, in it. File `source` and `path`
- are used for matching, and file `version` and `etag` for comparing, while in
- `.compare()` user needs to provide arbitrary columns for matching and comparing.
+ """Calculate differences between two chains containing files.
+
+ This method is specifically designed for file chains. It uses file `source`
+ and `path` to match files, and file `version` and `etag` to detect changes.

  Parameters:
- other: Chain to calculate diff from.
- on: File signal to match on. If both chains have the
- same file signal then this column is enough for the match. Otherwise,
- `right_on` parameter has to specify the file signal for the other chain.
- This value is used to find corresponding row in other dataset. If not
- found there, row is considered as added (or removed if vice versa), and
- if found then row can be either modified or same.
- right_on: Optional file signal for the `other` to match.
- added (bool): Whether to return added rows in resulting chain.
- deleted (bool): Whether to return deleted rows in resulting chain.
- modified (bool): Whether to return modified rows in resulting chain.
- same (bool): Whether to return unchanged rows in resulting chain.
- status_col (str): Optional name of the new column that is created in
- resulting chain representing diff status.
+ other: Chain to compare against.
+ on: File column name in this chain. Default is "file".
+ right_on: File column name in the other chain. Defaults to `on`.
+ added (bool): Include files that exist in this chain but not in the other.
+ deleted (bool): Include files that exist only in the other chain.
+ modified (bool): Include files that exist in both but have different
+ versions/etags.
+ same (bool): Include files that are identical in both chains.
+ status_col (str): Name for the status column showing differences
+ (A=added, D=deleted, M=modified, S=same).
+
+ Default behavior: By default, includes only new files (added=True and
+ modified=True). This is useful for incremental processing.

  Example:
  ```py
- diff = images.diff(
+ diff = images.file_diff(
  new_images,
  on="file",
  right_on="other_file",
@@ -1615,7 +1634,7 @@ class DataChain:
  compare_cols = get_file_signals(on, compare_file_signals)
  right_compare_cols = get_file_signals(right_on, compare_file_signals)

- return self.compare(
+ return self.diff(
  other,
  on_cols,
  right_on=right_on_cols,
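Both renames keep the old semantics: the generic `compare` becomes `diff`, and the file-oriented `diff` becomes `file_diff` (which, as the hunk above shows, now delegates to `diff` internally). A sketch of the two calls under the new names; the data is illustrative, and `images`/`new_images` are assumed to be chains carrying a `File` column:

```py
import datachain as dc

persons = dc.read_values(id=[1, 2, 3], name=["ann", "bob", "cat"])
new_persons = dc.read_values(id=[1, 2, 4], name=["ann", "bobby", "dan"])

# generic row diff (was `compare`): match on "id", report A/D/M in "status"
changes = persons.diff(new_persons, on="id", status_col="status")

# file-oriented diff (was `diff`): match on file.source/path,
# compare version/etag; by default keeps only added and modified files
# new_files = images.file_diff(new_images, on="file")
```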
@@ -2027,7 +2046,7 @@ class DataChain:
  headers, _ = self._effective_signals_schema.get_headers_with_length()
  column_names = [".".join(filter(None, header)) for header in headers]

- results_iter = self.collect_flatten()
+ results_iter = self._leaf_values()

  with opener(path, "w", newline="") as f:
  writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2079,7 +2098,7 @@ class DataChain:
  if include_outer_list:
  # This makes the file JSON instead of JSON lines.
  f.write(b"[\n")
- for row in self.collect_flatten():
+ for row in self._leaf_values():
  if not is_first:
  if include_outer_list:
  # This makes the file JSON instead of JSON lines.
@@ -2244,7 +2263,7 @@ class DataChain:
  max_threads=num_threads or 1,
  client_config=client_config,
  )
- file_exporter.run(self.collect(signal), progress_bar)
+ file_exporter.run(self.to_values(signal), progress_bar)

  def shuffle(self) -> "Self":
  """Shuffle the rows of the chain deterministically."""
@@ -2378,3 +2397,72 @@ class DataChain:
  Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
  """
  return self._evolve(query=self._query.chunk(index, total))
+
+ def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
+ """Returns a list of rows of values, optionally limited to the specified
+ columns.
+
+ Args:
+ *cols: Limit to the specified columns. By default, all columns are selected.
+
+ Returns:
+ list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
+
+ Example:
+ Getting all rows as a list:
+ ```py
+ rows = dc.to_list()
+ print(rows)
+ ```
+
+ Getting all rows with selected columns as a list:
+ ```py
+ name_size_pairs = dc.to_list("file.path", "file.size")
+ print(name_size_pairs)
+ ```
+
+ Getting a single column as a list:
+ ```py
+ files = dc.to_list("file.path")
+ print(files) # Returns list of 1-tuples
+ ```
+ """
+ return list(self.to_iter(*cols))
+
+ def to_values(self, col: str) -> list[DataValue]:
+ """Returns a flat list of values from a single column.
+
+ Args:
+ col: The name of the column to extract values from.
+
+ Returns:
+ list[DataValue]: Returns a flat list of values from the specified column.
+
+ Example:
+ Getting all values from a single column:
+ ```py
+ file_paths = dc.to_values("file.path")
+ print(file_paths) # Returns list of strings
+ ```
+
+ Getting all file sizes:
+ ```py
+ sizes = dc.to_values("file.size")
+ print(sizes) # Returns list of integers
+ ```
+ """
+ return [row[0] for row in self.to_list(col)]
+
+ def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
+ """Make DataChain objects iterable.
+
+ Yields:
+ (tuple[DataValue, ...]): Yields tuples of all column values for each row.
+
+ Example:
+ ```py
+ for row in chain:
+ print(row)
+ ```
+ """
+ return self.to_iter()
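The three new accessors added in this hunk differ only in the shape of what they return; a quick, illustrative comparison (column order in the tuples follows the chain schema, and the data is made up):

```py
import datachain as dc

chain = dc.read_values(path=["a.jpg", "b.jpg"], size=[10, 20])

chain.to_list()           # [("a.jpg", 10), ("b.jpg", 20)] -- list of row tuples
chain.to_list("path")     # [("a.jpg",), ("b.jpg",)] -- still 1-tuples
chain.to_values("size")   # [10, 20] -- flat values of one column

for row in chain:         # __iter__ is equivalent to chain.to_iter()
    print(row)
```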
@@ -33,7 +33,11 @@ def read_dataset(
  settings: Optional[dict] = None,
  fallback_to_studio: bool = True,
  delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = None,
+ delta_on: Optional[Union[str, Sequence[str]]] = (
+ "file.path",
+ "file.etag",
+ "file.version",
+ ),
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
  delta_retry: Optional[Union[bool, str]] = None,
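The `read_dataset` signature now ships a file-oriented default for `delta_on`, and the docstring hunk below distinguishes the two `delta_retry` modes (error-field string vs missing-records `True`). A brief, hypothetical sketch of the error-field mode; the dataset and field names are placeholders:

```py
import datachain as dc

chain = dc.read_dataset(
    "reports",            # placeholder source dataset name
    delta=True,
    delta_retry="error",  # reprocess rows whose "error" field is set in the result
)
```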
@@ -53,41 +57,25 @@
  settings : Settings to use for the chain.
  fallback_to_studio : Try to pull dataset from Studio if not found locally.
  Default is True.
- delta: If set to True, we optimize the creation of new dataset versions by
- calculating the diff between the latest version of this storage and the
- version used to create the most recent version of the resulting chain
- dataset (the one specified in `.save()`). We then run the "diff" chain
- using only the diff data, rather than the entire storage data, and merge
- that diff chain with the latest version of the resulting dataset to create
- a new version. This approach avoids applying modifications to all records
- from storage every time, which can be an expensive operation.
- The diff is calculated using the `DataChain.compare()` method, which
- compares the `delta_on` fields to find matches and checks the compare
- fields to determine if a record has changed. Note that this process only
- considers added and modified records in storage; deleted records are not
- removed from the new dataset version.
- This calculation is based on the difference between the current version
- of the source and the version used to create the dataset.
- delta_on: A list of fields that uniquely identify rows in the source.
- If two rows have the same values, they are considered the same (e.g., they
- could be different versions of the same row in a versioned source).
- This is used in the delta update to calculate the diff.
- delta_result_on: A list of fields in the resulting dataset that correspond
- to the `delta_on` fields from the source.
- This is needed to identify rows that have changed in the source but are
- already present in the current version of the resulting dataset, in order
- to avoid including outdated versions of those rows in the new dataset.
- We retain only the latest versions of rows to prevent duplication.
- There is no need to define this if the `delta_on` fields are present in
- the final dataset and have not been renamed.
- delta_compare: A list of fields used to check if the same row has been modified
- in the new version of the source.
- If not defined, all fields except those defined in delta_on will be used.
- delta_retry: Specifies retry behavior for delta processing. If a string,
- it's the name of a field in the result dataset that indicates an error
- when not None - records with errors will be reprocessed. If True,
- records that exist in the source dataset but not in the result dataset
- will be reprocessed.
+ delta: If True, only process new or changed files instead of reprocessing
+ everything. This saves time by skipping files that were already processed in
+ previous versions. The optimization is working when a new version of the
+ dataset is created.
+ Default is False.
+ delta_on: Field(s) that uniquely identify each record in the source data.
+ Used to detect which records are new or changed.
+ Default is ("file.path", "file.etag", "file.version").
+ delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+ Only needed if you rename the identifying fields during processing.
+ Default is None.
+ delta_compare: Field(s) used to detect if a record has changed.
+ If not specified, all fields except `delta_on` fields are used.
+ Default is None.
+ delta_retry: Controls retry behavior for failed records:
+ - String (field name): Reprocess records where this field is not empty
+ (error mode)
+ - True: Reprocess records missing from the result dataset (missing mode)
+ - None: No retry processing (default)

  Example:
  ```py
@@ -228,7 +216,7 @@ def datasets(
  import datachain as dc

  chain = dc.datasets(column="dataset")
- for ds in chain.collect("dataset"):
+ for ds in chain.to_iter("dataset"):
  print(f"{ds.name}@v{ds.version}")
  ```
  """
@@ -35,7 +35,11 @@ def read_storage(
  update: bool = False,
  anon: bool = False,
  delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = None,
+ delta_on: Optional[Union[str, Sequence[str]]] = (
+ "file.path",
+ "file.etag",
+ "file.version",
+ ),
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
  delta_retry: Optional[Union[bool, str]] = None,
@@ -54,43 +58,25 @@
  update : force storage reindexing. Default is False.
  anon : If True, we will treat cloud bucket as public one
  client_config : Optional client configuration for the storage client.
- delta: If set to True, we optimize the creation of new dataset versions by
- calculating the diff between the latest version of this storage and the
- version used to create the most recent version of the resulting chain
- dataset (the one specified in `.save()`). We then run the "diff" chain
- using only the diff data, rather than the entire storage data, and merge
- that diff chain with the latest version of the resulting dataset to create
- a new version. This approach avoids applying modifications to all records
- from storage every time, which can be an expensive operation.
- The diff is calculated using the `DataChain.compare()` method, which
- compares the `delta_on` fields to find matches and checks the compare
- fields to determine if a record has changed. Note that this process only
- considers added and modified records in storage; deleted records are not
- removed from the new dataset version.
- This calculation is based on the difference between the current version
- of the source and the version used to create the dataset.
- delta_on: A list of fields that uniquely identify rows in the source.
- If two rows have the same values, they are considered the same (e.g., they
- could be different versions of the same row in a versioned source).
- This is used in the delta update to calculate the diff.
- delta_result_on: A list of fields in the resulting dataset that correspond
- to the `delta_on` fields from the source.
- This is needed to identify rows that have changed in the source but are
- already present in the current version of the resulting dataset, in order
- to avoid including outdated versions of those rows in the new dataset.
- We retain only the latest versions of rows to prevent duplication.
- There is no need to define this if the `delta_on` fields are present in
- the final dataset and have not been renamed.
- delta_compare: A list of fields used to check if the same row has been modified
- in the new version of the source.
- If not defined, all fields except those defined in `delta_on` will be used.
- delta_retry: Controls which records to reprocess. Can be:
- - A string specifying a field name: Records where this field is not None
- will be reprocessed (error checking mode).
- - True: Records that exist in the source dataset but not in the result
- dataset (based on delta_on/delta_result_on fields) will be reprocessed
- (missing records mode).
- - False or None: No retry processing.
+ delta: If True, only process new or changed files instead of reprocessing
+ everything. This saves time by skipping files that were already processed in
+ previous versions. The optimization is working when a new version of the
+ dataset is created.
+ Default is False.
+ delta_on: Field(s) that uniquely identify each record in the source data.
+ Used to detect which records are new or changed.
+ Default is ("file.path", "file.etag", "file.version").
+ delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+ Only needed if you rename the identifying fields during processing.
+ Default is None.
+ delta_compare: Field(s) used to detect if a record has changed.
+ If not specified, all fields except `delta_on` fields are used.
+ Default is None.
+ delta_retry: Controls retry behavior for failed records:
+ - String (field name): Reprocess records where this field is not empty
+ (error mode)
+ - True: Reprocess records missing from the result dataset (missing mode)
+ - None: No retry processing (default)

  Returns:
  DataChain: A DataChain object containing the file information.
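With `delta_on` now defaulting to `("file.path", "file.etag", "file.version")` in both `read_storage` and `read_dataset`, incremental processing of a bucket needs only `delta=True`. A hedged end-to-end sketch; the bucket URI, signal name, and dataset name are placeholders:

```py
import datachain as dc

def size_kb(file: dc.File) -> float:
    # derived signal; with delta enabled, only new or changed files
    # are reprocessed when a new dataset version is saved
    return file.size / 1024

(
    dc.read_storage("s3://my-bucket/images/", delta=True)  # placeholder URI
    .map(size_kb=size_kb)
    .save("images_sizes")                                   # placeholder dataset name
)
```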