datachain 0.20.2__py3-none-any.whl → 0.20.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -26,6 +26,7 @@ from tqdm import tqdm
  from datachain import semver
  from datachain.dataset import DatasetRecord, parse_dataset_name
  from datachain.delta import delta_disabled
+ from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
  from datachain.func import literal
  from datachain.func.base import Function
  from datachain.func.func import Func
@@ -37,7 +38,6 @@ from datachain.lib.file import (
  FileExporter,
  )
  from datachain.lib.file import ExportPlacement as FileExportPlacement
- from datachain.lib.projects import get as get_project
  from datachain.lib.settings import Settings
  from datachain.lib.signal_schema import SignalSchema
  from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
@@ -439,10 +439,10 @@ class DataChain:

  from datachain.lib.arrow import schema_to_output

- json_values = list(self.limit(schema_sample_size).collect(col))
+ json_values = self.limit(schema_sample_size).to_list(col)
  json_dicts = [
  json.loads(json_value) if isinstance(json_value, str) else json_value
- for json_value in json_values
+ for (json_value,) in json_values
  ]

  if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -524,8 +524,10 @@ class DataChain:
  It returns the chain itself.
  """
  schema = self.signals_schema.clone_without_sys_signals().serialize()
- project = get_project(
- self.project_name, self.namespace_name, session=self.session
+ project = self.session.catalog.metastore.get_project(
+ self.project_name,
+ self.namespace_name,
+ create=True,
  )
  return self._evolve(
  query=self._query.save(project=project, feature_schema=schema)
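With this change, `save()` resolves the target project directly through the metastore and may create it on the fly (`create=True`) instead of going through `datachain.lib.projects.get`. A minimal sketch of how this looks from user code, assuming the usual `read_values`/`save` entry points and the default namespace/project settings (names below are illustrative):

```py
import datachain as dc

# Build a small in-memory chain; column names and values are illustrative.
chain = dc.read_values(num=[1, 2, 3])

# On save, the project for the chain is looked up via the metastore and,
# per the hunk above, created automatically if it does not exist yet.
chain.save("my_numbers")
```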
@@ -581,7 +583,15 @@ class DataChain:
  or self.session.catalog.metastore.default_project_name
  )

- project = get_project(project_name, namespace_name, session=self.session)
+ try:
+ project = self.session.catalog.metastore.get_project(
+ project_name,
+ namespace_name,
+ create=self.session.catalog.metastore.project_allowed_to_create,
+ )
+ except ProjectNotFoundError as e:
+ # not being able to create it as creation is not allowed
+ raise ProjectCreateNotAllowedError("Creating project is not allowed") from e

  schema = self.signals_schema.clone_without_sys_signals().serialize()

@@ -893,7 +903,7 @@ class DataChain:
  Order is not guaranteed when steps are added after an `order_by` statement.
  I.e. when using `read_dataset` an `order_by` statement should be used if
  the order of the records in the chain is important.
- Using `order_by` directly before `limit`, `collect` and `collect_flatten`
+ Using `order_by` directly before `limit`, `to_list` and similar methods
  will give expected results.
  See https://github.com/iterative/datachain/issues/477 for further details.
  """
@@ -1098,32 +1108,32 @@ class DataChain:

  @property
  def _effective_signals_schema(self) -> "SignalSchema":
- """Effective schema used for user-facing API like collect, to_pandas, etc."""
+ """Effective schema used for user-facing API like to_list, to_pandas, etc."""
  signals_schema = self.signals_schema
  if not self._sys:
  return signals_schema.clone_without_sys_signals()
  return signals_schema

  @overload
- def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
+ def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...

  @overload
- def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
+ def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...

  @overload
- def collect_flatten(
+ def _leaf_values(
  self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
  ) -> Iterator[_T]: ...

  @overload
- def collect_flatten(
+ def _leaf_values(
  self,
  *,
  row_factory: Callable[[list[str], tuple[Any, ...]], _T],
  include_hidden: bool,
  ) -> Iterator[_T]: ...

- def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
+ def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
  """Yields flattened rows of values as a tuple.

  Args:
@@ -1151,7 +1161,7 @@ class DataChain:
  headers, _ = self._effective_signals_schema.get_headers_with_length()
  column_names = [".".join(filter(None, header)) for header in headers]

- results_iter = self.collect_flatten()
+ results_iter = self._leaf_values()

  def column_chunks() -> Iterator[list[list[Any]]]:
  for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1184,9 +1194,9 @@ class DataChain:

  def results(self, *, row_factory=None, include_hidden=True):
  if row_factory is None:
- return list(self.collect_flatten(include_hidden=include_hidden))
+ return list(self._leaf_values(include_hidden=include_hidden))
  return list(
- self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
+ self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
  )

  def to_records(self) -> list[dict[str, Any]]:
@@ -1197,42 +1207,38 @@ class DataChain:

  return self.results(row_factory=to_dict)

- @overload
- def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
- @overload
- def collect(self, col: str) -> Iterator[DataValue]: ...
-
- @overload
- def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
- def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]: # type: ignore[overload-overlap,misc]
+ def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
  """Yields rows of values, optionally limited to the specified columns.

  Args:
  *cols: Limit to the specified columns. By default, all columns are selected.

  Yields:
- (DataType): Yields a single item if a column is selected.
- (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
- selected.
+ (tuple[DataType, ...]): Yields a tuple of items for each row.

  Example:
  Iterating over all rows:
  ```py
- for row in dc.collect():
+ for row in ds.to_iter():
+ print(row)
+ ```
+
+ DataChain is iterable and can be used in a for loop directly which is
+ equivalent to `ds.to_iter()`:
+ ```py
+ for row in ds:
  print(row)
  ```

  Iterating over all rows with selected columns:
  ```py
- for name, size in dc.collect("file.path", "file.size"):
+ for name, size in ds.to_iter("file.path", "file.size"):
  print(name, size)
  ```

  Iterating over a single column:
  ```py
- for file in dc.collect("file.path"):
+ for (file,) in ds.to_iter("file.path"):
  print(file)
  ```
  """
@@ -1244,7 +1250,31 @@ class DataChain:
  ret = signals_schema.row_to_features(
  row, catalog=chain.session.catalog, cache=chain._settings.cache
  )
- yield ret[0] if len(cols) == 1 else tuple(ret)
+ yield tuple(ret)
+
+ @overload
+ def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+ @overload
+ def collect(self, col: str) -> Iterator[DataValue]: ...
+
+ @overload
+ def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+ def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]: # type: ignore[overload-overlap,misc]
+ """
+ Deprecated. Use `to_iter` method instead.
+ """
+ warnings.warn(
+ "Method `collect` is deprecated. Use `to_iter` method instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+ if len(cols) == 1:
+ yield from [item[0] for item in self.to_iter(*cols)]
+ else:
+ yield from self.to_iter(*cols)

  def to_pytorch(
  self,
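The hunk above keeps `collect()` only as a deprecated wrapper: for a single column it still yields bare values, while the new `to_iter()` always yields tuples. A hedged migration sketch (column names and values are illustrative):

```py
import warnings

import datachain as dc

chain = dc.read_values(path=["a.txt", "b.txt"], size=[10, 20])

# Old style: still works in 0.20.4 but emits a DeprecationWarning.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", DeprecationWarning)
    old_paths = list(chain.collect("path"))        # ["a.txt", "b.txt"]

# New style: to_iter always yields tuples; to_values flattens a single column.
new_paths = [p for (p,) in chain.to_iter("path")]  # ["a.txt", "b.txt"]
flat_paths = chain.to_values("path")               # ["a.txt", "b.txt"]
```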
@@ -1479,7 +1509,7 @@ class DataChain:
  )
  return self._evolve(query=self._query.subtract(other._query, signals)) # type: ignore[arg-type]

- def compare(
+ def diff(
  self,
  other: "DataChain",
  on: Union[str, Sequence[str]],
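`compare()` is renamed to `diff()` in this release with the same signature, so migration is a one-line rename. A small sketch (chains and column names are illustrative):

```py
import datachain as dc

old = dc.read_values(id=[1, 2, 3], value=["a", "b", "c"])
new = dc.read_values(id=[1, 2, 4], value=["a", "B", "d"])

# 0.20.2: new.compare(old, ...); 0.20.4: the same call spelled as diff().
changes = new.diff(old, on="id", status_col="status")
print(changes.to_list("id", "status"))
```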
@@ -1492,41 +1522,33 @@ class DataChain:
  same: bool = False,
  status_col: Optional[str] = None,
  ) -> "DataChain":
- """Comparing two chains by identifying rows that are added, deleted, modified
- or same. Result is the new chain that has additional column with possible
- values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
- rows respectively. Note that if only one "status" is asked, by setting proper
- flags, this additional column is not created as it would have only one value
- for all rows. Beside additional diff column, new chain has schema of the chain
- on which method was called.
+ """Calculate differences between two chains.
+
+ This method identifies records that are added, deleted, modified, or unchanged
+ between two chains. It adds a status column with values: A=added, D=deleted,
+ M=modified, S=same.

  Parameters:
- other: Chain to calculate diff from.
- on: Column or list of columns to match on. If both chains have the
- same columns then this column is enough for the match. Otherwise,
- `right_on` parameter has to specify the columns for the other chain.
- This value is used to find corresponding row in other dataset. If not
- found there, row is considered as added (or removed if vice versa), and
- if found then row can be either modified or same.
- right_on: Optional column or list of columns
- for the `other` to match.
- compare: Column or list of columns to compare on. If both chains have
- the same columns then this column is enough for the compare. Otherwise,
- `right_compare` parameter has to specify the columns for the other
- chain. This value is used to see if row is modified or same. If
- not set, all columns will be used for comparison
- right_compare: Optional column or list of columns
- for the `other` to compare to.
- added (bool): Whether to return added rows in resulting chain.
- deleted (bool): Whether to return deleted rows in resulting chain.
- modified (bool): Whether to return modified rows in resulting chain.
- same (bool): Whether to return unchanged rows in resulting chain.
- status_col (str): Name of the new column that is created in resulting chain
- representing diff status.
+ other: Chain to compare against.
+ on: Column(s) to match records between chains.
+ right_on: Column(s) in the other chain to match against. Defaults to `on`.
+ compare: Column(s) to check for changes.
+ If not specified, all columns are used.
+ right_compare: Column(s) in the other chain to compare against.
+ Defaults to values of `compare`.
+ added (bool): Include records that exist in this chain but not in the other.
+ deleted (bool): Include records that exist only in the other chain.
+ modified (bool): Include records that exist in both
+ but have different values.
+ same (bool): Include records that are identical in both chains.
+ status_col (str): Name for the status column showing differences.
+
+ Default behavior: By default, shows added, deleted, and modified records,
+ but excludes unchanged records (same=False). Status column is not created.

  Example:
  ```py
- res = persons.compare(
+ res = persons.diff(
  new_persons,
  on=["id"],
  right_on=["other_id"],
@@ -1555,7 +1577,7 @@ class DataChain:
  status_col=status_col,
  )

- def diff(
+ def file_diff(
  self,
  other: "DataChain",
  on: str = "file",
@@ -1566,31 +1588,29 @@ class DataChain:
  same: bool = False,
  status_col: Optional[str] = None,
  ) -> "DataChain":
- """Similar to `.compare()`, which is more generic method to calculate difference
- between two chains. Unlike `.compare()`, this method works only on those chains
- that have `File` object, or it's derivatives, in it. File `source` and `path`
- are used for matching, and file `version` and `etag` for comparing, while in
- `.compare()` user needs to provide arbitrary columns for matching and comparing.
+ """Calculate differences between two chains containing files.
+
+ This method is specifically designed for file chains. It uses file `source`
+ and `path` to match files, and file `version` and `etag` to detect changes.

  Parameters:
- other: Chain to calculate diff from.
- on: File signal to match on. If both chains have the
- same file signal then this column is enough for the match. Otherwise,
- `right_on` parameter has to specify the file signal for the other chain.
- This value is used to find corresponding row in other dataset. If not
- found there, row is considered as added (or removed if vice versa), and
- if found then row can be either modified or same.
- right_on: Optional file signal for the `other` to match.
- added (bool): Whether to return added rows in resulting chain.
- deleted (bool): Whether to return deleted rows in resulting chain.
- modified (bool): Whether to return modified rows in resulting chain.
- same (bool): Whether to return unchanged rows in resulting chain.
- status_col (str): Optional name of the new column that is created in
- resulting chain representing diff status.
+ other: Chain to compare against.
+ on: File column name in this chain. Default is "file".
+ right_on: File column name in the other chain. Defaults to `on`.
+ added (bool): Include files that exist in this chain but not in the other.
+ deleted (bool): Include files that exist only in the other chain.
+ modified (bool): Include files that exist in both but have different
+ versions/etags.
+ same (bool): Include files that are identical in both chains.
+ status_col (str): Name for the status column showing differences
+ (A=added, D=deleted, M=modified, S=same).
+
+ Default behavior: By default, includes only new files (added=True and
+ modified=True). This is useful for incremental processing.

  Example:
  ```py
- diff = images.diff(
+ diff = images.file_diff(
  new_images,
  on="file",
  right_on="other_file",
@@ -1615,7 +1635,7 @@ class DataChain:
  compare_cols = get_file_signals(on, compare_file_signals)
  right_compare_cols = get_file_signals(right_on, compare_file_signals)

- return self.compare(
+ return self.diff(
  other,
  on_cols,
  right_on=right_on_cols,
@@ -2027,7 +2047,7 @@ class DataChain:
  headers, _ = self._effective_signals_schema.get_headers_with_length()
  column_names = [".".join(filter(None, header)) for header in headers]

- results_iter = self.collect_flatten()
+ results_iter = self._leaf_values()

  with opener(path, "w", newline="") as f:
  writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2079,7 +2099,7 @@ class DataChain:
  if include_outer_list:
  # This makes the file JSON instead of JSON lines.
  f.write(b"[\n")
- for row in self.collect_flatten():
+ for row in self._leaf_values():
  if not is_first:
  if include_outer_list:
  # This makes the file JSON instead of JSON lines.
@@ -2244,7 +2264,7 @@ class DataChain:
  max_threads=num_threads or 1,
  client_config=client_config,
  )
- file_exporter.run(self.collect(signal), progress_bar)
+ file_exporter.run(self.to_values(signal), progress_bar)

  def shuffle(self) -> "Self":
  """Shuffle the rows of the chain deterministically."""
@@ -2378,3 +2398,72 @@ class DataChain:
  Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
  """
  return self._evolve(query=self._query.chunk(index, total))
+
+ def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
+ """Returns a list of rows of values, optionally limited to the specified
+ columns.
+
+ Args:
+ *cols: Limit to the specified columns. By default, all columns are selected.
+
+ Returns:
+ list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
+
+ Example:
+ Getting all rows as a list:
+ ```py
+ rows = dc.to_list()
+ print(rows)
+ ```
+
+ Getting all rows with selected columns as a list:
+ ```py
+ name_size_pairs = dc.to_list("file.path", "file.size")
+ print(name_size_pairs)
+ ```
+
+ Getting a single column as a list:
+ ```py
+ files = dc.to_list("file.path")
+ print(files) # Returns list of 1-tuples
+ ```
+ """
+ return list(self.to_iter(*cols))
+
+ def to_values(self, col: str) -> list[DataValue]:
+ """Returns a flat list of values from a single column.
+
+ Args:
+ col: The name of the column to extract values from.
+
+ Returns:
+ list[DataValue]: Returns a flat list of values from the specified column.
+
+ Example:
+ Getting all values from a single column:
+ ```py
+ file_paths = dc.to_values("file.path")
+ print(file_paths) # Returns list of strings
+ ```
+
+ Getting all file sizes:
+ ```py
+ sizes = dc.to_values("file.size")
+ print(sizes) # Returns list of integers
+ ```
+ """
+ return [row[0] for row in self.to_list(col)]
+
+ def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
+ """Make DataChain objects iterable.
+
+ Yields:
+ (tuple[DataValue, ...]): Yields tuples of all column values for each row.
+
+ Example:
+ ```py
+ for row in chain:
+ print(row)
+ ```
+ """
+ return self.to_iter()
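Taken together, the new consumption helpers split the old `collect()` behavior into explicit shapes: `to_iter()`/`to_list()` always produce tuples, `to_values()` flattens a single column, and plain iteration delegates to `to_iter()`. A short sketch (column names and values are illustrative):

```py
import datachain as dc

chain = dc.read_values(path=["a.txt", "b.txt"], size=[10, 20])

rows = chain.to_list()                 # e.g. [("a.txt", 10), ("b.txt", 20)]
pairs = chain.to_list("path", "size")  # same rows, columns made explicit
sizes = chain.to_values("size")        # [10, 20], flat values

for row in chain:                      # __iter__ delegates to to_iter()
    print(row)
```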
@@ -2,7 +2,11 @@ from collections.abc import Sequence
  from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

  from datachain.dataset import parse_dataset_name
- from datachain.error import DatasetVersionNotFoundError
+ from datachain.error import (
+ DatasetNotFoundError,
+ DatasetVersionNotFoundError,
+ ProjectNotFoundError,
+ )
  from datachain.lib.dataset_info import DatasetInfo
  from datachain.lib.file import (
  File,
@@ -33,7 +37,11 @@ def read_dataset(
  settings: Optional[dict] = None,
  fallback_to_studio: bool = True,
  delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = None,
+ delta_on: Optional[Union[str, Sequence[str]]] = (
+ "file.path",
+ "file.etag",
+ "file.version",
+ ),
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
  delta_retry: Optional[Union[bool, str]] = None,
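With the new `delta_on` default above, enabling delta processing for a file-backed dataset no longer requires listing the identifying fields by hand. A hedged sketch, assuming the source dataset has a `file` column and the result is saved as a new version of an existing dataset (all names are placeholders):

```py
import datachain as dc

chain = (
    dc.read_dataset("raw_docs", delta=True)
    # Only files that are new or changed since the previous saved version are
    # reprocessed; matching uses the default file.path/file.etag/file.version keys.
    .map(length=lambda file: len(file.read()), output=int)
    .save("docs_length")
)
```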
@@ -53,41 +61,25 @@ def read_dataset(
  settings : Settings to use for the chain.
  fallback_to_studio : Try to pull dataset from Studio if not found locally.
  Default is True.
- delta: If set to True, we optimize the creation of new dataset versions by
- calculating the diff between the latest version of this storage and the
- version used to create the most recent version of the resulting chain
- dataset (the one specified in `.save()`). We then run the "diff" chain
- using only the diff data, rather than the entire storage data, and merge
- that diff chain with the latest version of the resulting dataset to create
- a new version. This approach avoids applying modifications to all records
- from storage every time, which can be an expensive operation.
- The diff is calculated using the `DataChain.compare()` method, which
- compares the `delta_on` fields to find matches and checks the compare
- fields to determine if a record has changed. Note that this process only
- considers added and modified records in storage; deleted records are not
- removed from the new dataset version.
- This calculation is based on the difference between the current version
- of the source and the version used to create the dataset.
- delta_on: A list of fields that uniquely identify rows in the source.
- If two rows have the same values, they are considered the same (e.g., they
- could be different versions of the same row in a versioned source).
- This is used in the delta update to calculate the diff.
- delta_result_on: A list of fields in the resulting dataset that correspond
- to the `delta_on` fields from the source.
- This is needed to identify rows that have changed in the source but are
- already present in the current version of the resulting dataset, in order
- to avoid including outdated versions of those rows in the new dataset.
- We retain only the latest versions of rows to prevent duplication.
- There is no need to define this if the `delta_on` fields are present in
- the final dataset and have not been renamed.
- delta_compare: A list of fields used to check if the same row has been modified
- in the new version of the source.
- If not defined, all fields except those defined in delta_on will be used.
- delta_retry: Specifies retry behavior for delta processing. If a string,
- it's the name of a field in the result dataset that indicates an error
- when not None - records with errors will be reprocessed. If True,
- records that exist in the source dataset but not in the result dataset
- will be reprocessed.
+ delta: If True, only process new or changed files instead of reprocessing
+ everything. This saves time by skipping files that were already processed in
+ previous versions. The optimization is working when a new version of the
+ dataset is created.
+ Default is False.
+ delta_on: Field(s) that uniquely identify each record in the source data.
+ Used to detect which records are new or changed.
+ Default is ("file.path", "file.etag", "file.version").
+ delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+ Only needed if you rename the identifying fields during processing.
+ Default is None.
+ delta_compare: Field(s) used to detect if a record has changed.
+ If not specified, all fields except `delta_on` fields are used.
+ Default is None.
+ delta_retry: Controls retry behavior for failed records:
+ - String (field name): Reprocess records where this field is not empty
+ (error mode)
+ - True: Reprocess records missing from the result dataset (missing mode)
+ - None: No retry processing (default)

  Example:
  ```py
@@ -148,9 +140,15 @@ def read_dataset(
  # all 2.* dataset versions). If dataset doesn't have any versions where
  # major part is equal to that input, exception is thrown.
  major = int(version)
- dataset = session.catalog.get_dataset(
- name, get_project(project_name, namespace_name, session=session)
- )
+ try:
+ ds_project = get_project(project_name, namespace_name, session=session)
+ except ProjectNotFoundError:
+ raise DatasetNotFoundError(
+ f"Dataset {name} not found in namespace {namespace_name} and",
+ f" project {project_name}",
+ ) from None
+
+ dataset = session.catalog.get_dataset(name, ds_project)
  latest_major = dataset.latest_major_version(major)
  if not latest_major:
  raise DatasetVersionNotFoundError(
@@ -228,7 +226,7 @@ def datasets(
  import datachain as dc

  chain = dc.datasets(column="dataset")
- for ds in chain.collect("dataset"):
+ for ds in chain.to_iter("dataset"):
  print(f"{ds.name}@v{ds.version}")
  ```
  """
@@ -333,7 +331,13 @@ def delete_dataset(
  None, name, namespace_name, project_name, version=version, force=force
  )

- ds_project = get_project(project_name, namespace_name, session=session)
+ try:
+ ds_project = get_project(project_name, namespace_name, session=session)
+ except ProjectNotFoundError:
+ raise DatasetNotFoundError(
+ f"Dataset {name} not found in namespace {namespace_name} and project",
+ f" {project_name}",
+ ) from None

  if not force:
  version = version or catalog.get_dataset(name, ds_project).latest_version