datachain 0.21.0__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +180 -65
  4. datachain/cli/__init__.py +4 -9
  5. datachain/cli/commands/datasets.py +43 -28
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +422 -37
  12. datachain/data_storage/sqlite.py +136 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +126 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +260 -92
  21. datachain/lib/dc/datasets.py +104 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +1 -0
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/tar.py +1 -2
  33. datachain/lib/udf.py +1 -1
  34. datachain/lib/udf_signature.py +1 -1
  35. datachain/lib/webdataset.py +30 -20
  36. datachain/listing.py +3 -1
  37. datachain/namespace.py +65 -0
  38. datachain/project.py +78 -0
  39. datachain/query/dataset.py +71 -46
  40. datachain/query/session.py +1 -1
  41. datachain/remote/studio.py +61 -26
  42. datachain/studio.py +36 -10
  43. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
  44. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
  45. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
  46. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
  47. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
  48. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
datachain/func/func.py CHANGED

@@ -25,7 +25,7 @@ if TYPE_CHECKING:
 ColT = Union[str, Column, ColumnElement, "Func", tuple]
 
 
-class Func(Function):
+class Func(Function):  # noqa: PLW1641
     """Represents a function to be applied to a column in a SQL query."""
 
     def __init__(
datachain/lib/arrow.py CHANGED

@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
             fs_path = file.path
             fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-            fs, fs_path = file.get_fs(), file.get_path()
+            fs, fs_path = file.get_fs(), file.get_fs_path()
 
         kwargs = self.kwargs
         if format := kwargs.get("format"):

@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
         kwargs["format"] = fix_pyarrow_format(format, parse_options)
 
     schemas = []
-    for file in chain.collect("file"):
-        ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+    for (file,) in chain.to_iter("file"):
+        ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
        schemas.append(ds.schema)
     if not schemas:
         raise ValueError(
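
The `get_path()` → `get_fs_path()` rename above pairs `get_fs()` (an fsspec filesystem) with the path that filesystem understands. A minimal sketch of the new accessor pair, mirroring the `infer_schema` hunk; the dataset name and the Parquet format are assumptions:

```py
import datachain as dc
from pyarrow.dataset import dataset

chain = dc.read_dataset("my_parquet_files")  # hypothetical dataset name
for (file,) in chain.to_iter("file"):
    # get_fs() returns the fsspec filesystem; get_fs_path() the matching path
    ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), format="parquet")
    print(ds.schema)
```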
datachain/lib/dataset_info.py CHANGED

@@ -22,6 +22,8 @@ if TYPE_CHECKING:
 
 class DatasetInfo(DataModel):
     name: str
+    namespace: str
+    project: str
     uuid: str = Field(default=str(uuid4()))
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)

@@ -91,6 +93,8 @@ class DatasetInfo(DataModel):
         return cls(
             uuid=version.uuid,
             name=dataset.name,
+            namespace=dataset.project.namespace.name,
+            project=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
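
`DatasetInfo` now carries `namespace` and `project` alongside the dataset name. A hedged sketch of reading them, assuming `dc.datasets()` still yields `DatasetInfo` objects under a `dataset` column as in earlier releases:

```py
import datachain as dc

for (info,) in dc.datasets().to_iter("dataset"):
    # namespace and project are new DatasetInfo fields in 0.22.0
    print(f"{info.namespace}.{info.project}.{info.name} v{info.version}")
```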
datachain/lib/dc/datachain.py CHANGED

@@ -24,8 +24,9 @@ from pydantic import BaseModel
 from tqdm import tqdm
 
 from datachain import semver
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, parse_dataset_name
 from datachain.delta import delta_disabled
+from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -261,7 +262,7 @@ class DataChain:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
-        return self.session.catalog.get_dataset(self.name)
+        return self.session.catalog.get_dataset(self.name, self._query.project)
 
     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
@@ -312,6 +313,8 @@ class DataChain:
         min_task_size=None,
         prefetch: Optional[int] = None,
         sys: Optional[bool] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """Change settings for chain.
 
@@ -327,6 +330,8 @@ class DataChain:
             prefetch: number of workers to use for downloading files in advance.
                 This is enabled by default and uses 2 workers.
                 To disable prefetching, set it to 0.
+            namespace: namespace name.
+            project: project name.
 
         Example:
             ```py
@@ -340,7 +345,11 @@ class DataChain:
         if sys is None:
             sys = self._sys
         settings = copy.copy(self._settings)
-        settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
+        settings.add(
+            Settings(
+                cache, parallel, workers, min_task_size, prefetch, namespace, project
+            )
+        )
         return self._evolve(settings=settings, _sys=sys)
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
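
With the two new settings, a chain can pin the namespace and project its datasets are saved into. A minimal sketch; the dataset, namespace, and project names are hypothetical:

```py
import datachain as dc

chain = (
    dc.read_dataset("my_dataset")  # hypothetical dataset name
    .settings(namespace="dev", project="analytics")  # hypothetical names
)
# save() and persist() on this chain now target the dev.analytics project
chain.save("embeddings")
```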
@@ -430,10 +439,10 @@ class DataChain:
 
         from datachain.lib.arrow import schema_to_output
 
-        json_values = list(self.limit(schema_sample_size).collect(col))
+        json_values = self.limit(schema_sample_size).to_list(col)
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for json_value in json_values
+            for (json_value,) in json_values
         ]
 
         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -490,6 +499,22 @@ class DataChain:
         )
         return listings(*args, **kwargs)
 
+    @property
+    def namespace_name(self) -> str:
+        """Current namespace name in which the chain is running"""
+        return (
+            self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+
+    @property
+    def project_name(self) -> str:
+        """Current project name in which the chain is running"""
+        return (
+            self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
+
     def persist(self) -> "Self":
         """Saves temporary chain that will be removed after the process ends.
         Temporary datasets are useful for optimization, for example when we have

@@ -499,7 +524,14 @@ class DataChain:
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        return self._evolve(query=self._query.save(feature_schema=schema))
+        project = self.session.catalog.metastore.get_project(
+            self.project_name,
+            self.namespace_name,
+            create=True,
+        )
+        return self._evolve(
+            query=self._query.save(project=project, feature_schema=schema)
+        )
 
     def save(  # type: ignore[override]
         self,
@@ -513,7 +545,10 @@ class DataChain:
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:
-            name : dataset name.
+            name : dataset name. It can be a full name that includes the
+                namespace and project, or a plain dataset name, in which case
+                the namespace and project are taken from the settings when
+                defined there, and from the defaults otherwise.
             version : version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
@@ -535,6 +570,29 @@ class DataChain:
                 " patch"
             )
 
+        namespace_name, project_name, name = parse_dataset_name(name)
+
+        namespace_name = (
+            namespace_name
+            or self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+        project_name = (
+            project_name
+            or self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
+
+        try:
+            project = self.session.catalog.metastore.get_project(
+                project_name,
+                namespace_name,
+                create=self.session.catalog.metastore.project_allowed_to_create,
+            )
+        except ProjectNotFoundError as e:
+            # cannot create the missing project as creation is not allowed
+            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
+
         schema = self.signals_schema.clone_without_sys_signals().serialize()
 
         # Handle retry and delta functionality
@@ -558,6 +616,7 @@ class DataChain:
             query=result_ds._query.save(
                 name=name,
                 version=version,
+                project=project,
                 feature_schema=schema,
                 dependencies=dependencies,
                 **kwargs,
@@ -577,6 +636,7 @@ class DataChain:
             query=self._query.save(
                 name=name,
                 version=version,
+                project=project,
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
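
Taken together, `save()` now resolves the target project from the name itself, then from the settings, then from the metastore defaults. A sketch of the three spellings, with hypothetical names:

```py
# 1. Fully qualified name: namespace and project are parsed out of it
chain.save("dev.analytics.embeddings")

# 2. Plain name, namespace/project pinned via settings
chain.settings(namespace="dev", project="analytics").save("embeddings")

# 3. Plain name alone falls back to the metastore defaults
chain.save("embeddings")

# If the project does not exist and the metastore forbids creating one,
# save() raises ProjectCreateNotAllowedError (see the hunk above).
```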
@@ -843,7 +903,7 @@ class DataChain:
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `collect` and `collect_flatten`
+        Using `order_by` directly before `limit`, `to_list` and similar methods
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
@@ -1048,32 +1108,32 @@ class DataChain:
 
     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like collect, to_pandas, etc."""
+        """Effective schema used for user-facing API like to_list, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema
 
     @overload
-    def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
+    def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
+    def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def collect_flatten(
+    def _leaf_values(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...
 
     @overload
-    def collect_flatten(
+    def _leaf_values(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...
 
-    def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
+    def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.
 
         Args:
@@ -1101,7 +1161,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.collect_flatten()
+        results_iter = self._leaf_values()
 
         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1134,9 +1194,9 @@ class DataChain:
 
     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self.collect_flatten(include_hidden=include_hidden))
+            return list(self._leaf_values(include_hidden=include_hidden))
         return list(
-            self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
+            self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
         )
 
     def to_records(self) -> list[dict[str, Any]]:
@@ -1147,42 +1207,38 @@ class DataChain:
 
         return self.results(row_factory=to_dict)
 
-    @overload
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+    def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
         """Yields rows of values, optionally limited to the specified columns.
 
         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.
 
         Yields:
-            (DataType): Yields a single item if a column is selected.
-            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
-                selected.
+            (tuple[DataType, ...]): Yields a tuple of items for each row.
 
         Example:
             Iterating over all rows:
             ```py
-            for row in dc.collect():
+            for row in ds.to_iter():
+                print(row)
+            ```
+
+            DataChain is iterable and can be used in a for loop directly, which
+            is equivalent to `ds.to_iter()`:
+            ```py
+            for row in ds:
                 print(row)
             ```
 
             Iterating over all rows with selected columns:
             ```py
-            for name, size in dc.collect("file.path", "file.size"):
+            for name, size in ds.to_iter("file.path", "file.size"):
                 print(name, size)
             ```
 
             Iterating over a single column:
             ```py
-            for file in dc.collect("file.path"):
+            for (file,) in ds.to_iter("file.path"):
                 print(file)
             ```
         """
@@ -1194,7 +1250,31 @@ class DataChain:
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
             )
-            yield ret[0] if len(cols) == 1 else tuple(ret)
+            yield tuple(ret)
+
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+        """
+        Deprecated. Use `to_iter` method instead.
+        """
+        warnings.warn(
+            "Method `collect` is deprecated. Use `to_iter` method instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        if len(cols) == 1:
+            yield from [item[0] for item in self.to_iter(*cols)]
+        else:
+            yield from self.to_iter(*cols)
 
     def to_pytorch(
         self,
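
The migration from `collect` is mechanical, with one catch: `to_iter` always yields tuples, even when a single column is selected. A sketch:

```py
# Deprecated: emits a DeprecationWarning and unwraps single columns
for path in chain.collect("file.path"):
    print(path)

# Replacement: always tuples, so unpack explicitly
for (path,) in chain.to_iter("file.path"):
    print(path)
```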
@@ -1429,7 +1509,7 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
-    def compare(
+    def diff(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
@@ -1442,41 +1522,33 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """Comparing two chains by identifying rows that are added, deleted, modified
-        or same. Result is the new chain that has additional column with possible
-        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
-        rows respectively. Note that if only one "status" is asked, by setting proper
-        flags, this additional column is not created as it would have only one value
-        for all rows. Beside additional diff column, new chain has schema of the chain
-        on which method was called.
+        """Calculate differences between two chains.
+
+        This method identifies records that are added, deleted, modified, or unchanged
+        between two chains. It adds a status column with values: A=added, D=deleted,
+        M=modified, S=same.
 
         Parameters:
-            other: Chain to calculate diff from.
-            on: Column or list of columns to match on. If both chains have the
-                same columns then this column is enough for the match. Otherwise,
-                `right_on` parameter has to specify the columns for the other chain.
-                This value is used to find corresponding row in other dataset. If not
-                found there, row is considered as added (or removed if vice versa), and
-                if found then row can be either modified or same.
-            right_on: Optional column or list of columns
-                for the `other` to match.
-            compare: Column or list of columns to compare on. If both chains have
-                the same columns then this column is enough for the compare. Otherwise,
-                `right_compare` parameter has to specify the columns for the other
-                chain. This value is used to see if row is modified or same. If
-                not set, all columns will be used for comparison
-            right_compare: Optional column or list of columns
-                for the `other` to compare to.
-            added (bool): Whether to return added rows in resulting chain.
-            deleted (bool): Whether to return deleted rows in resulting chain.
-            modified (bool): Whether to return modified rows in resulting chain.
-            same (bool): Whether to return unchanged rows in resulting chain.
-            status_col (str): Name of the new column that is created in resulting chain
-                representing diff status.
+            other: Chain to compare against.
+            on: Column(s) to match records between chains.
+            right_on: Column(s) in the other chain to match against. Defaults to `on`.
+            compare: Column(s) to check for changes.
+                If not specified, all columns are used.
+            right_compare: Column(s) in the other chain to compare against.
+                Defaults to values of `compare`.
+            added (bool): Include records that exist in this chain but not in the other.
+            deleted (bool): Include records that exist only in the other chain.
+            modified (bool): Include records that exist in both
+                but have different values.
+            same (bool): Include records that are identical in both chains.
+            status_col (str): Name for the status column showing differences.
+
+        Default behavior: shows added, deleted, and modified records, excluding
+        unchanged ones (same=False). The status column requires `status_col`.
 
         Example:
             ```py
-            res = persons.compare(
+            res = persons.diff(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
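
For orientation, a usage sketch of the renamed method with two hypothetical chains (`persons` and `new_persons`, matched on an `id` column):

```py
changes = persons.diff(
    new_persons,
    on=["id"],
    same=True,            # also keep unchanged records
    status_col="change",  # adds a column with A/D/M/S per record
)
```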
@@ -1505,7 +1577,7 @@ class DataChain:
             status_col=status_col,
         )
 
-    def diff(
+    def file_diff(
         self,
         other: "DataChain",
         on: str = "file",
@@ -1516,31 +1588,29 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """Similar to `.compare()`, which is more generic method to calculate difference
-        between two chains. Unlike `.compare()`, this method works only on those chains
-        that have `File` object, or it's derivatives, in it. File `source` and `path`
-        are used for matching, and file `version` and `etag` for comparing, while in
-        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+        """Calculate differences between two chains containing files.
+
+        This method is specifically designed for file chains. It uses file `source`
+        and `path` to match files, and file `version` and `etag` to detect changes.
 
         Parameters:
-            other: Chain to calculate diff from.
-            on: File signal to match on. If both chains have the
-                same file signal then this column is enough for the match. Otherwise,
-                `right_on` parameter has to specify the file signal for the other chain.
-                This value is used to find corresponding row in other dataset. If not
-                found there, row is considered as added (or removed if vice versa), and
-                if found then row can be either modified or same.
-            right_on: Optional file signal for the `other` to match.
-            added (bool): Whether to return added rows in resulting chain.
-            deleted (bool): Whether to return deleted rows in resulting chain.
-            modified (bool): Whether to return modified rows in resulting chain.
-            same (bool): Whether to return unchanged rows in resulting chain.
-            status_col (str): Optional name of the new column that is created in
-                resulting chain representing diff status.
+            other: Chain to compare against.
+            on: File column name in this chain. Default is "file".
+            right_on: File column name in the other chain. Defaults to `on`.
+            added (bool): Include files that exist in this chain but not in the other.
+            deleted (bool): Include files that exist only in the other chain.
+            modified (bool): Include files that exist in both but have different
+                versions/etags.
+            same (bool): Include files that are identical in both chains.
+            status_col (str): Name for the status column showing differences
+                (A=added, D=deleted, M=modified, S=same).
+
+        Default behavior: includes only new files (added=True and
+        modified=True). This is useful for incremental processing.
 
         Example:
             ```py
-            diff = images.diff(
+            diff = images.file_diff(
                 new_images,
                 on="file",
                 right_on="other_file",
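
Since the defaults keep only added and modified files, `file_diff` drops straight into incremental pipelines. A hedged sketch with hypothetical chain and dataset names:

```py
# Only files that are new or changed relative to the already-processed set
to_process = all_images.file_diff(processed_images, on="file")
to_process.save("images_to_process")  # hypothetical dataset name
```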
@@ -1565,7 +1635,7 @@ class DataChain:
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)
 
-        return self.compare(
+        return self.diff(
             other,
             on_cols,
             right_on=right_on_cols,
@@ -1977,7 +2047,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.collect_flatten()
+        results_iter = self._leaf_values()
 
         with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2029,7 +2099,7 @@ class DataChain:
             if include_outer_list:
                 # This makes the file JSON instead of JSON lines.
                 f.write(b"[\n")
-            for row in self.collect_flatten():
+            for row in self._leaf_values():
                 if not is_first:
                     if include_outer_list:
                         # This makes the file JSON instead of JSON lines.
@@ -2194,7 +2264,7 @@ class DataChain:
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.collect(signal), progress_bar)
+        file_exporter.run(self.to_values(signal), progress_bar)
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
@@ -2239,16 +2309,45 @@ class DataChain:
 
             Combining filters with "or"
             ```py
-            dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*))
+            dc.filter(
+                C("file.path").glob("cat*") |
+                C("file.path").glob("dog*")
+            )
+            ```
+
+            ```py
+            dc.filter(dc.func.or_(
+                C("file.path").glob("cat*"),
+                C("file.path").glob("dog*")
+            ))
             ```
 
             Combining filters with "and"
             ```py
             dc.filter(
-                C("file.path").glob("*.jpg) &
+                C("file.path").glob("*.jpg"),
+                string.length(C("file.path")) > 5
+            )
+            ```
+
+            ```py
+            dc.filter(
+                C("file.path").glob("*.jpg") &
                 (string.length(C("file.path")) > 5)
             )
             ```
+
+            ```py
+            dc.filter(dc.func.and_(
+                C("file.path").glob("*.jpg"),
+                string.length(C("file.path")) > 5
+            ))
+            ```
+
+            Combining filters with "not"
+            ```py
+            dc.filter(~(C("file.path").glob("*.jpg")))
+            ```
         """
         return self._evolve(query=self._query.filter(*args))
 
@@ -2299,3 +2398,72 @@ class DataChain:
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
+
+    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
+        """Returns a list of rows of values, optionally limited to the specified
+        columns.
+
+        Args:
+            *cols: Limit to the specified columns. By default, all columns are
+                selected.
+
+        Returns:
+            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
+
+        Example:
+            Getting all rows as a list:
+            ```py
+            rows = dc.to_list()
+            print(rows)
+            ```
+
+            Getting all rows with selected columns as a list:
+            ```py
+            name_size_pairs = dc.to_list("file.path", "file.size")
+            print(name_size_pairs)
+            ```
+
+            Getting a single column as a list:
+            ```py
+            files = dc.to_list("file.path")
+            print(files)  # Returns list of 1-tuples
+            ```
+        """
+        return list(self.to_iter(*cols))
+
+    def to_values(self, col: str) -> list[DataValue]:
+        """Returns a flat list of values from a single column.
+
+        Args:
+            col: The name of the column to extract values from.
+
+        Returns:
+            list[DataValue]: Returns a flat list of values from the specified column.
+
+        Example:
+            Getting all values from a single column:
+            ```py
+            file_paths = dc.to_values("file.path")
+            print(file_paths)  # Returns list of strings
+            ```
+
+            Getting all file sizes:
+            ```py
+            sizes = dc.to_values("file.size")
+            print(sizes)  # Returns list of integers
+            ```
+        """
+        return [row[0] for row in self.to_list(col)]
+
+    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
+        """Make DataChain objects iterable.
+
+        Yields:
+            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
+
+        Example:
+            ```py
+            for row in chain:
+                print(row)
+            ```
+        """
+        return self.to_iter()
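
The three additions cover the common retrieval patterns. A quick sketch, assuming a chain with a `file` column:

```py
paths = chain.to_values("file.path")            # flat list of values
rows = chain.to_list("file.path", "file.size")  # list of tuples
for path, size in rows:
    print(path, size)

# __iter__ delegates to to_iter(), so a chain works directly in a for loop
for row in chain:
    print(row)
```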