datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (49)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +213 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +35 -26
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +433 -37
  12. datachain/data_storage/sqlite.py +140 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +128 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +253 -91
  21. datachain/lib/dc/datasets.py +103 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +2 -1
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/signal_schema.py +8 -0
  33. datachain/lib/tar.py +1 -2
  34. datachain/lib/udf.py +1 -1
  35. datachain/lib/udf_signature.py +1 -1
  36. datachain/lib/webdataset.py +30 -20
  37. datachain/listing.py +3 -1
  38. datachain/namespace.py +65 -0
  39. datachain/project.py +78 -0
  40. datachain/query/dataset.py +71 -46
  41. datachain/query/session.py +1 -1
  42. datachain/remote/studio.py +61 -26
  43. datachain/studio.py +23 -6
  44. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
  45. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
  46. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
  49. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
datachain/func/func.py CHANGED
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
 ColT = Union[str, Column, ColumnElement, "Func", tuple]
 
 
-class Func(Function):
+class Func(Function):  # noqa: PLW1641
     """Represents a function to be applied to a column in a SQL query."""
 
     def __init__(
datachain/lib/arrow.py CHANGED
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
             fs_path = file.path
             fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-            fs, fs_path = file.get_fs(), file.get_path()
+            fs, fs_path = file.get_fs(), file.get_fs_path()
 
         kwargs = self.kwargs
         if format := kwargs.get("format"):
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
         kwargs["format"] = fix_pyarrow_format(format, parse_options)
 
     schemas = []
-    for file in chain.collect("file"):
-        ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+    for (file,) in chain.to_iter("file"):
+        ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
         schemas.append(ds.schema)
     if not schemas:
         raise ValueError(
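
Note: the `collect` → `to_iter` swap above is the pattern repeated throughout this release. `to_iter` always yields tuples, even for a single column, hence the `(file,)` unpacking. A minimal sketch of the calling convention, assuming a chain with a `file` column:

```py
# to_iter() yields 1-tuples for single-column selections
for (file,) in chain.to_iter("file"):
    print(file.path)

# the deprecated collect() yielded bare values for a single column:
# for file in chain.collect("file"):
#     print(file.path)
```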
datachain/lib/dataset_info.py CHANGED
@@ -22,6 +22,8 @@ if TYPE_CHECKING:
 
 class DatasetInfo(DataModel):
     name: str
+    namespace: str
+    project: str
     uuid: str = Field(default=str(uuid4()))
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
@@ -91,6 +93,8 @@ class DatasetInfo(DataModel):
         return cls(
             uuid=version.uuid,
             name=dataset.name,
+            namespace=dataset.project.namespace.name,
+            project=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
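
`DatasetInfo` now carries `namespace` and `project` alongside `name`. A hedged sketch of reading the new fields, assuming the `dc.datasets()` helper exposes `DatasetInfo` objects under a `dataset` column (the column name here is an assumption):

```py
import datachain as dc

# list datasets together with their new namespace/project fields
for (info,) in dc.datasets().to_iter("dataset"):  # "dataset" column assumed
    print(info.namespace, info.project, info.name, info.version)
```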
datachain/lib/dc/datachain.py CHANGED
@@ -26,6 +26,7 @@ from tqdm import tqdm
 from datachain import semver
 from datachain.dataset import DatasetRecord
 from datachain.delta import delta_disabled
+from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -261,7 +262,7 @@ class DataChain:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
-        return self.session.catalog.get_dataset(self.name)
+        return self.session.catalog.get_dataset(self.name, self._query.project)
 
     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
@@ -312,6 +313,8 @@ class DataChain:
         min_task_size=None,
         prefetch: Optional[int] = None,
         sys: Optional[bool] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """Change settings for chain.
 
@@ -327,6 +330,8 @@ class DataChain:
             prefetch: number of workers to use for downloading files in advance.
                 This is enabled by default and uses 2 workers.
                 To disable prefetching, set it to 0.
+            namespace: namespace name.
+            project: project name.
 
         Example:
             ```py
@@ -340,7 +345,11 @@ class DataChain:
         if sys is None:
             sys = self._sys
         settings = copy.copy(self._settings)
-        settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
+        settings.add(
+            Settings(
+                cache, parallel, workers, min_task_size, prefetch, namespace, project
+            )
+        )
         return self._evolve(settings=settings, _sys=sys)
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
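
With `Settings` accepting the two new fields, a namespace and project can be pinned per chain. A minimal sketch, with an illustrative bucket path and names:

```py
import datachain as dc

chain = (
    dc.read_storage("s3://bucket/images/")         # illustrative source
    .settings(namespace="dev", project="clothes")  # new in this release
)
```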
@@ -430,10 +439,10 @@ class DataChain:
 
         from datachain.lib.arrow import schema_to_output
 
-        json_values = list(self.limit(schema_sample_size).collect(col))
+        json_values = self.limit(schema_sample_size).to_list(col)
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for json_value in json_values
+            for (json_value,) in json_values
         ]
 
         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -490,6 +499,22 @@ class DataChain:
         )
         return listings(*args, **kwargs)
 
+    @property
+    def namespace_name(self) -> str:
+        """Current namespace name in which the chain is running"""
+        return (
+            self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+
+    @property
+    def project_name(self) -> str:
+        """Current project name in which the chain is running"""
+        return (
+            self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
+
     def persist(self) -> "Self":
         """Saves temporary chain that will be removed after the process ends.
         Temporary datasets are useful for optimization, for example when we have
@@ -499,7 +524,14 @@ class DataChain:
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        return self._evolve(query=self._query.save(feature_schema=schema))
+        project = self.session.catalog.metastore.get_project(
+            self.project_name,
+            self.namespace_name,
+            create=True,
+        )
+        return self._evolve(
+            query=self._query.save(project=project, feature_schema=schema)
+        )
 
     def save(  # type: ignore[override]
         self,
@@ -513,7 +545,10 @@ class DataChain:
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:
-            name : dataset name.
+            name : dataset name. It can be a full name consisting of namespace
+                and project, but it can also be a plain dataset name, in which
+                case the namespace and project are taken from settings, if
+                defined there, or from the defaults otherwise.
             version : version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
@@ -522,6 +557,7 @@ class DataChain:
             update_version: which part of the dataset version to automatically increase.
                 Available values: `major`, `minor` or `patch`. Default is `patch`.
         """
+        catalog = self.session.catalog
         if version is not None:
             semver.validate(version)
 
@@ -535,6 +571,22 @@ class DataChain:
                 " patch"
             )
 
+        namespace_name, project_name, name = catalog.get_full_dataset_name(
+            name,
+            namespace_name=self._settings.namespace,
+            project_name=self._settings.project,
+        )
+
+        try:
+            project = self.session.catalog.metastore.get_project(
+                project_name,
+                namespace_name,
+                create=self.session.catalog.metastore.project_allowed_to_create,
+            )
+        except ProjectNotFoundError as e:
+            # the project doesn't exist and we are not allowed to create it
+            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
+
         schema = self.signals_schema.clone_without_sys_signals().serialize()
 
         # Handle retry and delta functionality
@@ -558,6 +610,7 @@ class DataChain:
                 query=result_ds._query.save(
                     name=name,
                     version=version,
+                    project=project,
                     feature_schema=schema,
                     dependencies=dependencies,
                     **kwargs,
@@ -577,6 +630,7 @@ class DataChain:
             query=self._query.save(
                 name=name,
                 version=version,
+                project=project,
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
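
Net effect of the `save()` changes: the dataset name now resolves to `namespace.project.name`, and the project is looked up (or created, where the metastore allows it) before saving. A sketch of both addressing styles, with illustrative names:

```py
# explicit full name: namespace.project.dataset
chain.save("dev.clothes.items")

# or a bare name, with namespace/project resolved from settings or defaults
chain.settings(namespace="dev", project="clothes").save("items")
```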
@@ -843,7 +897,7 @@ class DataChain:
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `collect` and `collect_flatten`
+        Using `order_by` directly before `limit`, `to_list` and similar methods
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
@@ -1048,32 +1102,32 @@ class DataChain:
 
     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like collect, to_pandas, etc."""
+        """Effective schema used for user-facing API like to_list, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema
 
     @overload
-    def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
+    def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
+    def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def collect_flatten(
+    def _leaf_values(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...
 
     @overload
-    def collect_flatten(
+    def _leaf_values(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...
 
-    def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
+    def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.
 
         Args:
@@ -1101,7 +1155,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.collect_flatten()
+        results_iter = self._leaf_values()
 
         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1134,9 +1188,9 @@ class DataChain:
 
     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self.collect_flatten(include_hidden=include_hidden))
+            return list(self._leaf_values(include_hidden=include_hidden))
         return list(
-            self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
+            self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
         )
 
     def to_records(self) -> list[dict[str, Any]]:
@@ -1147,42 +1201,38 @@ class DataChain:
 
         return self.results(row_factory=to_dict)
 
-    @overload
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+    def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
         """Yields rows of values, optionally limited to the specified columns.
 
         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.
 
         Yields:
-            (DataType): Yields a single item if a column is selected.
-            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
-                selected.
+            (tuple[DataType, ...]): Yields a tuple of items for each row.
 
         Example:
             Iterating over all rows:
             ```py
-            for row in dc.collect():
+            for row in ds.to_iter():
+                print(row)
+            ```
+
+            DataChain is iterable and can be used in a for loop directly, which is
+            equivalent to `ds.to_iter()`:
+            ```py
+            for row in ds:
                 print(row)
             ```
 
             Iterating over all rows with selected columns:
             ```py
-            for name, size in dc.collect("file.path", "file.size"):
+            for name, size in ds.to_iter("file.path", "file.size"):
                 print(name, size)
             ```
 
             Iterating over a single column:
             ```py
-            for file in dc.collect("file.path"):
+            for (file,) in ds.to_iter("file.path"):
                 print(file)
             ```
         """
@@ -1194,7 +1244,31 @@ class DataChain:
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
             )
-            yield ret[0] if len(cols) == 1 else tuple(ret)
+            yield tuple(ret)
+
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+        """
+        Deprecated. Use `to_iter` method instead.
+        """
+        warnings.warn(
+            "Method `collect` is deprecated. Use `to_iter` method instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        if len(cols) == 1:
+            yield from [item[0] for item in self.to_iter(*cols)]
+        else:
+            yield from self.to_iter(*cols)
 
     def to_pytorch(
         self,
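
Since `collect` now only warns and delegates to `to_iter`, migration is mechanical. A sketch of the old call and its new equivalents, assuming an existing `chain`:

```py
# deprecated: emits DeprecationWarning
paths = list(chain.collect("file.path"))

# replacements introduced in this release
paths = chain.to_values("file.path")            # flat list of values
rows = chain.to_list("file.path", "file.size")  # list of tuples
for path, size in chain.to_iter("file.path", "file.size"):  # lazy iterator
    print(path, size)
```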
@@ -1429,7 +1503,7 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
-    def compare(
+    def diff(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
@@ -1442,41 +1516,33 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """Comparing two chains by identifying rows that are added, deleted, modified
-        or same. Result is the new chain that has additional column with possible
-        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
-        rows respectively. Note that if only one "status" is asked, by setting proper
-        flags, this additional column is not created as it would have only one value
-        for all rows. Beside additional diff column, new chain has schema of the chain
-        on which method was called.
+        """Calculate differences between two chains.
+
+        This method identifies records that are added, deleted, modified, or unchanged
+        between two chains. It adds a status column with values: A=added, D=deleted,
+        M=modified, S=same.
 
         Parameters:
-            other: Chain to calculate diff from.
-            on: Column or list of columns to match on. If both chains have the
-                same columns then this column is enough for the match. Otherwise,
-                `right_on` parameter has to specify the columns for the other chain.
-                This value is used to find corresponding row in other dataset. If not
-                found there, row is considered as added (or removed if vice versa), and
-                if found then row can be either modified or same.
-            right_on: Optional column or list of columns
-                for the `other` to match.
-            compare: Column or list of columns to compare on. If both chains have
-                the same columns then this column is enough for the compare. Otherwise,
-                `right_compare` parameter has to specify the columns for the other
-                chain. This value is used to see if row is modified or same. If
-                not set, all columns will be used for comparison
-            right_compare: Optional column or list of columns
-                for the `other` to compare to.
-            added (bool): Whether to return added rows in resulting chain.
-            deleted (bool): Whether to return deleted rows in resulting chain.
-            modified (bool): Whether to return modified rows in resulting chain.
-            same (bool): Whether to return unchanged rows in resulting chain.
-            status_col (str): Name of the new column that is created in resulting chain
-                representing diff status.
+            other: Chain to compare against.
+            on: Column(s) to match records between chains.
+            right_on: Column(s) in the other chain to match against. Defaults to `on`.
+            compare: Column(s) to check for changes.
+                If not specified, all columns are used.
+            right_compare: Column(s) in the other chain to compare against.
+                Defaults to values of `compare`.
+            added (bool): Include records that exist in this chain but not in the other.
+            deleted (bool): Include records that exist only in the other chain.
+            modified (bool): Include records that exist in both
+                but have different values.
+            same (bool): Include records that are identical in both chains.
+            status_col (str): Name for the status column showing differences.
+
+        Default behavior: shows added, deleted, and modified records, but
+        excludes unchanged ones (same=False). The status column is not created.
 
         Example:
             ```py
-            res = persons.compare(
+            res = persons.diff(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
@@ -1505,7 +1571,7 @@ class DataChain:
             status_col=status_col,
         )
 
-    def diff(
+    def file_diff(
         self,
         other: "DataChain",
         on: str = "file",
@@ -1516,31 +1582,29 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """Similar to `.compare()`, which is more generic method to calculate difference
-        between two chains. Unlike `.compare()`, this method works only on those chains
-        that have `File` object, or it's derivatives, in it. File `source` and `path`
-        are used for matching, and file `version` and `etag` for comparing, while in
-        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+        """Calculate differences between two chains containing files.
+
+        This method is specifically designed for file chains. It uses file `source`
+        and `path` to match files, and file `version` and `etag` to detect changes.
 
         Parameters:
-            other: Chain to calculate diff from.
-            on: File signal to match on. If both chains have the
-                same file signal then this column is enough for the match. Otherwise,
-                `right_on` parameter has to specify the file signal for the other chain.
-                This value is used to find corresponding row in other dataset. If not
-                found there, row is considered as added (or removed if vice versa), and
-                if found then row can be either modified or same.
-            right_on: Optional file signal for the `other` to match.
-            added (bool): Whether to return added rows in resulting chain.
-            deleted (bool): Whether to return deleted rows in resulting chain.
-            modified (bool): Whether to return modified rows in resulting chain.
-            same (bool): Whether to return unchanged rows in resulting chain.
-            status_col (str): Optional name of the new column that is created in
-                resulting chain representing diff status.
+            other: Chain to compare against.
+            on: File column name in this chain. Default is "file".
+            right_on: File column name in the other chain. Defaults to `on`.
+            added (bool): Include files that exist in this chain but not in the other.
+            deleted (bool): Include files that exist only in the other chain.
+            modified (bool): Include files that exist in both but have different
+                versions/etags.
+            same (bool): Include files that are identical in both chains.
+            status_col (str): Name for the status column showing differences
+                (A=added, D=deleted, M=modified, S=same).
+
+        Default behavior: includes only new and changed files (added=True and
+        modified=True). This is useful for incremental processing.
 
         Example:
             ```py
-            diff = images.diff(
+            diff = images.file_diff(
                 new_images,
                 on="file",
                 right_on="other_file",
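
The two renames are easy to conflate: the generic `compare()` is now `diff()`, while the old file-oriented `diff()` is now `file_diff()`. A short sketch using the chains from the docstring examples:

```py
# 0.21.x: persons.compare(...) and images.diff(...)
changes = persons.diff(new_persons, on=["id"], status_col="diff_status")
new_files = images.file_diff(new_images, on="file")  # added + modified by default
```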
@@ -1565,7 +1629,7 @@ class DataChain:
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)
 
-        return self.compare(
+        return self.diff(
             other,
             on_cols,
             right_on=right_on_cols,
@@ -1977,7 +2041,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.collect_flatten()
+        results_iter = self._leaf_values()
 
         with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2029,7 +2093,7 @@ class DataChain:
             if include_outer_list:
                 # This makes the file JSON instead of JSON lines.
                 f.write(b"[\n")
-            for row in self.collect_flatten():
+            for row in self._leaf_values():
                 if not is_first:
                     if include_outer_list:
                         # This makes the file JSON instead of JSON lines.
@@ -2194,7 +2258,7 @@ class DataChain:
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.collect(signal), progress_bar)
+        file_exporter.run(self.to_values(signal), progress_bar)
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
@@ -2239,16 +2303,45 @@ class DataChain:
 
         Combining filters with "or"
         ```py
-        dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*))
+        dc.filter(
+            C("file.path").glob("cat*") |
+            C("file.path").glob("dog*")
+        )
+        ```
+
+        ```py
+        dc.filter(dc.func.or_(
+            C("file.path").glob("cat*"),
+            C("file.path").glob("dog*")
+        ))
         ```
 
         Combining filters with "and"
         ```py
         dc.filter(
-            C("file.path").glob("*.jpg) &
+            C("file.path").glob("*.jpg"),
+            string.length(C("file.path")) > 5
+        )
+        ```
+
+        ```py
+        dc.filter(
+            C("file.path").glob("*.jpg") &
             (string.length(C("file.path")) > 5)
         )
         ```
+
+        ```py
+        dc.filter(dc.func.and_(
+            C("file.path").glob("*.jpg"),
+            string.length(C("file.path")) > 5
+        ))
+        ```
+
+        Combining filters with "not"
+        ```py
+        dc.filter(~(C("file.path").glob("*.jpg")))
+        ```
         """
         return self._evolve(query=self._query.filter(*args))
 
@@ -2299,3 +2392,72 @@ class DataChain:
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
+
+    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
+        """Returns a list of rows of values, optionally limited to the specified
+        columns.
+
+        Args:
+            *cols: Limit to the specified columns. By default, all columns are selected.
+
+        Returns:
+            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
+
+        Example:
+            Getting all rows as a list:
+            ```py
+            rows = dc.to_list()
+            print(rows)
+            ```
+
+            Getting all rows with selected columns as a list:
+            ```py
+            name_size_pairs = dc.to_list("file.path", "file.size")
+            print(name_size_pairs)
+            ```
+
+            Getting a single column as a list:
+            ```py
+            files = dc.to_list("file.path")
+            print(files)  # Returns list of 1-tuples
+            ```
+        """
+        return list(self.to_iter(*cols))
+
+    def to_values(self, col: str) -> list[DataValue]:
+        """Returns a flat list of values from a single column.
+
+        Args:
+            col: The name of the column to extract values from.
+
+        Returns:
+            list[DataValue]: Returns a flat list of values from the specified column.
+
+        Example:
+            Getting all values from a single column:
+            ```py
+            file_paths = dc.to_values("file.path")
+            print(file_paths)  # Returns list of strings
+            ```
+
+            Getting all file sizes:
+            ```py
+            sizes = dc.to_values("file.size")
+            print(sizes)  # Returns list of integers
+            ```
+        """
+        return [row[0] for row in self.to_list(col)]
+
+    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
+        """Make DataChain objects iterable.
+
+        Yields:
+            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
+
+        Example:
+            ```py
+            for row in chain:
+                print(row)
+            ```
+        """
+        return self.to_iter()