datachain 0.20.4__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

Files changed (47)
  1. datachain/__init__.py +0 -2
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +65 -180
  4. datachain/cli/__init__.py +7 -0
  5. datachain/cli/commands/datasets.py +28 -43
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +35 -1
  8. datachain/client/fsspec.py +3 -5
  9. datachain/client/hf.py +0 -10
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +37 -405
  12. datachain/data_storage/sqlite.py +7 -136
  13. datachain/data_storage/warehouse.py +7 -26
  14. datachain/dataset.py +12 -126
  15. datachain/delta.py +7 -11
  16. datachain/error.py +0 -36
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +0 -4
  20. datachain/lib/dc/datachain.py +92 -260
  21. datachain/lib/dc/datasets.py +50 -104
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +0 -1
  24. datachain/lib/dc/storage.py +40 -38
  25. datachain/lib/file.py +23 -77
  26. datachain/lib/listing.py +1 -3
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/pytorch.py +1 -1
  29. datachain/lib/settings.py +0 -10
  30. datachain/lib/tar.py +2 -1
  31. datachain/lib/udf_signature.py +1 -1
  32. datachain/lib/webdataset.py +20 -30
  33. datachain/listing.py +1 -3
  34. datachain/query/dataset.py +46 -71
  35. datachain/query/session.py +1 -1
  36. datachain/remote/studio.py +26 -61
  37. datachain/studio.py +7 -23
  38. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
  39. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
  40. datachain/lib/namespaces.py +0 -71
  41. datachain/lib/projects.py +0 -86
  42. datachain/namespace.py +0 -65
  43. datachain/project.py +0 -78
  44. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
  45. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
  47. {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/func/func.py CHANGED
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
 ColT = Union[str, Column, ColumnElement, "Func", tuple]
 
 
-class Func(Function):  # noqa: PLW1641
+class Func(Function):
     """Represents a function to be applied to a column in a SQL query."""
 
     def __init__(
datachain/lib/arrow.py CHANGED
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
             fs_path = file.path
             fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-            fs, fs_path = file.get_fs(), file.get_fs_path()
+            fs, fs_path = file.get_fs(), file.get_path()
 
         kwargs = self.kwargs
         if format := kwargs.get("format"):
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
         kwargs["format"] = fix_pyarrow_format(format, parse_options)
 
     schemas = []
-    for (file,) in chain.to_iter("file"):
-        ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+    for file in chain.collect("file"):
+        ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
         schemas.append(ds.schema)
    if not schemas:
        raise ValueError(
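
The user-visible difference in this hunk is the iteration contract: `to_iter("file")` in 0.20.4 yields 1-tuples that must be unpacked, while `collect("file")` in 0.21.0 yields bare values when a single column is selected. A minimal sketch of the two call styles (`chain` is a hypothetical `DataChain` with a `file` column):

```py
# 0.20.4: to_iter always yields tuples, so a single column needs unpacking.
for (file,) in chain.to_iter("file"):
    print(file.path)

# 0.21.0: collect yields the bare value when exactly one column is selected.
for file in chain.collect("file"):
    print(file.path)
```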
datachain/lib/dataset_info.py CHANGED
@@ -22,8 +22,6 @@ if TYPE_CHECKING:
 
 class DatasetInfo(DataModel):
     name: str
-    namespace: str
-    project: str
     uuid: str = Field(default=str(uuid4()))
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
@@ -93,8 +91,6 @@ class DatasetInfo(DataModel):
         return cls(
             uuid=version.uuid,
             name=dataset.name,
-            namespace=dataset.project.namespace.name,
-            project=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
datachain/lib/dc/datachain.py CHANGED
@@ -24,9 +24,8 @@ from pydantic import BaseModel
 from tqdm import tqdm
 
 from datachain import semver
-from datachain.dataset import DatasetRecord, parse_dataset_name
+from datachain.dataset import DatasetRecord
 from datachain.delta import delta_disabled
-from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -262,7 +261,7 @@ class DataChain:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
-        return self.session.catalog.get_dataset(self.name, self._query.project)
+        return self.session.catalog.get_dataset(self.name)
 
     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
@@ -313,8 +312,6 @@ class DataChain:
         min_task_size=None,
         prefetch: Optional[int] = None,
         sys: Optional[bool] = None,
-        namespace: Optional[str] = None,
-        project: Optional[str] = None,
     ) -> "Self":
         """Change settings for chain.
 
@@ -330,8 +327,6 @@
             prefetch: number of workers to use for downloading files in advance.
                 This is enabled by default and uses 2 workers.
                 To disable prefetching, set it to 0.
-            namespace: namespace name.
-            project: project name.
 
         Example:
             ```py
@@ -345,11 +340,7 @@
         if sys is None:
             sys = self._sys
         settings = copy.copy(self._settings)
-        settings.add(
-            Settings(
-                cache, parallel, workers, min_task_size, prefetch, namespace, project
-            )
-        )
+        settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
         return self._evolve(settings=settings, _sys=sys)
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
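
The same removal shows up twice in this hunk: the `namespace`/`project` parameters disappear from `.settings()`, and the `Settings(...)` constructor call loses its two trailing arguments. A hedged sketch of the call sites on each side of the change, assuming the earlier parameters (`cache`, `parallel`) keep the names visible in the `Settings(...)` call:

```py
# 0.20.4: namespace/project could be routed through chain settings.
chain = chain.settings(cache=True, prefetch=4, namespace="dev", project="media")

# 0.21.0: only execution-related options remain.
chain = chain.settings(cache=True, parallel=8, prefetch=4)
```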
@@ -439,10 +430,10 @@ class DataChain:
 
         from datachain.lib.arrow import schema_to_output
 
-        json_values = self.limit(schema_sample_size).to_list(col)
+        json_values = list(self.limit(schema_sample_size).collect(col))
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for (json_value,) in json_values
+            for json_value in json_values
         ]
 
         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -499,22 +490,6 @@
         )
         return listings(*args, **kwargs)
 
-    @property
-    def namespace_name(self) -> str:
-        """Current namespace name in which the chain is running"""
-        return (
-            self._settings.namespace
-            or self.session.catalog.metastore.default_namespace_name
-        )
-
-    @property
-    def project_name(self) -> str:
-        """Current project name in which the chain is running"""
-        return (
-            self._settings.project
-            or self.session.catalog.metastore.default_project_name
-        )
-
     def persist(self) -> "Self":
         """Saves temporary chain that will be removed after the process ends.
         Temporary datasets are useful for optimization, for example when we have
@@ -524,14 +499,7 @@
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        project = self.session.catalog.metastore.get_project(
-            self.project_name,
-            self.namespace_name,
-            create=True,
-        )
-        return self._evolve(
-            query=self._query.save(project=project, feature_schema=schema)
-        )
+        return self._evolve(query=self._query.save(feature_schema=schema))
 
     def save(  # type: ignore[override]
         self,
@@ -545,10 +513,7 @@
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:
-            name : dataset name. It can be full name consisting of namespace and
-                project, but it can also be just a regular dataset name in which
-                case we are taking namespace and project from settings, if they
-                are defined there, or default ones instead.
+            name : dataset name.
             version : version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
@@ -570,29 +535,6 @@
                 " patch"
             )
 
-        namespace_name, project_name, name = parse_dataset_name(name)
-
-        namespace_name = (
-            namespace_name
-            or self._settings.namespace
-            or self.session.catalog.metastore.default_namespace_name
-        )
-        project_name = (
-            project_name
-            or self._settings.project
-            or self.session.catalog.metastore.default_project_name
-        )
-
-        try:
-            project = self.session.catalog.metastore.get_project(
-                project_name,
-                namespace_name,
-                create=self.session.catalog.metastore.project_allowed_to_create,
-            )
-        except ProjectNotFoundError as e:
-            # not being able to create it as creation is not allowed
-            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
-
         schema = self.signals_schema.clone_without_sys_signals().serialize()
 
         # Handle retry and delta functionality
@@ -616,7 +558,6 @@
             query=result_ds._query.save(
                 name=name,
                 version=version,
-                project=project,
                 feature_schema=schema,
                 dependencies=dependencies,
                 **kwargs,
@@ -636,7 +577,6 @@
             query=self._query.save(
                 name=name,
                 version=version,
-                project=project,
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
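
Taken together, these hunks strip namespace/project resolution out of `save()`: in 0.20.4 a fully qualified name was split by `parse_dataset_name` and a missing project could raise `ProjectCreateNotAllowedError`, while in 0.21.0 the name is passed through as-is. A sketch under that reading (the `namespace.project.name` layout is inferred from the removed docstring; the names are illustrative):

```py
# 0.20.4: qualified names were parsed into namespace/project/name, and the
# project was looked up or created, possibly raising ProjectCreateNotAllowedError.
chain.save("dev.media.images")

# 0.21.0: the dataset name is just a name; no project lookup happens.
chain.save("images", version="1.2.3")
```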
@@ -903,7 +843,7 @@
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `to_list` and similar methods
+        Using `order_by` directly before `limit`, `collect` and `collect_flatten`
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
@@ -1108,32 +1048,32 @@
 
     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like to_list, to_pandas, etc."""
+        """Effective schema used for user-facing API like collect, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema
 
     @overload
-    def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...
+    def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
+    def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def _leaf_values(
+    def collect_flatten(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...
 
     @overload
-    def _leaf_values(
+    def collect_flatten(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...
 
-    def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
+    def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.
 
         Args:
@@ -1161,7 +1101,7 @@
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self._leaf_values()
+        results_iter = self.collect_flatten()
 
         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1194,9 +1134,9 @@
 
     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self._leaf_values(include_hidden=include_hidden))
+            return list(self.collect_flatten(include_hidden=include_hidden))
         return list(
-            self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
+            self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
         )
 
     def to_records(self) -> list[dict[str, Any]]:
@@ -1207,38 +1147,42 @@
 
         return self.results(row_factory=to_dict)
 
-    def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
         """Yields rows of values, optionally limited to the specified columns.
 
         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.
 
         Yields:
-            (tuple[DataType, ...]): Yields a tuple of items for each row.
+            (DataType): Yields a single item if a column is selected.
+            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
+                selected.
 
         Example:
             Iterating over all rows:
             ```py
-            for row in ds.to_iter():
-                print(row)
-            ```
-
-            DataChain is iterable and can be used in a for loop directly which is
-            equivalent to `ds.to_iter()`:
-            ```py
-            for row in ds:
+            for row in dc.collect():
                 print(row)
             ```
 
             Iterating over all rows with selected columns:
             ```py
-            for name, size in ds.to_iter("file.path", "file.size"):
+            for name, size in dc.collect("file.path", "file.size"):
                 print(name, size)
             ```
 
             Iterating over a single column:
             ```py
-            for (file,) in ds.to_iter("file.path"):
+            for file in dc.collect("file.path"):
                 print(file)
             ```
         """
@@ -1250,31 +1194,7 @@
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
             )
-            yield tuple(ret)
-
-    @overload
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
-        """
-        Deprecated. Use `to_iter` method instead.
-        """
-        warnings.warn(
-            "Method `collect` is deprecated. Use `to_iter` method instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        if len(cols) == 1:
-            yield from [item[0] for item in self.to_iter(*cols)]
-        else:
-            yield from self.to_iter(*cols)
+            yield ret[0] if len(cols) == 1 else tuple(ret)
 
     def to_pytorch(
         self,
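
The restored `collect` folds the deprecation shim into the generator itself: the yield shape now depends on how many columns were requested (`yield ret[0] if len(cols) == 1 else tuple(ret)`). A short sketch of the three shapes (`chain` is hypothetical):

```py
# No columns selected: each row arrives as a tuple of all values.
for row in chain.collect():
    print(row)

# Exactly one column: bare values, no 1-tuple unpacking.
for path in chain.collect("file.path"):
    print(path)

# Multiple columns: tuples again, in the requested order.
for path, size in chain.collect("file.path", "file.size"):
    print(path, size)
```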
@@ -1509,7 +1429,7 @@
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
-    def diff(
+    def compare(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
@@ -1522,33 +1442,41 @@
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """Calculate differences between two chains.
-
-        This method identifies records that are added, deleted, modified, or unchanged
-        between two chains. It adds a status column with values: A=added, D=deleted,
-        M=modified, S=same.
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
 
         Parameters:
-            other: Chain to compare against.
-            on: Column(s) to match records between chains.
-            right_on: Column(s) in the other chain to match against. Defaults to `on`.
-            compare: Column(s) to check for changes.
-                If not specified, all columns are used.
-            right_compare: Column(s) in the other chain to compare against.
-                Defaults to values of `compare`.
-            added (bool): Include records that exist in this chain but not in the other.
-            deleted (bool): Include records that exist only in the other chain.
-            modified (bool): Include records that exist in both
-                but have different values.
-            same (bool): Include records that are identical in both chains.
-            status_col (str): Name for the status column showing differences.
-
-        Default behavior: By default, shows added, deleted, and modified records,
-        but excludes unchanged records (same=False). Status column is not created.
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
 
         Example:
             ```py
-            res = persons.diff(
+            res = persons.compare(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
@@ -1577,7 +1505,7 @@
             status_col=status_col,
         )
 
-    def file_diff(
+    def diff(
         self,
         other: "DataChain",
         on: str = "file",
@@ -1588,29 +1516,31 @@
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """Calculate differences between two chains containing files.
-
-        This method is specifically designed for file chains. It uses file `source`
-        and `path` to match files, and file `version` and `etag` to detect changes.
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
 
         Parameters:
-            other: Chain to compare against.
-            on: File column name in this chain. Default is "file".
-            right_on: File column name in the other chain. Defaults to `on`.
-            added (bool): Include files that exist in this chain but not in the other.
-            deleted (bool): Include files that exist only in the other chain.
-            modified (bool): Include files that exist in both but have different
-                versions/etags.
-            same (bool): Include files that are identical in both chains.
-            status_col (str): Name for the status column showing differences
-                (A=added, D=deleted, M=modified, S=same).
-
-        Default behavior: By default, includes only new files (added=True and
-        modified=True). This is useful for incremental processing.
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
 
         Example:
             ```py
-            diff = images.file_diff(
+            diff = images.diff(
                 new_images,
                 on="file",
                 right_on="other_file",
@@ -1635,7 +1565,7 @@
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)
 
-        return self.diff(
+        return self.compare(
             other,
             on_cols,
             right_on=right_on_cols,
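
Net effect of the two renames: the generic row-level comparison is now `compare()`, and `diff()` becomes the file-aware variant that 0.20.4 called `file_diff()`. A hedged migration sketch, reusing the chains from the docstring examples above:

```py
# 0.20.4: generic comparison was `diff`, file-aware comparison was `file_diff`.
persons.diff(new_persons, on=["id"], right_on=["other_id"])
images.file_diff(new_images, on="file", right_on="other_file")

# 0.21.0: the same calls become `compare` and `diff`, respectively.
persons.compare(new_persons, on=["id"], right_on=["other_id"])
images.diff(new_images, on="file", right_on="other_file")
```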
@@ -2047,7 +1977,7 @@
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self._leaf_values()
+        results_iter = self.collect_flatten()
 
         with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2099,7 +2029,7 @@
         if include_outer_list:
             # This makes the file JSON instead of JSON lines.
             f.write(b"[\n")
-        for row in self._leaf_values():
+        for row in self.collect_flatten():
             if not is_first:
                 if include_outer_list:
                     # This makes the file JSON instead of JSON lines.
@@ -2264,7 +2194,7 @@
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.to_values(signal), progress_bar)
+        file_exporter.run(self.collect(signal), progress_bar)
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
@@ -2309,45 +2239,16 @@
 
         Combining filters with "or"
         ```py
-        dc.filter(
-            C("file.path").glob("cat*") |
-            C("file.path").glob("dog*")
-        )
-        ```
-
-        ```py
-        dc.filter(dc.func.or_(
-            C("file.path").glob("cat*"),
-            C("file.path").glob("dog*")
-        ))
+        dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*"))
         ```
 
         Combining filters with "and"
         ```py
         dc.filter(
-            C("file.path").glob("*.jpg"),
-            string.length(C("file.path")) > 5
-        )
-        ```
-
-        ```py
-        dc.filter(
-            C("file.path").glob("*.jpg") &
+            C("file.path").glob("*.jpg") &
             (string.length(C("file.path")) > 5)
         )
         ```
-
-        ```py
-        dc.filter(dc.func.and_(
-            C("file.path").glob("*.jpg"),
-            string.length(C("file.path")) > 5
-        ))
-        ```
-
-        Combining filters with "not"
-        ```py
-        dc.filter(~(C("file.path").glob("*.jpg")))
-        ```
         """
         return self._evolve(query=self._query.filter(*args))
@@ -2398,72 +2299,3 @@
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
-
-    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
-        """Returns a list of rows of values, optionally limited to the specified
-        columns.
-
-        Args:
-            *cols: Limit to the specified columns. By default, all columns are selected.
-
-        Returns:
-            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
-
-        Example:
-            Getting all rows as a list:
-            ```py
-            rows = dc.to_list()
-            print(rows)
-            ```
-
-            Getting all rows with selected columns as a list:
-            ```py
-            name_size_pairs = dc.to_list("file.path", "file.size")
-            print(name_size_pairs)
-            ```
-
-            Getting a single column as a list:
-            ```py
-            files = dc.to_list("file.path")
-            print(files)  # Returns list of 1-tuples
-            ```
-        """
-        return list(self.to_iter(*cols))
-
-    def to_values(self, col: str) -> list[DataValue]:
-        """Returns a flat list of values from a single column.
-
-        Args:
-            col: The name of the column to extract values from.
-
-        Returns:
-            list[DataValue]: Returns a flat list of values from the specified column.
-
-        Example:
-            Getting all values from a single column:
-            ```py
-            file_paths = dc.to_values("file.path")
-            print(file_paths)  # Returns list of strings
-            ```
-
-            Getting all file sizes:
-            ```py
-            sizes = dc.to_values("file.size")
-            print(sizes)  # Returns list of integers
-            ```
-        """
-        return [row[0] for row in self.to_list(col)]
-
-    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
-        """Make DataChain objects iterable.
-
-        Yields:
-            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
-
-        Example:
-            ```py
-            for row in chain:
-                print(row)
-            ```
-        """
-        return self.to_iter()
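
With `to_list`, `to_values`, and `__iter__` removed, 0.21.0 code reaches the same results through `collect`. A sketch of the equivalents, assuming the `collect` semantics shown earlier in this diff (`chain` is hypothetical):

```py
# 0.20.4: rows = chain.to_list("file.path", "file.size")
rows = list(chain.collect("file.path", "file.size"))

# 0.20.4: paths = chain.to_values("file.path")
paths = list(chain.collect("file.path"))

# 0.20.4: `for row in chain:` worked directly; the chain itself is no
# longer iterable, so iterate over collect() instead.
for row in chain.collect():
    print(row)
```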