datachain 0.20.3__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)
  1. datachain/__init__.py +0 -2
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +65 -180
  4. datachain/cli/__init__.py +7 -0
  5. datachain/cli/commands/datasets.py +28 -43
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +35 -1
  8. datachain/client/fsspec.py +3 -5
  9. datachain/client/hf.py +0 -10
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +37 -403
  12. datachain/data_storage/sqlite.py +7 -139
  13. datachain/data_storage/warehouse.py +7 -26
  14. datachain/dataset.py +12 -126
  15. datachain/delta.py +7 -11
  16. datachain/error.py +0 -36
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +0 -4
  20. datachain/lib/dc/datachain.py +92 -259
  21. datachain/lib/dc/datasets.py +49 -87
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +0 -1
  24. datachain/lib/dc/storage.py +40 -38
  25. datachain/lib/file.py +23 -77
  26. datachain/lib/listing.py +1 -3
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/pytorch.py +1 -1
  29. datachain/lib/settings.py +0 -10
  30. datachain/lib/tar.py +2 -1
  31. datachain/lib/udf_signature.py +1 -1
  32. datachain/lib/webdataset.py +20 -30
  33. datachain/listing.py +1 -3
  34. datachain/query/dataset.py +46 -71
  35. datachain/query/session.py +1 -1
  36. datachain/remote/studio.py +26 -61
  37. datachain/studio.py +7 -23
  38. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
  39. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
  40. datachain/lib/namespaces.py +0 -71
  41. datachain/lib/projects.py +0 -86
  42. datachain/namespace.py +0 -65
  43. datachain/project.py +0 -78
  44. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
  45. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
  47. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/func/func.py CHANGED
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
 ColT = Union[str, Column, ColumnElement, "Func", tuple]
 
 
-class Func(Function):  # noqa: PLW1641
+class Func(Function):
     """Represents a function to be applied to a column in a SQL query."""
 
     def __init__(
datachain/lib/arrow.py CHANGED
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
             fs_path = file.path
             fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-            fs, fs_path = file.get_fs(), file.get_fs_path()
+            fs, fs_path = file.get_fs(), file.get_path()
 
         kwargs = self.kwargs
         if format := kwargs.get("format"):
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
     kwargs["format"] = fix_pyarrow_format(format, parse_options)
 
     schemas = []
-    for (file,) in chain.to_iter("file"):
-        ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+    for file in chain.collect("file"):
+        ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
         schemas.append(ds.schema)
     if not schemas:
         raise ValueError(
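
The `infer_schema` change above is the pattern repeated throughout this release: `to_iter("file")` in 0.20.3 yields 1-tuples that must be unpacked, while `collect("file")` in 0.21.0 yields the values directly. A minimal, hedged sketch of the caller-facing difference (the dataset name is hypothetical):

```py
import datachain as dc

chain = dc.read_dataset("parquet_files")  # hypothetical saved dataset of File objects

# 0.20.3: a single selected column still arrives as a 1-tuple
# for (file,) in chain.to_iter("file"):
#     print(file.get_fs_path())

# 0.21.0: a single selected column arrives unwrapped,
# and get_fs_path() is renamed to get_path()
for file in chain.collect("file"):
    print(file.get_path())
```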
datachain/lib/dataset_info.py CHANGED
@@ -22,8 +22,6 @@ if TYPE_CHECKING:
 
 class DatasetInfo(DataModel):
     name: str
-    namespace: str
-    project: str
     uuid: str = Field(default=str(uuid4()))
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
@@ -93,8 +91,6 @@ class DatasetInfo(DataModel):
         return cls(
             uuid=version.uuid,
             name=dataset.name,
-            namespace=dataset.project.namespace.name,
-            project=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
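
Since `DatasetInfo` loses its `namespace` and `project` fields, code that inspected them has to fall back to the remaining fields. A hedged sketch, assuming `dc.datasets()` still yields `DatasetInfo` records under the `dataset` signal as in 0.20.3:

```py
import datachain as dc

# assumes at least one dataset has been saved locally
for info in dc.datasets().collect("dataset"):
    # info.namespace / info.project existed in 0.20.3; per this diff
    # they are removed in 0.21.0, leaving name/version/status/etc.
    print(info.name, info.version)
```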
datachain/lib/dc/datachain.py CHANGED
@@ -24,9 +24,8 @@ from pydantic import BaseModel
 from tqdm import tqdm
 
 from datachain import semver
-from datachain.dataset import DatasetRecord, parse_dataset_name
+from datachain.dataset import DatasetRecord
 from datachain.delta import delta_disabled
-from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -38,7 +37,6 @@ from datachain.lib.file import (
     FileExporter,
 )
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
@@ -263,7 +261,7 @@ class DataChain:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
-        return self.session.catalog.get_dataset(self.name, self._query.project)
+        return self.session.catalog.get_dataset(self.name)
 
     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
@@ -314,8 +312,6 @@ class DataChain:
         min_task_size=None,
         prefetch: Optional[int] = None,
         sys: Optional[bool] = None,
-        namespace: Optional[str] = None,
-        project: Optional[str] = None,
     ) -> "Self":
         """Change settings for chain.
 
@@ -331,8 +327,6 @@ class DataChain:
             prefetch: number of workers to use for downloading files in advance.
                 This is enabled by default and uses 2 workers.
                 To disable prefetching, set it to 0.
-            namespace: namespace name.
-            project: project name.
 
         Example:
             ```py
@@ -346,11 +340,7 @@ class DataChain:
         if sys is None:
             sys = self._sys
         settings = copy.copy(self._settings)
-        settings.add(
-            Settings(
-                cache, parallel, workers, min_task_size, prefetch, namespace, project
-            )
-        )
+        settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
         return self._evolve(settings=settings, _sys=sys)
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
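
The practical effect for callers: the `namespace=` and `project=` keyword arguments to `.settings()` disappear in 0.21.0. A minimal sketch of the surviving options (the dataset name is hypothetical):

```py
import datachain as dc

chain = dc.read_dataset("my_dataset").settings(  # hypothetical dataset name
    cache=True,   # keep downloaded files in the local cache
    parallel=4,   # run UDFs in 4 parallel workers
    prefetch=2,   # download 2 files ahead of processing
)
# 0.20.3 also accepted namespace="..." and project="..." here;
# per this diff they are gone in 0.21.0 and would raise TypeError.
```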
@@ -440,10 +430,10 @@ class DataChain:
 
         from datachain.lib.arrow import schema_to_output
 
-        json_values = self.limit(schema_sample_size).to_list(col)
+        json_values = list(self.limit(schema_sample_size).collect(col))
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for (json_value,) in json_values
+            for json_value in json_values
         ]
 
         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -500,22 +490,6 @@ class DataChain:
             )
         return listings(*args, **kwargs)
 
-    @property
-    def namespace_name(self) -> str:
-        """Current namespace name in which the chain is running"""
-        return (
-            self._settings.namespace
-            or self.session.catalog.metastore.default_namespace_name
-        )
-
-    @property
-    def project_name(self) -> str:
-        """Current project name in which the chain is running"""
-        return (
-            self._settings.project
-            or self.session.catalog.metastore.default_project_name
-        )
-
     def persist(self) -> "Self":
         """Saves temporary chain that will be removed after the process ends.
         Temporary datasets are useful for optimization, for example when we have
@@ -525,12 +499,7 @@ class DataChain:
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        project = get_project(
-            self.project_name, self.namespace_name, session=self.session
-        )
-        return self._evolve(
-            query=self._query.save(project=project, feature_schema=schema)
-        )
+        return self._evolve(query=self._query.save(feature_schema=schema))
 
     def save(  # type: ignore[override]
         self,
@@ -544,10 +513,7 @@
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:
-            name : dataset name. It can be full name consisting of namespace and
-                project, but it can also be just a regular dataset name in which
-                case we are taking namespace and project from settings, if they
-                are defined there, or default ones instead.
+            name : dataset name.
             version : version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
@@ -569,29 +535,6 @@
                 " patch"
             )
 
-        namespace_name, project_name, name = parse_dataset_name(name)
-
-        namespace_name = (
-            namespace_name
-            or self._settings.namespace
-            or self.session.catalog.metastore.default_namespace_name
-        )
-        project_name = (
-            project_name
-            or self._settings.project
-            or self.session.catalog.metastore.default_project_name
-        )
-
-        try:
-            project = self.session.catalog.metastore.get_project(
-                project_name,
-                namespace_name,
-                create=self.session.catalog.metastore.project_allowed_to_create,
-            )
-        except ProjectNotFoundError as e:
-            # not being able to create it as creation is not allowed
-            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
-
         schema = self.signals_schema.clone_without_sys_signals().serialize()
 
         # Handle retry and delta functionality
@@ -615,7 +558,6 @@
                 query=result_ds._query.save(
                     name=name,
                     version=version,
-                    project=project,
                     feature_schema=schema,
                     dependencies=dependencies,
                     **kwargs,
@@ -635,7 +577,6 @@
             query=self._query.save(
                 name=name,
                 version=version,
-                project=project,
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
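
Taken together, the `save()` hunks mean a 0.21.0 dataset name is just a plain name: the namespace/project resolution and the `ProjectNotFoundError` fallback are gone. A hedged sketch (names and values are hypothetical):

```py
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])  # small in-memory chain

# 0.20.3 accepted "namespace.project.name" and resolved the parts;
# per this diff, 0.21.0 stores the name as given.
chain.save("my_dataset", version="1.0.0", description="toy example")
```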
@@ -902,7 +843,7 @@
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `to_list` and similar methods
+        Using `order_by` directly before `limit`, `collect` and `collect_flatten`
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
@@ -1107,32 +1048,32 @@
 
     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like to_list, to_pandas, etc."""
+        """Effective schema used for user-facing API like collect, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema
 
     @overload
-    def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...
+    def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
+    def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def _leaf_values(
+    def collect_flatten(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...
 
     @overload
-    def _leaf_values(
+    def collect_flatten(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...
 
-    def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
+    def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.
 
         Args:
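
The rename from the private `_leaf_values` to a public `collect_flatten` also exposes the `row_factory` hook. A hedged sketch of how it composes, mirroring the `to_records()` implementation shown elsewhere in this diff:

```py
import datachain as dc

chain = dc.read_values(name=["a", "b"], size=[1, 2])

# Flattened tuples of leaf values, one per row
for row in chain.collect_flatten():
    print(row)  # e.g. ("a", 1)

# row_factory receives the column names and the flat row tuple
rows = list(
    chain.collect_flatten(row_factory=lambda cols, row: dict(zip(cols, row)))
)
print(rows)  # e.g. [{"name": "a", "size": 1}, {"name": "b", "size": 2}]
```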
@@ -1160,7 +1101,7 @@
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self._leaf_values()
+        results_iter = self.collect_flatten()
 
         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1193,9 +1134,9 @@
 
     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self._leaf_values(include_hidden=include_hidden))
+            return list(self.collect_flatten(include_hidden=include_hidden))
         return list(
-            self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
+            self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
         )
 
     def to_records(self) -> list[dict[str, Any]]:
@@ -1206,38 +1147,42 @@
 
         return self.results(row_factory=to_dict)
 
-    def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
         """Yields rows of values, optionally limited to the specified columns.
 
         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.
 
         Yields:
-            (tuple[DataType, ...]): Yields a tuple of items for each row.
+            (DataType): Yields a single item if a column is selected.
+            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
+                selected.
 
         Example:
             Iterating over all rows:
             ```py
-            for row in ds.to_iter():
-                print(row)
-            ```
-
-            DataChain is iterable and can be used in a for loop directly which is
-            equivalent to `ds.to_iter()`:
-            ```py
-            for row in ds:
+            for row in dc.collect():
                 print(row)
             ```
 
             Iterating over all rows with selected columns:
             ```py
-            for name, size in ds.to_iter("file.path", "file.size"):
+            for name, size in dc.collect("file.path", "file.size"):
                 print(name, size)
             ```
 
             Iterating over a single column:
             ```py
-            for (file,) in ds.to_iter("file.path"):
+            for file in dc.collect("file.path"):
                 print(file)
             ```
         """
@@ -1249,31 +1194,7 @@
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
            )
-            yield tuple(ret)
-
-    @overload
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
-        """
-        Deprecated. Use `to_iter` method instead.
-        """
-        warnings.warn(
-            "Method `collect` is deprecated. Use `to_iter` method instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        if len(cols) == 1:
-            yield from [item[0] for item in self.to_iter(*cols)]
-        else:
-            yield from self.to_iter(*cols)
+            yield ret[0] if len(cols) == 1 else tuple(ret)
 
     def to_pytorch(
         self,
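
Note the single-column unwrapping now happens inside `collect()` itself (`yield ret[0] if len(cols) == 1 else tuple(ret)`) instead of in a deprecated shim. A quick sketch of the resulting semantics:

```py
import datachain as dc

chain = dc.read_values(name=["a", "b"], size=[1, 2])

list(chain.collect("name"))          # ["a", "b"]            -- bare values
list(chain.collect("name", "size"))  # [("a", 1), ("b", 2)]  -- tuples
list(chain.collect())                # all columns, as tuples
```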
@@ -1508,7 +1429,7 @@
             )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
-    def diff(
+    def compare(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
@@ -1521,33 +1442,41 @@
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """Calculate differences between two chains.
-
-        This method identifies records that are added, deleted, modified, or unchanged
-        between two chains. It adds a status column with values: A=added, D=deleted,
-        M=modified, S=same.
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
 
         Parameters:
-            other: Chain to compare against.
-            on: Column(s) to match records between chains.
-            right_on: Column(s) in the other chain to match against. Defaults to `on`.
-            compare: Column(s) to check for changes.
-                If not specified,all columns are used.
-            right_compare: Column(s) in the other chain to compare against.
-                Defaults to values of `compare`.
-            added (bool): Include records that exist in this chain but not in the other.
-            deleted (bool): Include records that exist only in the other chain.
-            modified (bool): Include records that exist in both
-                but have different values.
-            same (bool): Include records that are identical in both chains.
-            status_col (str): Name for the status column showing differences.
-
-        Default behavior: By default, shows added, deleted, and modified records,
-        but excludes unchanged records (same=False). Status column is not created.
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
 
         Example:
             ```py
-            res = persons.diff(
+            res = persons.compare(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
@@ -1576,7 +1505,7 @@
             status_col=status_col,
         )
 
-    def file_diff(
+    def diff(
         self,
         other: "DataChain",
         on: str = "file",
@@ -1587,29 +1516,31 @@
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """Calculate differences between two chains containing files.
-
-        This method is specifically designed for file chains. It uses file `source`
-        and `path` to match files, and file `version` and `etag` to detect changes.
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
 
         Parameters:
-            other: Chain to compare against.
-            on: File column name in this chain. Default is "file".
-            right_on: File column name in the other chain. Defaults to `on`.
-            added (bool): Include files that exist in this chain but not in the other.
-            deleted (bool): Include files that exist only in the other chain.
-            modified (bool): Include files that exist in both but have different
-                versions/etags.
-            same (bool): Include files that are identical in both chains.
-            status_col (str): Name for the status column showing differences
-                (A=added, D=deleted, M=modified, S=same).
-
-        Default behavior: By default, includes only new files (added=True and
-        modified=True). This is useful for incremental processing.
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
 
         Example:
             ```py
-            diff = images.file_diff(
+            diff = images.diff(
                 new_images,
                 on="file",
                 right_on="other_file",
@@ -1634,7 +1565,7 @@
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)
 
-        return self.diff(
+        return self.compare(
             other,
             on_cols,
             right_on=right_on_cols,
2046
1977
  headers, _ = self._effective_signals_schema.get_headers_with_length()
2047
1978
  column_names = [".".join(filter(None, header)) for header in headers]
2048
1979
 
2049
- results_iter = self._leaf_values()
1980
+ results_iter = self.collect_flatten()
2050
1981
 
2051
1982
  with opener(path, "w", newline="") as f:
2052
1983
  writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2098,7 +2029,7 @@ class DataChain:
2098
2029
  if include_outer_list:
2099
2030
  # This makes the file JSON instead of JSON lines.
2100
2031
  f.write(b"[\n")
2101
- for row in self._leaf_values():
2032
+ for row in self.collect_flatten():
2102
2033
  if not is_first:
2103
2034
  if include_outer_list:
2104
2035
  # This makes the file JSON instead of JSON lines.
@@ -2263,7 +2194,7 @@ class DataChain:
2263
2194
  max_threads=num_threads or 1,
2264
2195
  client_config=client_config,
2265
2196
  )
2266
- file_exporter.run(self.to_values(signal), progress_bar)
2197
+ file_exporter.run(self.collect(signal), progress_bar)
2267
2198
 
2268
2199
  def shuffle(self) -> "Self":
2269
2200
  """Shuffle the rows of the chain deterministically."""
@@ -2308,45 +2239,16 @@
 
         Combining filters with "or"
         ```py
-        dc.filter(
-            C("file.path").glob("cat*") |
-            C("file.path").glob("dog*")
-        )
-        ```
-
-        ```py
-        dc.filter(dc.func.or_(
-            C("file.path").glob("cat*"),
-            C("file.path").glob("dog*")
-        ))
+        dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*))
         ```
 
         Combining filters with "and"
         ```py
         dc.filter(
-            C("file.path").glob("*.jpg"),
-            string.length(C("file.path")) > 5
-        )
-        ```
-
-        ```py
-        dc.filter(
-            C("file.path").glob("*.jpg") &
+            C("file.path").glob("*.jpg) &
             (string.length(C("file.path")) > 5)
         )
         ```
-
-        ```py
-        dc.filter(dc.func.and_(
-            C("file.path").glob("*.jpg"),
-            string.length(C("file.path")) > 5
-        ))
-        ```
-
-        Combining filters with "not"
-        ```py
-        dc.filter(~(C("file.path").glob("*.jpg")))
-        ```
         """
         return self._evolve(query=self._query.filter(*args))
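
Two of the surviving docstring examples above carry unbalanced quotes from the released source (`glob("dog*)` and `glob("*.jpg)`). For reference, a corrected, hedged sketch of the same filters (the bucket path is hypothetical):

```py
import datachain as dc
from datachain import C
from datachain.func import string

chain = dc.read_storage("s3://my-bucket/images/")  # hypothetical bucket

# "or": either glob matches (note the closing quotes, which the
# released docstring examples above are missing)
cats_or_dogs = chain.filter(
    C("file.path").glob("cat*") | C("file.path").glob("dog*")
)

# "and": both conditions hold
long_jpgs = chain.filter(
    C("file.path").glob("*.jpg") & (string.length(C("file.path")) > 5)
)
```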
@@ -2397,72 +2299,3 @@
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
-
-    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
-        """Returns a list of rows of values, optionally limited to the specified
-        columns.
-
-        Args:
-            *cols: Limit to the specified columns. By default, all columns are selected.
-
-        Returns:
-            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
-
-        Example:
-            Getting all rows as a list:
-            ```py
-            rows = dc.to_list()
-            print(rows)
-            ```
-
-            Getting all rows with selected columns as a list:
-            ```py
-            name_size_pairs = dc.to_list("file.path", "file.size")
-            print(name_size_pairs)
-            ```
-
-            Getting a single column as a list:
-            ```py
-            files = dc.to_list("file.path")
-            print(files)  # Returns list of 1-tuples
-            ```
-        """
-        return list(self.to_iter(*cols))
-
-    def to_values(self, col: str) -> list[DataValue]:
-        """Returns a flat list of values from a single column.
-
-        Args:
-            col: The name of the column to extract values from.
-
-        Returns:
-            list[DataValue]: Returns a flat list of values from the specified column.
-
-        Example:
-            Getting all values from a single column:
-            ```py
-            file_paths = dc.to_values("file.path")
-            print(file_paths)  # Returns list of strings
-            ```
-
-            Getting all file sizes:
-            ```py
-            sizes = dc.to_values("file.size")
-            print(sizes)  # Returns list of integers
-            ```
-        """
-        return [row[0] for row in self.to_list(col)]
-
-    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
-        """Make DataChain objects iterable.
-
-        Yields:
-            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
-
-        Example:
-            ```py
-            for row in chain:
-                print(row)
-            ```
-        """
-        return self.to_iter()
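
With `to_list()`, `to_values()` and `__iter__()` removed, 0.21.0 callers rebuild the same results from `collect()`. A hedged migration sketch:

```py
import datachain as dc

chain = dc.read_values(path=["a.jpg", "b.jpg"], size=[10, 20])

# 0.20.3                                    -> 0.21.0 (per this diff)
rows = list(chain.collect())                 # was: chain.to_list() / iter(chain)
sizes = list(chain.collect("size"))          # was: chain.to_values("size")
pairs = list(chain.collect("path", "size"))  # was: chain.to_list("path", "size")
```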