datachain 0.20.2__py3-none-any.whl → 0.20.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Note: the registry flags this version of datachain as potentially problematic.
- datachain/__init__.py +2 -3
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +3 -3
- datachain/cli/commands/ls.py +2 -2
- datachain/client/fsspec.py +5 -3
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +22 -7
- datachain/data_storage/sqlite.py +1 -4
- datachain/dataset.py +4 -3
- datachain/delta.py +2 -2
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -4
- datachain/lib/dc/datachain.py +178 -89
- datachain/lib/dc/datasets.py +46 -42
- datachain/lib/dc/storage.py +24 -38
- datachain/lib/file.py +77 -23
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +16 -18
- datachain/lib/projects.py +26 -26
- datachain/lib/pytorch.py +1 -1
- datachain/lib/tar.py +1 -2
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/namespace.py +3 -3
- datachain/project.py +5 -5
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/METADATA +1 -1
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/RECORD +32 -32
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/WHEEL +0 -0
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datachain.py
CHANGED
@@ -26,6 +26,7 @@ from tqdm import tqdm
 from datachain import semver
 from datachain.dataset import DatasetRecord, parse_dataset_name
 from datachain.delta import delta_disabled
+from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -37,7 +38,6 @@ from datachain.lib.file import (
     FileExporter,
 )
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
@@ -439,10 +439,10 @@ class DataChain:
 
         from datachain.lib.arrow import schema_to_output
 
-        json_values =
+        json_values = self.limit(schema_sample_size).to_list(col)
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for json_value in json_values
+            for (json_value,) in json_values
         ]
 
         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -524,8 +524,10 @@ class DataChain:
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        project = get_project(
-            self.project_name,
+        project = self.session.catalog.metastore.get_project(
+            self.project_name,
+            self.namespace_name,
+            create=True,
         )
         return self._evolve(
             query=self._query.save(project=project, feature_schema=schema)
@@ -581,7 +583,15 @@ class DataChain:
             or self.session.catalog.metastore.default_project_name
         )
 
-
+        try:
+            project = self.session.catalog.metastore.get_project(
+                project_name,
+                namespace_name,
+                create=self.session.catalog.metastore.project_allowed_to_create,
+            )
+        except ProjectNotFoundError as e:
+            # not being able to create it as creation is not allowed
+            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
 
         schema = self.signals_schema.clone_without_sys_signals().serialize()
 
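The two hunks above route saving through the metastore's `get_project`, which can create the project on the fly or refuse with `ProjectCreateNotAllowedError`. A minimal sketch of how this surfaces to a caller; the `read_values` helper and the fully qualified `namespace.project.name` form are assumptions for illustration, not part of this diff:

```py
import datachain as dc
from datachain.error import ProjectCreateNotAllowedError

chain = dc.read_values(num=[1, 2, 3])

try:
    # Saving under a fully qualified name resolves (and, where allowed,
    # creates) the namespace/project through the metastore.
    chain.save("dev.analytics.numbers")
except ProjectCreateNotAllowedError:
    # Raised when the metastore forbids implicit project creation.
    print("project creation is not allowed on this deployment")
```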
@@ -893,7 +903,7 @@ class DataChain:
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `
+        Using `order_by` directly before `limit`, `to_list` and similar methods
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
@@ -1098,32 +1108,32 @@ class DataChain:
 
     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like
+        """Effective schema used for user-facing API like to_list, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema
 
     @overload
-    def
+    def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def
+    def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def
+    def _leaf_values(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...
 
     @overload
-    def
+    def _leaf_values(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...
 
-    def
+    def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.
 
         Args:
@@ -1151,7 +1161,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.
+        results_iter = self._leaf_values()
 
         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1184,9 +1194,9 @@ class DataChain:
 
     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self.
+            return list(self._leaf_values(include_hidden=include_hidden))
         return list(
-            self.
+            self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
         )
 
     def to_records(self) -> list[dict[str, Any]]:
@@ -1197,42 +1207,38 @@ class DataChain:
 
         return self.results(row_factory=to_dict)
 
-
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+    def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
         """Yields rows of values, optionally limited to the specified columns.
 
         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.
 
         Yields:
-            (DataType): Yields a
-            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
-                selected.
+            (tuple[DataType, ...]): Yields a tuple of items for each row.
 
         Example:
             Iterating over all rows:
             ```py
-            for row in
+            for row in ds.to_iter():
+                print(row)
+            ```
+
+            DataChain is iterable and can be used in a for loop directly which is
+            equivalent to `ds.to_iter()`:
+            ```py
+            for row in ds:
                 print(row)
             ```
 
            Iterating over all rows with selected columns:
            ```py
-            for name, size in
+            for name, size in ds.to_iter("file.path", "file.size"):
                print(name, size)
            ```
 
            Iterating over a single column:
            ```py
-            for file in
+            for (file,) in ds.to_iter("file.path"):
                print(file)
            ```
        """
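The renamed `to_iter` always yields tuples, even when a single column is selected, hence the `(file,)` unpacking in the docstring above. A short sketch; `read_values` and the column names are assumptions used only for illustration:

```py
import datachain as dc

chain = dc.read_values(path=["a.txt", "b.txt"], size=[10, 20])

# Multiple columns: each row is a tuple in the order requested.
for path, size in chain.to_iter("path", "size"):
    print(path, size)

# Single column: rows are still 1-tuples, hence the parentheses.
for (path,) in chain.to_iter("path"):
    print(path)
```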
@@ -1244,7 +1250,31 @@ class DataChain:
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
             )
-            yield
+            yield tuple(ret)
+
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+        """
+        Deprecated. Use `to_iter` method instead.
+        """
+        warnings.warn(
+            "Method `collect` is deprecated. Use `to_iter` method instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        if len(cols) == 1:
+            yield from [item[0] for item in self.to_iter(*cols)]
+        else:
+            yield from self.to_iter(*cols)
 
     def to_pytorch(
         self,
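The shim keeps `collect` working but emits a `DeprecationWarning`; note it still yields bare values for a single column, while `to_iter` yields 1-tuples. A sketch of the migration, with `read_values` assumed for illustration:

```py
import warnings

import datachain as dc

chain = dc.read_values(num=[1, 2, 3])

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old_style = list(chain.collect("num"))  # bare values, plus a warning

assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Equivalent going forward; unpack the 1-tuples explicitly.
new_style = [num for (num,) in chain.to_iter("num")]
assert old_style == new_style
```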
@@ -1479,7 +1509,7 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
-    def
+    def diff(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
@@ -1492,41 +1522,33 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
-
-        for all rows. Beside additional diff column, new chain has schema of the chain
-        on which method was called.
+        """Calculate differences between two chains.
+
+        This method identifies records that are added, deleted, modified, or unchanged
+        between two chains. It adds a status column with values: A=added, D=deleted,
+        M=modified, S=same.
 
         Parameters:
-            other: Chain to
-            on: Column
-
-
-
-
-
-
-
-
-
-
-
-
-
-            added (bool): Whether to return added rows in resulting chain.
-            deleted (bool): Whether to return deleted rows in resulting chain.
-            modified (bool): Whether to return modified rows in resulting chain.
-            same (bool): Whether to return unchanged rows in resulting chain.
-            status_col (str): Name of the new column that is created in resulting chain
-                representing diff status.
+            other: Chain to compare against.
+            on: Column(s) to match records between chains.
+            right_on: Column(s) in the other chain to match against. Defaults to `on`.
+            compare: Column(s) to check for changes.
+                If not specified,all columns are used.
+            right_compare: Column(s) in the other chain to compare against.
+                Defaults to values of `compare`.
+            added (bool): Include records that exist in this chain but not in the other.
+            deleted (bool): Include records that exist only in the other chain.
+            modified (bool): Include records that exist in both
+                but have different values.
+            same (bool): Include records that are identical in both chains.
+            status_col (str): Name for the status column showing differences.
+
+        Default behavior: By default, shows added, deleted, and modified records,
+        but excludes unchanged records (same=False). Status column is not created.
 
         Example:
             ```py
-            res = persons.
+            res = persons.diff(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
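A sketch of the renamed `diff` based on the docstring above; the `read_values` chains and column values are illustrative assumptions:

```py
import datachain as dc

old = dc.read_values(id=[1, 2, 3], name=["a", "b", "c"])
new = dc.read_values(id=[2, 3, 4], name=["b", "B", "d"])

# Added, deleted, and modified records (the defaults), plus a status column.
changes = new.diff(old, on="id", status_col="diff")
for id_, name, status in changes.to_iter("id", "name", "diff"):
    print(id_, name, status)  # status is one of A, D, M
```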
@@ -1555,7 +1577,7 @@ class DataChain:
             status_col=status_col,
         )
 
-    def
+    def file_diff(
         self,
         other: "DataChain",
         on: str = "file",
@@ -1566,31 +1588,29 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
-        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+        """Calculate differences between two chains containing files.
+
+        This method is specifically designed for file chains. It uses file `source`
+        and `path` to match files, and file `version` and `etag` to detect changes.
 
         Parameters:
-            other: Chain to
-            on: File
-
-
-
-
-
-
-
-
-
-
-                resulting chain representing diff status.
+            other: Chain to compare against.
+            on: File column name in this chain. Default is "file".
+            right_on: File column name in the other chain. Defaults to `on`.
+            added (bool): Include files that exist in this chain but not in the other.
+            deleted (bool): Include files that exist only in the other chain.
+            modified (bool): Include files that exist in both but have different
+                versions/etags.
+            same (bool): Include files that are identical in both chains.
+            status_col (str): Name for the status column showing differences
+                (A=added, D=deleted, M=modified, S=same).
+
+        Default behavior: By default, includes only new files (added=True and
+        modified=True). This is useful for incremental processing.
 
         Example:
             ```py
-            diff = images.
+            diff = images.file_diff(
                 new_images,
                 on="file",
                 right_on="other_file",
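The defaults of `file_diff` (added and modified files only) make it a natural fit for incremental pipelines. A sketch, assuming `read_storage`, the dataset name, and the bucket path, none of which are part of this diff:

```py
import datachain as dc

processed = dc.read_dataset("images_processed")
incoming = dc.read_storage("s3://bucket/images/")

# Defaults keep only added and modified files, matched on file source/path
# and compared on file version/etag, per the docstring above.
to_process = incoming.file_diff(processed, on="file")
to_process.save("images_batch")
```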
@@ -1615,7 +1635,7 @@ class DataChain:
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)
 
-        return self.
+        return self.diff(
             other,
             on_cols,
             right_on=right_on_cols,
@@ -2027,7 +2047,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.
+        results_iter = self._leaf_values()
 
         with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2079,7 +2099,7 @@ class DataChain:
             if include_outer_list:
                 # This makes the file JSON instead of JSON lines.
                 f.write(b"[\n")
-            for row in self.
+            for row in self._leaf_values():
                 if not is_first:
                     if include_outer_list:
                         # This makes the file JSON instead of JSON lines.
@@ -2244,7 +2264,7 @@ class DataChain:
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.
+        file_exporter.run(self.to_values(signal), progress_bar)
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
@@ -2378,3 +2398,72 @@ class DataChain:
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
+
+    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
+        """Returns a list of rows of values, optionally limited to the specified
+        columns.
+
+        Args:
+            *cols: Limit to the specified columns. By default, all columns are selected.
+
+        Returns:
+            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
+
+        Example:
+            Getting all rows as a list:
+            ```py
+            rows = dc.to_list()
+            print(rows)
+            ```
+
+            Getting all rows with selected columns as a list:
+            ```py
+            name_size_pairs = dc.to_list("file.path", "file.size")
+            print(name_size_pairs)
+            ```
+
+            Getting a single column as a list:
+            ```py
+            files = dc.to_list("file.path")
+            print(files)  # Returns list of 1-tuples
+            ```
+        """
+        return list(self.to_iter(*cols))
+
+    def to_values(self, col: str) -> list[DataValue]:
+        """Returns a flat list of values from a single column.
+
+        Args:
+            col: The name of the column to extract values from.
+
+        Returns:
+            list[DataValue]: Returns a flat list of values from the specified column.
+
+        Example:
+            Getting all values from a single column:
+            ```py
+            file_paths = dc.to_values("file.path")
+            print(file_paths)  # Returns list of strings
+            ```
+
+            Getting all file sizes:
+            ```py
+            sizes = dc.to_values("file.size")
+            print(sizes)  # Returns list of integers
+            ```
+        """
+        return [row[0] for row in self.to_list(col)]
+
+    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
+        """Make DataChain objects iterable.
+
+        Yields:
+            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
+
+        Example:
+            ```py
+            for row in chain:
+                print(row)
+            ```
+        """
+        return self.to_iter()
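The three additions round out the eager-consumption API: `to_list` materializes `to_iter`, `to_values` flattens a single column, and `__iter__` makes the chain directly iterable. A small sketch; `read_values` and the column names are assumptions:

```py
import datachain as dc

chain = dc.read_values(path=["a.txt", "b.txt"], size=[10, 20])

rows = chain.to_list("path", "size")  # [("a.txt", 10), ("b.txt", 20)]
one_col = chain.to_list("path")       # [("a.txt",), ("b.txt",)], 1-tuples
paths = chain.to_values("path")       # ["a.txt", "b.txt"], flattened
all_rows = list(chain)                # same as list(chain.to_iter())
```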
datachain/lib/dc/datasets.py
CHANGED
@@ -2,7 +2,11 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
 
 from datachain.dataset import parse_dataset_name
-from datachain.error import
+from datachain.error import (
+    DatasetNotFoundError,
+    DatasetVersionNotFoundError,
+    ProjectNotFoundError,
+)
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
     File,
@@ -33,7 +37,11 @@ def read_dataset(
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
     delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] =
+    delta_on: Optional[Union[str, Sequence[str]]] = (
+        "file.path",
+        "file.etag",
+        "file.version",
+    ),
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
@@ -53,41 +61,25 @@ def read_dataset(
         settings : Settings to use for the chain.
         fallback_to_studio : Try to pull dataset from Studio if not found locally.
             Default is True.
-        delta: If
-
-
-            dataset
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        delta_result_on: A list of fields in the resulting dataset that correspond
-            to the `delta_on` fields from the source.
-            This is needed to identify rows that have changed in the source but are
-            already present in the current version of the resulting dataset, in order
-            to avoid including outdated versions of those rows in the new dataset.
-            We retain only the latest versions of rows to prevent duplication.
-            There is no need to define this if the `delta_on` fields are present in
-            the final dataset and have not been renamed.
-        delta_compare: A list of fields used to check if the same row has been modified
-            in the new version of the source.
-            If not defined, all fields except those defined in delta_on will be used.
-        delta_retry: Specifies retry behavior for delta processing. If a string,
-            it's the name of a field in the result dataset that indicates an error
-            when not None - records with errors will be reprocessed. If True,
-            records that exist in the source dataset but not in the result dataset
-            will be reprocessed.
+        delta: If True, only process new or changed files instead of reprocessing
+            everything. This saves time by skipping files that were already processed in
+            previous versions. The optimization is working when a new version of the
+            dataset is created.
+            Default is False.
+        delta_on: Field(s) that uniquely identify each record in the source data.
+            Used to detect which records are new or changed.
+            Default is ("file.path", "file.etag", "file.version").
+        delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+            Only needed if you rename the identifying fields during processing.
+            Default is None.
+        delta_compare: Field(s) used to detect if a record has changed.
+            If not specified, all fields except `delta_on` fields are used.
+            Default is None.
+        delta_retry: Controls retry behavior for failed records:
+            - String (field name): Reprocess records where this field is not empty
+              (error mode)
+            - True: Reprocess records missing from the result dataset (missing mode)
+            - None: No retry processing (default)
 
     Example:
         ```py
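A sketch of the rewritten delta parameters in use; the dataset names and the "error" signal are illustrative assumptions, and `delta_on` keeps its documented default:

```py
import datachain as dc

# Re-running this keeps previous results and only processes what changed.
chain = dc.read_dataset(
    "raw_files",
    delta=True,           # skip records already processed in earlier versions
    delta_retry="error",  # also reprocess rows whose "error" field is non-empty
)
chain.save("processed_files")
```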
@@ -148,9 +140,15 @@ def read_dataset(
         # all 2.* dataset versions). If dataset doesn't have any versions where
         # major part is equal to that input, exception is thrown.
         major = int(version)
-
-
-
+        try:
+            ds_project = get_project(project_name, namespace_name, session=session)
+        except ProjectNotFoundError:
+            raise DatasetNotFoundError(
+                f"Dataset {name} not found in namespace {namespace_name} and",
+                f" project {project_name}",
+            ) from None
+
+        dataset = session.catalog.get_dataset(name, ds_project)
         latest_major = dataset.latest_major_version(major)
         if not latest_major:
             raise DatasetVersionNotFoundError(
@@ -228,7 +226,7 @@ def datasets(
         import datachain as dc
 
         chain = dc.datasets(column="dataset")
-        for ds in chain.
+        for ds in chain.to_iter("dataset"):
             print(f"{ds.name}@v{ds.version}")
         ```
     """
@@ -333,7 +331,13 @@ def delete_dataset(
             None, name, namespace_name, project_name, version=version, force=force
         )
 
-
+    try:
+        ds_project = get_project(project_name, namespace_name, session=session)
+    except ProjectNotFoundError:
+        raise DatasetNotFoundError(
+            f"Dataset {name} not found in namespace {namespace_name} and project",
+            f" {project_name}",
+        ) from None
 
     if not force:
         version = version or catalog.get_dataset(name, ds_project).latest_version
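With this change a missing namespace or project surfaces as `DatasetNotFoundError` instead of leaking `ProjectNotFoundError`. A sketch; the keyword names `namespace` and `project` on `delete_dataset` are an assumption for illustration:

```py
import datachain as dc
from datachain.error import DatasetNotFoundError

try:
    dc.delete_dataset("images", namespace="missing_ns", project="missing_proj")
except DatasetNotFoundError:
    # Also raised when the enclosing namespace/project does not exist.
    print("dataset not found")
```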
|