datachain 0.20.1__py3-none-any.whl → 0.20.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +2 -3
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +3 -3
- datachain/cli/commands/ls.py +2 -2
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +19 -6
- datachain/data_storage/sqlite.py +2 -2
- datachain/dataset.py +4 -3
- datachain/delta.py +2 -2
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -4
- datachain/lib/dc/datachain.py +174 -86
- datachain/lib/dc/datasets.py +25 -37
- datachain/lib/dc/storage.py +24 -38
- datachain/lib/file.py +77 -23
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +16 -18
- datachain/lib/projects.py +26 -26
- datachain/lib/pytorch.py +1 -1
- datachain/lib/tar.py +1 -2
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/namespace.py +3 -3
- datachain/project.py +5 -5
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/METADATA +1 -1
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/RECORD +33 -33
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/WHEEL +0 -0
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datachain.py
CHANGED
@@ -26,6 +26,7 @@ from tqdm import tqdm
 from datachain import semver
 from datachain.dataset import DatasetRecord, parse_dataset_name
 from datachain.delta import delta_disabled
+from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -439,10 +440,10 @@ class DataChain:

         from datachain.lib.arrow import schema_to_output

-        json_values =
+        json_values = self.limit(schema_sample_size).to_list(col)
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for json_value in json_values
+            for (json_value,) in json_values
         ]

         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
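The `(json_value,)` unpacking above follows from the new `to_list` contract: every row comes back as a tuple, even when a single column is selected. A minimal sketch of the pattern, assuming `read_values` as shown in the project README (the `meta` column and its values are illustrative, not from the diff):

```py
import datachain as dc

chain = dc.read_values(meta=['{"a": 1}', '{"a": 2}'])

# to_list("meta") returns a list of 1-tuples, e.g. [('{"a": 1}',), ...],
# so each row is unpacked with a one-element tuple pattern.
for (raw,) in chain.to_list("meta"):
    print(raw)
```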
@@ -581,7 +582,15 @@ class DataChain:
             or self.session.catalog.metastore.default_project_name
         )

-
+        try:
+            project = self.session.catalog.metastore.get_project(
+                project_name,
+                namespace_name,
+                create=self.session.catalog.metastore.project_allowed_to_create,
+            )
+        except ProjectNotFoundError as e:
+            # not being able to create it as creation is not allowed
+            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e

         schema = self.signals_schema.clone_without_sys_signals().serialize()

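With this change, saving into a project that does not exist raises `ProjectCreateNotAllowedError` when the metastore disallows implicit project creation. A hedged sketch of handling it, given some existing chain; the fully qualified `namespace.project.name` is a hypothetical example:

```py
from datachain.error import ProjectCreateNotAllowedError

try:
    chain.save("dev.analytics.clean-images")  # hypothetical namespace.project.name
except ProjectCreateNotAllowedError:
    # The project must be created out of band (e.g. by an admin)
    # before the dataset can be saved into it.
    raise
```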
@@ -893,7 +902,7 @@ class DataChain:
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `
+        Using `order_by` directly before `limit`, `to_list` and similar methods
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
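The rewritten note is easiest to see with a toy chain. A sketch, assuming `read_values` and `order_by` behave as documented elsewhere in the library (the `num` column is illustrative):

```py
import datachain as dc

chain = dc.read_values(num=[3, 1, 2])

# order_by placed directly before the consuming step is preserved:
print(chain.order_by("num").to_list("num"))  # [(1,), (2,), (3,)]

# Steps added after order_by may not keep the order, so sort last.
```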
@@ -1098,32 +1107,32 @@ class DataChain:

     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like
+        """Effective schema used for user-facing API like to_list, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema

     @overload
-    def
+    def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...

     @overload
-    def
+    def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...

     @overload
-    def
+    def _leaf_values(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...

     @overload
-    def
+    def _leaf_values(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...

-    def
+    def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.

         Args:
@@ -1151,7 +1160,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]

-        results_iter = self.
+        results_iter = self._leaf_values()

         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1184,9 +1193,9 @@ class DataChain:

     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self.
+            return list(self._leaf_values(include_hidden=include_hidden))
         return list(
-            self.
+            self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
         )

     def to_records(self) -> list[dict[str, Any]]:
@@ -1197,42 +1206,38 @@ class DataChain:

         return self.results(row_factory=to_dict)

-    @overload
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+    def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
         """Yields rows of values, optionally limited to the specified columns.

         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.

         Yields:
-            (DataType): Yields a
-            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
-            selected.
+            (tuple[DataType, ...]): Yields a tuple of items for each row.

         Example:
             Iterating over all rows:
             ```py
-            for row in
+            for row in ds.to_iter():
+                print(row)
+            ```
+
+            DataChain is iterable and can be used in a for loop directly which is
+            equivalent to `ds.to_iter()`:
+            ```py
+            for row in ds:
                 print(row)
             ```

             Iterating over all rows with selected columns:
             ```py
-            for name, size in
+            for name, size in ds.to_iter("file.path", "file.size"):
                 print(name, size)
             ```

             Iterating over a single column:
             ```py
-            for file in
+            for (file,) in ds.to_iter("file.path"):
                 print(file)
             ```
         """
@@ -1244,7 +1249,31 @@ class DataChain:
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
             )
-            yield
+            yield tuple(ret)
+
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+        """
+        Deprecated. Use `to_iter` method instead.
+        """
+        warnings.warn(
+            "Method `collect` is deprecated. Use `to_iter` method instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        if len(cols) == 1:
+            yield from [item[0] for item in self.to_iter(*cols)]
+        else:
+            yield from self.to_iter(*cols)

     def to_pytorch(
         self,
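Taken together, the two hunks above replace `collect` with `to_iter` and keep `collect` as a deprecated shim that warns and delegates. A migration sketch, assuming a chain with the usual `file.*` signals:

```py
# Old (still works, but emits a DeprecationWarning at iteration time):
for path in chain.collect("file.path"):
    print(path)

# New: to_iter always yields tuples, so single columns unpack explicitly,
for (path,) in chain.to_iter("file.path"):
    print(path)

# or use to_values (added in this release) for a flat list of one column:
print(chain.to_values("file.path"))
```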
@@ -1479,7 +1508,7 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]

-    def
+    def diff(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
@@ -1492,41 +1521,33 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
-
-        for all rows. Beside additional diff column, new chain has schema of the chain
-        on which method was called.
+        """Calculate differences between two chains.
+
+        This method identifies records that are added, deleted, modified, or unchanged
+        between two chains. It adds a status column with values: A=added, D=deleted,
+        M=modified, S=same.

         Parameters:
-            other: Chain to
-            on: Column
-
-
-
-
-
-
-
-
-
-
-
-
-
-            added (bool): Whether to return added rows in resulting chain.
-            deleted (bool): Whether to return deleted rows in resulting chain.
-            modified (bool): Whether to return modified rows in resulting chain.
-            same (bool): Whether to return unchanged rows in resulting chain.
-            status_col (str): Name of the new column that is created in resulting chain
-                representing diff status.
+            other: Chain to compare against.
+            on: Column(s) to match records between chains.
+            right_on: Column(s) in the other chain to match against. Defaults to `on`.
+            compare: Column(s) to check for changes.
+                If not specified, all columns are used.
+            right_compare: Column(s) in the other chain to compare against.
+                Defaults to values of `compare`.
+            added (bool): Include records that exist in this chain but not in the other.
+            deleted (bool): Include records that exist only in the other chain.
+            modified (bool): Include records that exist in both
+                but have different values.
+            same (bool): Include records that are identical in both chains.
+            status_col (str): Name for the status column showing differences.
+
+        Default behavior: By default, shows added, deleted, and modified records,
+        but excludes unchanged records (same=False). Status column is not created.

         Example:
             ```py
-            res = persons.
+            res = persons.diff(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
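A hedged end-to-end sketch of the new defaults, using `read_values` to build two tiny chains (chain names, columns, and values are illustrative, not from the diff):

```py
import datachain as dc

old = dc.read_values(id=[1, 2, 3], name=["a", "b", "c"])
new = dc.read_values(id=[1, 2, 4], name=["a", "B", "d"])

# Defaults keep added + deleted + modified rows and drop unchanged ones;
# the status column only appears because status_col is given a name here.
changes = new.diff(old, on="id", status_col="diff")
print(changes.to_list("diff", "id"))  # e.g. [('A', 4), ('M', 2), ('D', 3)]
```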
@@ -1555,7 +1576,7 @@ class DataChain:
             status_col=status_col,
         )

-    def
+    def file_diff(
         self,
         other: "DataChain",
         on: str = "file",
@@ -1566,31 +1587,29 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
-        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+        """Calculate differences between two chains containing files.
+
+        This method is specifically designed for file chains. It uses file `source`
+        and `path` to match files, and file `version` and `etag` to detect changes.

         Parameters:
-            other: Chain to
-            on: File
-
-
-
-
-
-
-
-
-
-
-            resulting chain representing diff status.
+            other: Chain to compare against.
+            on: File column name in this chain. Default is "file".
+            right_on: File column name in the other chain. Defaults to `on`.
+            added (bool): Include files that exist in this chain but not in the other.
+            deleted (bool): Include files that exist only in the other chain.
+            modified (bool): Include files that exist in both but have different
+                versions/etags.
+            same (bool): Include files that are identical in both chains.
+            status_col (str): Name for the status column showing differences
+                (A=added, D=deleted, M=modified, S=same).
+
+        Default behavior: By default, includes only new files (added=True and
+        modified=True). This is useful for incremental processing.

         Example:
             ```py
-            diff = images.
+            diff = images.file_diff(
                 new_images,
                 on="file",
                 right_on="other_file",
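The defaults (`added=True`, `modified=True`) make `file_diff` a natural fit for incremental pipelines. A sketch under those defaults; the bucket URI and dataset name are purely illustrative:

```py
import datachain as dc

all_files = dc.read_storage("s3://my-bucket/images/")  # hypothetical URI
done = dc.read_dataset("images-processed")             # hypothetical dataset

# Keep only files that are new or changed (matched on source/path,
# compared on version/etag) and process just those.
pending = all_files.file_diff(done, on="file")
pending.save("images-processed")  # saves the next dataset version
```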
@@ -1615,7 +1634,7 @@ class DataChain:
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)

-        return self.
+        return self.diff(
             other,
             on_cols,
             right_on=right_on_cols,
@@ -2027,7 +2046,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]

-        results_iter = self.
+        results_iter = self._leaf_values()

         with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2079,7 +2098,7 @@ class DataChain:
             if include_outer_list:
                 # This makes the file JSON instead of JSON lines.
                 f.write(b"[\n")
-            for row in self.
+            for row in self._leaf_values():
                 if not is_first:
                     if include_outer_list:
                         # This makes the file JSON instead of JSON lines.
@@ -2244,7 +2263,7 @@ class DataChain:
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.
+        file_exporter.run(self.to_values(signal), progress_bar)

     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
@@ -2378,3 +2397,72 @@ class DataChain:
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
+
+    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
+        """Returns a list of rows of values, optionally limited to the specified
+        columns.
+
+        Args:
+            *cols: Limit to the specified columns. By default, all columns are selected.
+
+        Returns:
+            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
+
+        Example:
+            Getting all rows as a list:
+            ```py
+            rows = dc.to_list()
+            print(rows)
+            ```
+
+            Getting all rows with selected columns as a list:
+            ```py
+            name_size_pairs = dc.to_list("file.path", "file.size")
+            print(name_size_pairs)
+            ```
+
+            Getting a single column as a list:
+            ```py
+            files = dc.to_list("file.path")
+            print(files)  # Returns list of 1-tuples
+            ```
+        """
+        return list(self.to_iter(*cols))
+
+    def to_values(self, col: str) -> list[DataValue]:
+        """Returns a flat list of values from a single column.
+
+        Args:
+            col: The name of the column to extract values from.
+
+        Returns:
+            list[DataValue]: Returns a flat list of values from the specified column.
+
+        Example:
+            Getting all values from a single column:
+            ```py
+            file_paths = dc.to_values("file.path")
+            print(file_paths)  # Returns list of strings
+            ```
+
+            Getting all file sizes:
+            ```py
+            sizes = dc.to_values("file.size")
+            print(sizes)  # Returns list of integers
+            ```
+        """
+        return [row[0] for row in self.to_list(col)]
+
+    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
+        """Make DataChain objects iterable.
+
+        Yields:
+            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
+
+        Example:
+            ```py
+            for row in chain:
+                print(row)
+            ```
+        """
+        return self.to_iter()
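The three additions are thin wrappers over `to_iter`, which makes their relationship easy to demonstrate. A quick sketch; the `read_values` data is illustrative:

```py
import datachain as dc

chain = dc.read_values(path=["a.txt", "b.txt"], size=[10, 20])

print(chain.to_list("path", "size"))  # [('a.txt', 10), ('b.txt', 20)]
print(chain.to_list("size"))          # [(10,), (20,)] - still 1-tuples
print(chain.to_values("size"))        # [10, 20] - flattened

# __iter__ delegates to to_iter(), so a chain is directly iterable:
for row in chain:
    print(row)  # a tuple of all column values
```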
datachain/lib/dc/datasets.py
CHANGED
@@ -33,7 +33,11 @@ def read_dataset(
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
     delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] =
+    delta_on: Optional[Union[str, Sequence[str]]] = (
+        "file.path",
+        "file.etag",
+        "file.version",
+    ),
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
@@ -53,41 +57,25 @@ def read_dataset(
        settings : Settings to use for the chain.
        fallback_to_studio : Try to pull dataset from Studio if not found locally.
            Default is True.
-       delta: If
-
-
-           dataset
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-       delta_result_on: A list of fields in the resulting dataset that correspond
-           to the `delta_on` fields from the source.
-           This is needed to identify rows that have changed in the source but are
-           already present in the current version of the resulting dataset, in order
-           to avoid including outdated versions of those rows in the new dataset.
-           We retain only the latest versions of rows to prevent duplication.
-           There is no need to define this if the `delta_on` fields are present in
-           the final dataset and have not been renamed.
-       delta_compare: A list of fields used to check if the same row has been modified
-           in the new version of the source.
-           If not defined, all fields except those defined in delta_on will be used.
-       delta_retry: Specifies retry behavior for delta processing. If a string,
-           it's the name of a field in the result dataset that indicates an error
-           when not None - records with errors will be reprocessed. If True,
-           records that exist in the source dataset but not in the result dataset
-           will be reprocessed.
+       delta: If True, only process new or changed files instead of reprocessing
+           everything. This saves time by skipping files that were already processed in
+           previous versions. The optimization is working when a new version of the
+           dataset is created.
+           Default is False.
+       delta_on: Field(s) that uniquely identify each record in the source data.
+           Used to detect which records are new or changed.
+           Default is ("file.path", "file.etag", "file.version").
+       delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+           Only needed if you rename the identifying fields during processing.
+           Default is None.
+       delta_compare: Field(s) used to detect if a record has changed.
+           If not specified, all fields except `delta_on` fields are used.
+           Default is None.
+       delta_retry: Controls retry behavior for failed records:
+           - String (field name): Reprocess records where this field is not empty
+             (error mode)
+           - True: Reprocess records missing from the result dataset (missing mode)
+           - None: No retry processing (default)

        Example:
            ```py
@@ -228,7 +216,7 @@ def datasets(
        import datachain as dc

        chain = dc.datasets(column="dataset")
-       for ds in chain.
+       for ds in chain.to_iter("dataset"):
            print(f"{ds.name}@v{ds.version}")
        ```
    """
datachain/lib/dc/storage.py
CHANGED
@@ -35,7 +35,11 @@ def read_storage(
     update: bool = False,
     anon: bool = False,
     delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] =
+    delta_on: Optional[Union[str, Sequence[str]]] = (
+        "file.path",
+        "file.etag",
+        "file.version",
+    ),
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
@@ -54,43 +58,25 @@ def read_storage(
        update : force storage reindexing. Default is False.
        anon : If True, we will treat cloud bucket as public one
        client_config : Optional client configuration for the storage client.
-       delta: If
-
-
-           dataset
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-       delta_result_on: A list of fields in the resulting dataset that correspond
-           to the `delta_on` fields from the source.
-           This is needed to identify rows that have changed in the source but are
-           already present in the current version of the resulting dataset, in order
-           to avoid including outdated versions of those rows in the new dataset.
-           We retain only the latest versions of rows to prevent duplication.
-           There is no need to define this if the `delta_on` fields are present in
-           the final dataset and have not been renamed.
-       delta_compare: A list of fields used to check if the same row has been modified
-           in the new version of the source.
-           If not defined, all fields except those defined in `delta_on` will be used.
-       delta_retry: Controls which records to reprocess. Can be:
-           - A string specifying a field name: Records where this field is not None
-             will be reprocessed (error checking mode).
-           - True: Records that exist in the source dataset but not in the result
-             dataset (based on delta_on/delta_result_on fields) will be reprocessed
-             (missing records mode).
-           - False or None: No retry processing.
+       delta: If True, only process new or changed files instead of reprocessing
+           everything. This saves time by skipping files that were already processed in
+           previous versions. The optimization is working when a new version of the
+           dataset is created.
+           Default is False.
+       delta_on: Field(s) that uniquely identify each record in the source data.
+           Used to detect which records are new or changed.
+           Default is ("file.path", "file.etag", "file.version").
+       delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+           Only needed if you rename the identifying fields during processing.
+           Default is None.
+       delta_compare: Field(s) used to detect if a record has changed.
+           If not specified, all fields except `delta_on` fields are used.
+           Default is None.
+       delta_retry: Controls retry behavior for failed records:
+           - String (field name): Reprocess records where this field is not empty
+             (error mode)
+           - True: Reprocess records missing from the result dataset (missing mode)
+           - None: No retry processing (default)

        Returns:
            DataChain: A DataChain object containing the file information.