datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +213 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +35 -26
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +433 -37
- datachain/data_storage/sqlite.py +140 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +128 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +253 -91
- datachain/lib/dc/datasets.py +103 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +2 -1
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/signal_schema.py +8 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +23 -6
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
datachain/func/func.py
CHANGED
datachain/lib/arrow.py
CHANGED
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
             fs_path = file.path
             fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-            fs, fs_path = file.get_fs(), file.
+            fs, fs_path = file.get_fs(), file.get_fs_path()
 
         kwargs = self.kwargs
         if format := kwargs.get("format"):
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
         kwargs["format"] = fix_pyarrow_format(format, parse_options)
 
     schemas = []
-    for file in chain.collect("file"):
-        ds = dataset(file.
+    for (file,) in chain.to_iter("file"):
+        ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
         schemas.append(ds.schema)
     if not schemas:
         raise ValueError(
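For orientation, a minimal sketch of the new `File` accessors used above, outside of `ArrowGenerator`. The bucket URI is a placeholder, and the files are assumed to be parquet:

```py
import pyarrow.dataset as pa_ds

import datachain as dc

# Placeholder bucket; any fsspec-compatible storage works the same way.
chain = dc.read_storage("s3://example-bucket/tables/", type="binary")

schemas = []
# to_iter() always yields tuples, hence the (file,) unpacking for a single column.
for (file,) in chain.limit(3).to_iter("file"):
    # get_fs() returns the fsspec filesystem, get_fs_path() the path on it.
    ds = pa_ds.dataset(file.get_fs_path(), filesystem=file.get_fs(), format="parquet")
    schemas.append(ds.schema)
```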
datachain/lib/dataset_info.py
CHANGED
@@ -22,6 +22,8 @@ if TYPE_CHECKING:
 
 class DatasetInfo(DataModel):
     name: str
+    namespace: str
+    project: str
     uuid: str = Field(default=str(uuid4()))
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
@@ -91,6 +93,8 @@ class DatasetInfo(DataModel):
         return cls(
             uuid=version.uuid,
             name=dataset.name,
+            namespace=dataset.project.namespace.name,
+            project=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
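With the two new fields, the owning namespace and project are visible wherever `DatasetInfo` surfaces, e.g. when listing datasets. A sketch, assuming `dc.datasets()` keeps exposing `DatasetInfo` objects under its default `dataset` column:

```py
import datachain as dc

# Each DatasetInfo record now carries namespace and project next to the name.
for info in dc.datasets().to_values("dataset"):
    print(f"{info.namespace}.{info.project}.{info.name} v{info.version}")
```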
datachain/lib/dc/datachain.py
CHANGED
@@ -26,6 +26,7 @@ from tqdm import tqdm
 from datachain import semver
 from datachain.dataset import DatasetRecord
 from datachain.delta import delta_disabled
+from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -261,7 +262,7 @@ class DataChain:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
-        return self.session.catalog.get_dataset(self.name)
+        return self.session.catalog.get_dataset(self.name, self._query.project)
 
     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
@@ -312,6 +313,8 @@ class DataChain:
         min_task_size=None,
         prefetch: Optional[int] = None,
         sys: Optional[bool] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """Change settings for chain.
 
@@ -327,6 +330,8 @@ class DataChain:
             prefetch: number of workers to use for downloading files in advance.
                 This is enabled by default and uses 2 workers.
                 To disable prefetching, set it to 0.
+            namespace: namespace name.
+            project: project name.
 
         Example:
             ```py
@@ -340,7 +345,11 @@ class DataChain:
         if sys is None:
             sys = self._sys
         settings = copy.copy(self._settings)
-        settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
+        settings.add(
+            Settings(
+                cache, parallel, workers, min_task_size, prefetch, namespace, project
+            )
+        )
         return self._evolve(settings=settings, _sys=sys)
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
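A usage sketch for the new settings; namespace and project set here act as defaults for later `save()` calls ("dev" and "analytics" are hypothetical names):

```py
import datachain as dc

chain = (
    dc.read_values(num=[1, 2, 3])
    # Hypothetical namespace/project; picked up by save() below.
    .settings(namespace="dev", project="analytics")
)
chain.save("numbers")  # stored as dev.analytics.numbers
```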
@@ -430,10 +439,10 @@ class DataChain:
 
         from datachain.lib.arrow import schema_to_output
 
-        json_values = self.limit(schema_sample_size).collect(col)
+        json_values = self.limit(schema_sample_size).to_list(col)
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for json_value in json_values
+            for (json_value,) in json_values
         ]
 
         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
@@ -490,6 +499,22 @@ class DataChain:
         )
         return listings(*args, **kwargs)
 
+    @property
+    def namespace_name(self) -> str:
+        """Current namespace name in which the chain is running"""
+        return (
+            self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+
+    @property
+    def project_name(self) -> str:
+        """Current project name in which the chain is running"""
+        return (
+            self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
+
     def persist(self) -> "Self":
         """Saves temporary chain that will be removed after the process ends.
         Temporary datasets are useful for optimization, for example when we have
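A small sketch of the resolution order these properties implement — explicit settings first, then the metastore defaults (the printed defaults depend on the metastore):

```py
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])

# Nothing set: falls back to the metastore defaults.
print(chain.namespace_name, chain.project_name)

# Settings win over the defaults ("dev"/"analytics" are placeholders).
scoped = chain.settings(namespace="dev", project="analytics")
print(scoped.namespace_name, scoped.project_name)  # dev analytics
```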
@@ -499,7 +524,14 @@ class DataChain:
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        return self._evolve(query=self._query.save(feature_schema=schema))
+        project = self.session.catalog.metastore.get_project(
+            self.project_name,
+            self.namespace_name,
+            create=True,
+        )
+        return self._evolve(
+            query=self._query.save(project=project, feature_schema=schema)
+        )
 
     def save(  # type: ignore[override]
         self,
@@ -513,7 +545,10 @@ class DataChain:
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:
-            name : dataset name.
+            name : dataset name. It can be full name consisting of namespace and
+                project, but it can also be just a regular dataset name in which
+                case we are taking namespace and project from settings, if they
+                are defined there, or default ones instead.
             version : version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
@@ -522,6 +557,7 @@ class DataChain:
             update_version: which part of the dataset version to automatically increase.
                 Available values: `major`, `minor` or `patch`. Default is `patch`.
         """
+        catalog = self.session.catalog
         if version is not None:
             semver.validate(version)
 
@@ -535,6 +571,22 @@ class DataChain:
                 " patch"
             )
 
+        namespace_name, project_name, name = catalog.get_full_dataset_name(
+            name,
+            namespace_name=self._settings.namespace,
+            project_name=self._settings.project,
+        )
+
+        try:
+            project = self.session.catalog.metastore.get_project(
+                project_name,
+                namespace_name,
+                create=self.session.catalog.metastore.project_allowed_to_create,
+            )
+        except ProjectNotFoundError as e:
+            # not being able to create it as creation is not allowed
+            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
+
         schema = self.signals_schema.clone_without_sys_signals().serialize()
 
         # Handle retry and delta functionality
@@ -558,6 +610,7 @@ class DataChain:
             query=result_ds._query.save(
                 name=name,
                 version=version,
+                project=project,
                 feature_schema=schema,
                 dependencies=dependencies,
                 **kwargs,
@@ -577,6 +630,7 @@ class DataChain:
             query=self._query.save(
                 name=name,
                 version=version,
+                project=project,
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
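Putting the `save()` changes together: a name may be fully qualified, or plain with namespace and project resolved from settings or defaults. A sketch with placeholder names; on metastores where project creation is not allowed, `ProjectCreateNotAllowedError` is raised instead:

```py
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])

# Fully qualified name: namespace.project.name.
chain.save("dev.analytics.numbers")

# Plain name: namespace/project come from settings (or the defaults).
chain.settings(namespace="dev", project="analytics").save("numbers")
```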
@@ -843,7 +897,7 @@ class DataChain:
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `collect` and `collect_flatten`
+        Using `order_by` directly before `limit`, `to_list` and similar methods
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
@@ -1048,32 +1102,32 @@ class DataChain:
 
     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like collect, to_pandas, etc."""
+        """Effective schema used for user-facing API like to_list, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema
 
     @overload
-    def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
+    def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
+    def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def collect_flatten(
+    def _leaf_values(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...
 
     @overload
-    def collect_flatten(
+    def _leaf_values(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...
 
-    def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
+    def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.
 
         Args:
@@ -1101,7 +1155,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.collect_flatten()
+        results_iter = self._leaf_values()
 
         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1134,9 +1188,9 @@ class DataChain:
 
     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self.collect_flatten(include_hidden=include_hidden))
+            return list(self._leaf_values(include_hidden=include_hidden))
         return list(
-            self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
+            self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
         )
 
     def to_records(self) -> list[dict[str, Any]]:
@@ -1147,42 +1201,38 @@ class DataChain:
 
         return self.results(row_factory=to_dict)
 
-    @overload
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+    def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
         """Yields rows of values, optionally limited to the specified columns.
 
         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.
 
         Yields:
-            (DataType): Yields a single item if a column is selected.
-            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
-                selected.
+            (tuple[DataType, ...]): Yields a tuple of items for each row.
 
         Example:
             Iterating over all rows:
             ```py
-            for row in ds.collect():
+            for row in ds.to_iter():
+                print(row)
+            ```
+
+            DataChain is iterable and can be used in a for loop directly which is
+            equivalent to `ds.to_iter()`:
+            ```py
+            for row in ds:
                 print(row)
             ```
 
             Iterating over all rows with selected columns:
             ```py
-            for name, size in ds.collect("file.path", "file.size"):
+            for name, size in ds.to_iter("file.path", "file.size"):
                 print(name, size)
             ```
 
             Iterating over a single column:
            ```py
-            for file in ds.collect("file.path"):
+            for (file,) in ds.to_iter("file.path"):
                 print(file)
             ```
         """
@@ -1194,7 +1244,31 @@ class DataChain:
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
             )
-            yield ret[0] if len(cols) == 1 else tuple(ret)
+            yield tuple(ret)
+
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+        """
+        Deprecated. Use `to_iter` method instead.
+        """
+        warnings.warn(
+            "Method `collect` is deprecated. Use `to_iter` method instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        if len(cols) == 1:
+            yield from [item[0] for item in self.to_iter(*cols)]
+        else:
+            yield from self.to_iter(*cols)
 
     def to_pytorch(
         self,
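A migration sketch for the deprecation: `collect()` yielded bare values for a single column, while `to_iter()` always yields tuples:

```py
import datachain as dc

chain = dc.read_values(name=["a", "b"], size=[1, 2])

# Before (now emits DeprecationWarning): bare values for a single column.
names = list(chain.collect("name"))

# After: to_iter() yields 1-tuples, so unpack explicitly.
names = [name for (name,) in chain.to_iter("name")]

# Multi-column iteration only changes the method name.
for name, size in chain.to_iter("name", "size"):
    print(name, size)
```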
@@ -1429,7 +1503,7 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
-    def compare(
+    def diff(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
@@ -1442,41 +1516,33 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
-
-        for all rows. Beside additional diff column, new chain has schema of the chain
-        on which method was called.
+        """Calculate differences between two chains.
+
+        This method identifies records that are added, deleted, modified, or unchanged
+        between two chains. It adds a status column with values: A=added, D=deleted,
+        M=modified, S=same.
 
         Parameters:
-            other: Chain to
-            on: Column
-
-
-
-
-
-
-
-
-
-
-
-
-
-            added (bool): Whether to return added rows in resulting chain.
-            deleted (bool): Whether to return deleted rows in resulting chain.
-            modified (bool): Whether to return modified rows in resulting chain.
-            same (bool): Whether to return unchanged rows in resulting chain.
-            status_col (str): Name of the new column that is created in resulting chain
-                representing diff status.
+            other: Chain to compare against.
+            on: Column(s) to match records between chains.
+            right_on: Column(s) in the other chain to match against. Defaults to `on`.
+            compare: Column(s) to check for changes.
+                If not specified, all columns are used.
+            right_compare: Column(s) in the other chain to compare against.
+                Defaults to values of `compare`.
+            added (bool): Include records that exist in this chain but not in the other.
+            deleted (bool): Include records that exist only in the other chain.
+            modified (bool): Include records that exist in both
+                but have different values.
+            same (bool): Include records that are identical in both chains.
+            status_col (str): Name for the status column showing differences.
+
+        Default behavior: By default, shows added, deleted, and modified records,
+        but excludes unchanged records (same=False). Status column is not created.
 
         Example:
             ```py
-            res = persons.compare(
+            res = persons.diff(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
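A self-contained sketch of the renamed method over in-memory chains (ids and values are made up):

```py
import datachain as dc

old = dc.read_values(id=[1, 2, 3], value=["a", "b", "c"])
new = dc.read_values(id=[2, 3, 4], value=["b", "x", "y"])

# status_col adds the A/D/M/S flags; by default only added, deleted,
# and modified records are returned (same=False).
result = new.diff(old, on="id", status_col="diff")
for status, id_, value in result.to_iter("diff", "id", "value"):
    print(status, id_, value)  # e.g. ("A", 4, "y"), ("M", 3, "x"), ("D", 1, "a")
```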
@@ -1505,7 +1571,7 @@ class DataChain:
             status_col=status_col,
         )
 
-    def diff(
+    def file_diff(
         self,
         other: "DataChain",
         on: str = "file",
@@ -1516,31 +1582,29 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
-        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+        """Calculate differences between two chains containing files.
+
+        This method is specifically designed for file chains. It uses file `source`
+        and `path` to match files, and file `version` and `etag` to detect changes.
 
         Parameters:
-            other: Chain to
-            on: File
-
-
-
-
-
-
-
-
-
-
-
-                resulting chain representing diff status.
+            other: Chain to compare against.
+            on: File column name in this chain. Default is "file".
+            right_on: File column name in the other chain. Defaults to `on`.
+            added (bool): Include files that exist in this chain but not in the other.
+            deleted (bool): Include files that exist only in the other chain.
+            modified (bool): Include files that exist in both but have different
+                versions/etags.
+            same (bool): Include files that are identical in both chains.
+            status_col (str): Name for the status column showing differences
+                (A=added, D=deleted, M=modified, S=same).
+
+        Default behavior: By default, includes only new files (added=True and
+        modified=True). This is useful for incremental processing.
 
         Example:
             ```py
-            diff = images.diff(
+            diff = images.file_diff(
                 new_images,
                 on="file",
                 right_on="other_file",
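The defaults (added and modified only) make `file_diff` a natural fit for incremental processing. A sketch; the dataset name and bucket URI are placeholders:

```py
import datachain as dc

processed = dc.read_dataset("images")                     # previously saved snapshot
current = dc.read_storage("s3://example-bucket/images/")  # current bucket state

# Defaults keep only files that are new or changed since the snapshot.
new_or_changed = current.file_diff(processed, on="file")
new_or_changed.save("images")
```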
@@ -1565,7 +1629,7 @@ class DataChain:
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)
 
-        return self.compare(
+        return self.diff(
             other,
             on_cols,
             right_on=right_on_cols,
@@ -1977,7 +2041,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.collect_flatten()
+        results_iter = self._leaf_values()
 
         with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2029,7 +2093,7 @@ class DataChain:
             if include_outer_list:
                 # This makes the file JSON instead of JSON lines.
                 f.write(b"[\n")
-            for row in self.collect_flatten():
+            for row in self._leaf_values():
                 if not is_first:
                     if include_outer_list:
                         # This makes the file JSON instead of JSON lines.
@@ -2194,7 +2258,7 @@ class DataChain:
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.collect(signal), progress_bar)
+        file_exporter.run(self.to_values(signal), progress_bar)
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
@@ -2239,16 +2303,45 @@ class DataChain:
 
             Combining filters with "or"
             ```py
-            dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*"))
+            dc.filter(
+                C("file.path").glob("cat*") |
+                C("file.path").glob("dog*")
+            )
+            ```
+
+            ```py
+            dc.filter(dc.func.or_(
+                C("file.path").glob("cat*"),
+                C("file.path").glob("dog*")
+            ))
             ```
 
             Combining filters with "and"
             ```py
             dc.filter(
-                C("file.path").glob("*.jpg)
+                C("file.path").glob("*.jpg"),
+                string.length(C("file.path")) > 5
+            )
+            ```
+
+            ```py
+            dc.filter(
+                C("file.path").glob("*.jpg") &
                 (string.length(C("file.path")) > 5)
             )
             ```
+
+            ```py
+            dc.filter(dc.func.and_(
+                C("file.path").glob("*.jpg"),
+                string.length(C("file.path")) > 5
+            ))
+            ```
+
+            Combining filters with "not"
+            ```py
+            dc.filter(~(C("file.path").glob("*.jpg")))
+            ```
         """
         return self._evolve(query=self._query.filter(*args))
 
@@ -2299,3 +2392,72 @@ class DataChain:
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
+
+    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
+        """Returns a list of rows of values, optionally limited to the specified
+        columns.
+
+        Args:
+            *cols: Limit to the specified columns. By default, all columns are selected.
+
+        Returns:
+            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
+
+        Example:
+            Getting all rows as a list:
+            ```py
+            rows = dc.to_list()
+            print(rows)
+            ```
+
+            Getting all rows with selected columns as a list:
+            ```py
+            name_size_pairs = dc.to_list("file.path", "file.size")
+            print(name_size_pairs)
+            ```
+
+            Getting a single column as a list:
+            ```py
+            files = dc.to_list("file.path")
+            print(files)  # Returns list of 1-tuples
+            ```
+        """
+        return list(self.to_iter(*cols))
+
+    def to_values(self, col: str) -> list[DataValue]:
+        """Returns a flat list of values from a single column.
+
+        Args:
+            col: The name of the column to extract values from.
+
+        Returns:
+            list[DataValue]: Returns a flat list of values from the specified column.
+
+        Example:
+            Getting all values from a single column:
+            ```py
+            file_paths = dc.to_values("file.path")
+            print(file_paths)  # Returns list of strings
+            ```
+
+            Getting all file sizes:
+            ```py
+            sizes = dc.to_values("file.size")
+            print(sizes)  # Returns list of integers
+            ```
+        """
+        return [row[0] for row in self.to_list(col)]
+
+    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
+        """Make DataChain objects iterable.
+
+        Yields:
+            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
+
+        Example:
+            ```py
+            for row in chain:
+                print(row)
+            ```
+        """
+        return self.to_iter()
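The three additions side by side, sketched over an in-memory chain:

```py
import datachain as dc

chain = dc.read_values(path=["a.jpg", "b.jpg"], size=[10, 20])

rows = chain.to_list()           # e.g. [("a.jpg", 10), ("b.jpg", 20)]
paths = chain.to_list("path")    # e.g. [("a.jpg",), ("b.jpg",)] -- still 1-tuples
sizes = chain.to_values("size")  # e.g. [10, 20] -- flattened values

# __iter__ delegates to to_iter(), so the chain itself is iterable.
for row in chain:
    print(row)
```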