datachain 0.21.0__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +180 -65
- datachain/cli/__init__.py +4 -9
- datachain/cli/commands/datasets.py +43 -28
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +422 -37
- datachain/data_storage/sqlite.py +136 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +126 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +260 -92
- datachain/lib/dc/datasets.py +104 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +1 -0
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +36 -10
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
datachain/func/func.py
CHANGED
datachain/lib/arrow.py
CHANGED
````diff
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
             fs_path = file.path
             fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-            fs, fs_path = file.get_fs(), file.
+            fs, fs_path = file.get_fs(), file.get_fs_path()

         kwargs = self.kwargs
         if format := kwargs.get("format"):
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
     kwargs["format"] = fix_pyarrow_format(format, parse_options)

     schemas = []
-    for file in chain.
-        ds = dataset(file.
+    for (file,) in chain.to_iter("file"):
+        ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
         schemas.append(ds.schema)
     if not schemas:
         raise ValueError(
````
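The hunks above switch to the `File.get_fs()`/`File.get_fs_path()` accessors and to the tuple-yielding `to_iter`. A minimal sketch of the resulting pattern, assuming any chain with a `file` column (the `chain` variable is illustrative):

```python
# Sketch only: iterate File objects and resolve filesystem + path the way
# infer_schema now does. `chain` is any DataChain with a `file` column.
for (file,) in chain.to_iter("file"):
    fs, fs_path = file.get_fs(), file.get_fs_path()
    print(type(fs).__name__, fs_path)
```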
datachain/lib/dataset_info.py
CHANGED
````diff
@@ -22,6 +22,8 @@ if TYPE_CHECKING:

 class DatasetInfo(DataModel):
     name: str
+    namespace: str
+    project: str
     uuid: str = Field(default=str(uuid4()))
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
@@ -91,6 +93,8 @@ class DatasetInfo(DataModel):
         return cls(
             uuid=version.uuid,
             name=dataset.name,
+            namespace=dataset.project.namespace.name,
+            project=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
````
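With `namespace` and `project` now part of `DatasetInfo`, dataset listings can surface fully qualified names. A hedged sketch, assuming `dc.datasets()` yields `DatasetInfo` objects under a `dataset` column as in the public datachain API:

```python
import datachain as dc

# Print fully qualified dataset names (sketch; the datasets() helper and the
# `dataset` column name are assumptions based on the public API).
for (info,) in dc.datasets().to_iter("dataset"):
    print(f"{info.namespace}.{info.project}.{info.name} v{info.version}")
```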
datachain/lib/dc/datachain.py
CHANGED
````diff
@@ -24,8 +24,9 @@ from pydantic import BaseModel
 from tqdm import tqdm

 from datachain import semver
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, parse_dataset_name
 from datachain.delta import delta_disabled
+from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -261,7 +262,7 @@ class DataChain:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
-        return self.session.catalog.get_dataset(self.name)
+        return self.session.catalog.get_dataset(self.name, self._query.project)

     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
````
````diff
@@ -312,6 +313,8 @@ class DataChain:
         min_task_size=None,
         prefetch: Optional[int] = None,
         sys: Optional[bool] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """Change settings for chain.

@@ -327,6 +330,8 @@ class DataChain:
             prefetch: number of workers to use for downloading files in advance.
                 This is enabled by default and uses 2 workers.
                 To disable prefetching, set it to 0.
+            namespace: namespace name.
+            project: project name.

         Example:
             ```py
````
````diff
@@ -340,7 +345,11 @@ class DataChain:
         if sys is None:
             sys = self._sys
         settings = copy.copy(self._settings)
-        settings.add(
+        settings.add(
+            Settings(
+                cache, parallel, workers, min_task_size, prefetch, namespace, project
+            )
+        )
         return self._evolve(settings=settings, _sys=sys)

     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
````
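Together, the new parameters and the expanded `Settings` constructor let a chain pin its target namespace and project once. A minimal sketch, assuming `read_values` from the public API (the `dev`/`clothes` names are made up):

```python
import datachain as dc

# Pin namespace/project via settings; save() then resolves "numbers" inside
# dev.clothes instead of the defaults. Names here are illustrative.
chain = (
    dc.read_values(num=[1, 2, 3])
    .settings(namespace="dev", project="clothes")
    .save("numbers")
)
```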
````diff
@@ -430,10 +439,10 @@ class DataChain:

         from datachain.lib.arrow import schema_to_output

-        json_values =
+        json_values = self.limit(schema_sample_size).to_list(col)
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for json_value in json_values
+            for (json_value,) in json_values
         ]

         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
````
````diff
@@ -490,6 +499,22 @@ class DataChain:
         )
         return listings(*args, **kwargs)

+    @property
+    def namespace_name(self) -> str:
+        """Current namespace name in which the chain is running"""
+        return (
+            self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+
+    @property
+    def project_name(self) -> str:
+        """Current project name in which the chain is running"""
+        return (
+            self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
+
     def persist(self) -> "Self":
         """Saves temporary chain that will be removed after the process ends.
         Temporary datasets are useful for optimization, for example when we have
````
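Per the properties above, settings take precedence over metastore defaults when resolving the effective names. A short sketch:

```python
# Sketch: settings win over metastore defaults when resolving names.
pinned = chain.settings(namespace="dev")  # "dev" is illustrative
print(pinned.namespace_name)  # "dev"
print(pinned.project_name)    # metastore default, since none was set
```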
````diff
@@ -499,7 +524,14 @@ class DataChain:
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-
+        project = self.session.catalog.metastore.get_project(
+            self.project_name,
+            self.namespace_name,
+            create=True,
+        )
+        return self._evolve(
+            query=self._query.save(project=project, feature_schema=schema)
+        )

     def save(  # type: ignore[override]
         self,
````
````diff
@@ -513,7 +545,10 @@ class DataChain:
         """Save to a Dataset. It returns the chain itself.

         Parameters:
-            name : dataset name.
+            name : dataset name. It can be full name consisting of namespace and
+                project, but it can also be just a regular dataset name in which
+                case we are taking namespace and project from settings, if they
+                are defined there, or default ones instead.
             version : version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
````
````diff
@@ -535,6 +570,29 @@ class DataChain:
                 " patch"
             )

+        namespace_name, project_name, name = parse_dataset_name(name)
+
+        namespace_name = (
+            namespace_name
+            or self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+        project_name = (
+            project_name
+            or self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
+
+        try:
+            project = self.session.catalog.metastore.get_project(
+                project_name,
+                namespace_name,
+                create=self.session.catalog.metastore.project_allowed_to_create,
+            )
+        except ProjectNotFoundError as e:
+            # not being able to create it as creation is not allowed
+            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
+
         schema = self.signals_schema.clone_without_sys_signals().serialize()

         # Handle retry and delta functionality
````
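The net effect: `save` now accepts either a plain dataset name or a fully qualified one, with `parse_dataset_name` splitting the latter. A sketch (names illustrative; assumes the project exists or the metastore allows creating it, otherwise `ProjectCreateNotAllowedError` is raised):

```python
# Fully qualified: namespace and project come from the name itself.
chain.save("dev.clothes.t-shirts")

# Plain name: namespace/project fall back to settings, then to defaults.
chain.settings(namespace="dev", project="clothes").save("t-shirts")
```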
````diff
@@ -558,6 +616,7 @@ class DataChain:
             query=result_ds._query.save(
                 name=name,
                 version=version,
+                project=project,
                 feature_schema=schema,
                 dependencies=dependencies,
                 **kwargs,
@@ -577,6 +636,7 @@ class DataChain:
             query=self._query.save(
                 name=name,
                 version=version,
+                project=project,
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
````
````diff
@@ -843,7 +903,7 @@ class DataChain:
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `
+        Using `order_by` directly before `limit`, `to_list` and similar methods
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
````
````diff
@@ -1048,32 +1108,32 @@ class DataChain:

     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like
+        """Effective schema used for user-facing API like to_list, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema

     @overload
-    def
+    def _leaf_values(self) -> Iterator[tuple[Any, ...]]: ...

     @overload
-    def
+    def _leaf_values(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...

     @overload
-    def
+    def _leaf_values(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...

     @overload
-    def
+    def _leaf_values(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...

-    def
+    def _leaf_values(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.

         Args:
````
````diff
@@ -1101,7 +1161,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]

-        results_iter = self.
+        results_iter = self._leaf_values()

         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1134,9 +1194,9 @@ class DataChain:

     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self.
+            return list(self._leaf_values(include_hidden=include_hidden))
         return list(
-            self.
+            self._leaf_values(row_factory=row_factory, include_hidden=include_hidden)
         )

     def to_records(self) -> list[dict[str, Any]]:
````
````diff
@@ -1147,42 +1207,38 @@ class DataChain:

         return self.results(row_factory=to_dict)

-
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+    def to_iter(self, *cols: str) -> Iterator[tuple[DataValue, ...]]:
         """Yields rows of values, optionally limited to the specified columns.

         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.

         Yields:
-            (DataType): Yields a
-            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
-                selected.
+            (tuple[DataType, ...]): Yields a tuple of items for each row.

         Example:
             Iterating over all rows:
             ```py
-            for row in
+            for row in ds.to_iter():
+                print(row)
+            ```
+
+            DataChain is iterable and can be used in a for loop directly which is
+            equivalent to `ds.to_iter()`:
+            ```py
+            for row in ds:
                 print(row)
             ```

             Iterating over all rows with selected columns:
             ```py
-            for name, size in
+            for name, size in ds.to_iter("file.path", "file.size"):
                 print(name, size)
             ```

             Iterating over a single column:
             ```py
-            for file in
+            for (file,) in ds.to_iter("file.path"):
                 print(file)
             ```
         """
````
````diff
@@ -1194,7 +1250,31 @@ class DataChain:
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
             )
-            yield
+            yield tuple(ret)
+
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+        """
+        Deprecated. Use `to_iter` method instead.
+        """
+        warnings.warn(
+            "Method `collect` is deprecated. Use `to_iter` method instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        if len(cols) == 1:
+            yield from [item[0] for item in self.to_iter(*cols)]
+        else:
+            yield from self.to_iter(*cols)

     def to_pytorch(
         self,
````
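For callers the migration is mechanical; the one behavioral difference is that `to_iter` always yields tuples, whereas single-column `collect` yielded bare values:

```python
# Before (deprecated, emits DeprecationWarning): bare values per row.
# for path in chain.collect("file.path"): ...

# After: rows are tuples, so unpack one-tuples explicitly.
for (path,) in chain.to_iter("file.path"):
    print(path)
```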
````diff
@@ -1429,7 +1509,7 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]

-    def
+    def diff(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
````
````diff
@@ -1442,41 +1522,33 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-        for all rows. Beside additional diff column, new chain has schema of the chain
-        on which method was called.
+        """Calculate differences between two chains.
+
+        This method identifies records that are added, deleted, modified, or unchanged
+        between two chains. It adds a status column with values: A=added, D=deleted,
+        M=modified, S=same.

         Parameters:
-            other: Chain to
-            on: Column
-            added (bool): Whether to return added rows in resulting chain.
-            deleted (bool): Whether to return deleted rows in resulting chain.
-            modified (bool): Whether to return modified rows in resulting chain.
-            same (bool): Whether to return unchanged rows in resulting chain.
-            status_col (str): Name of the new column that is created in resulting chain
-                representing diff status.
+            other: Chain to compare against.
+            on: Column(s) to match records between chains.
+            right_on: Column(s) in the other chain to match against. Defaults to `on`.
+            compare: Column(s) to check for changes.
+                If not specified, all columns are used.
+            right_compare: Column(s) in the other chain to compare against.
+                Defaults to values of `compare`.
+            added (bool): Include records that exist in this chain but not in the other.
+            deleted (bool): Include records that exist only in the other chain.
+            modified (bool): Include records that exist in both
+                but have different values.
+            same (bool): Include records that are identical in both chains.
+            status_col (str): Name for the status column showing differences.
+
+        Default behavior: By default, shows added, deleted, and modified records,
+        but excludes unchanged records (same=False). Status column is not created.

         Example:
             ```py
-            res = persons.
+            res = persons.diff(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
````
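A self-contained sketch of the renamed method (the old `compare` became `diff`); the column names are illustrative:

```python
# Keep added and modified persons, tagging each row in a "diff" column.
res = persons.diff(
    new_persons,
    on=["id"],
    compare=["name", "age"],  # illustrative columns
    added=True,
    modified=True,
    status_col="diff",
)
```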
````diff
@@ -1505,7 +1577,7 @@ class DataChain:
             status_col=status_col,
         )

-    def
+    def file_diff(
         self,
         other: "DataChain",
         on: str = "file",
````
````diff
@@ -1516,31 +1588,29 @@ class DataChain:
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+        """Calculate differences between two chains containing files.
+
+        This method is specifically designed for file chains. It uses file `source`
+        and `path` to match files, and file `version` and `etag` to detect changes.

         Parameters:
-            other: Chain to
-            on: File
-                resulting chain representing diff status.
+            other: Chain to compare against.
+            on: File column name in this chain. Default is "file".
+            right_on: File column name in the other chain. Defaults to `on`.
+            added (bool): Include files that exist in this chain but not in the other.
+            deleted (bool): Include files that exist only in the other chain.
+            modified (bool): Include files that exist in both but have different
+                versions/etags.
+            same (bool): Include files that are identical in both chains.
+            status_col (str): Name for the status column showing differences
+                (A=added, D=deleted, M=modified, S=same).
+
+        Default behavior: By default, includes only new files (added=True and
+        modified=True). This is useful for incremental processing.

         Example:
             ```py
-            diff = images.
+            diff = images.file_diff(
                 new_images,
                 on="file",
                 right_on="other_file",
````
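Because `file_diff` defaults to `added=True, modified=True`, the common incremental-processing case needs no extra flags. A sketch, assuming `images` and `new_images` are file chains:

```python
# Only files that are new or changed relative to `images` survive.
to_process = new_images.file_diff(images, on="file")
```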
````diff
@@ -1565,7 +1635,7 @@ class DataChain:
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)

-        return self.
+        return self.diff(
             other,
             on_cols,
             right_on=right_on_cols,
````
````diff
@@ -1977,7 +2047,7 @@ class DataChain:
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]

-        results_iter = self.
+        results_iter = self._leaf_values()

         with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2029,7 +2099,7 @@ class DataChain:
         if include_outer_list:
             # This makes the file JSON instead of JSON lines.
             f.write(b"[\n")
-        for row in self.
+        for row in self._leaf_values():
             if not is_first:
                 if include_outer_list:
                     # This makes the file JSON instead of JSON lines.
````
````diff
@@ -2194,7 +2264,7 @@ class DataChain:
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.
+        file_exporter.run(self.to_values(signal), progress_bar)

     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
````
````diff
@@ -2239,16 +2309,45 @@ class DataChain:

         Combining filters with "or"
         ```py
-        dc.filter(
+        dc.filter(
+            C("file.path").glob("cat*") |
+            C("file.path").glob("dog*")
+        )
+        ```
+
+        ```py
+        dc.filter(dc.func.or_(
+            C("file.path").glob("cat*"),
+            C("file.path").glob("dog*")
+        ))
         ```

         Combining filters with "and"
         ```py
         dc.filter(
-            C("file.path").glob("*.jpg)
+            C("file.path").glob("*.jpg"),
+            string.length(C("file.path")) > 5
+        )
+        ```
+
+        ```py
+        dc.filter(
+            C("file.path").glob("*.jpg") &
             (string.length(C("file.path")) > 5)
         )
         ```
+
+        ```py
+        dc.filter(dc.func.and_(
+            C("file.path").glob("*.jpg"),
+            string.length(C("file.path")) > 5
+        ))
+        ```
+
+        Combining filters with "not"
+        ```py
+        dc.filter(~(C("file.path").glob("*.jpg")))
+        ```
         """
         return self._evolve(query=self._query.filter(*args))
````
````diff
@@ -2299,3 +2398,72 @@ class DataChain:
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
+
+    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
+        """Returns a list of rows of values, optionally limited to the specified
+        columns.
+
+        Args:
+            *cols: Limit to the specified columns. By default, all columns are selected.
+
+        Returns:
+            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
+
+        Example:
+            Getting all rows as a list:
+            ```py
+            rows = dc.to_list()
+            print(rows)
+            ```
+
+            Getting all rows with selected columns as a list:
+            ```py
+            name_size_pairs = dc.to_list("file.path", "file.size")
+            print(name_size_pairs)
+            ```
+
+            Getting a single column as a list:
+            ```py
+            files = dc.to_list("file.path")
+            print(files)  # Returns list of 1-tuples
+            ```
+        """
+        return list(self.to_iter(*cols))
+
+    def to_values(self, col: str) -> list[DataValue]:
+        """Returns a flat list of values from a single column.
+
+        Args:
+            col: The name of the column to extract values from.
+
+        Returns:
+            list[DataValue]: Returns a flat list of values from the specified column.
+
+        Example:
+            Getting all values from a single column:
+            ```py
+            file_paths = dc.to_values("file.path")
+            print(file_paths)  # Returns list of strings
+            ```
+
+            Getting all file sizes:
+            ```py
+            sizes = dc.to_values("file.size")
+            print(sizes)  # Returns list of integers
+            ```
+        """
+        return [row[0] for row in self.to_list(col)]
+
+    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
+        """Make DataChain objects iterable.
+
+        Yields:
+            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
+
+        Example:
+            ```py
+            for row in chain:
+                print(row)
+            ```
+        """
+        return self.to_iter()
````