datachain 0.20.4__py3-none-any.whl → 0.21.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/__init__.py +0 -2
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +65 -180
- datachain/cli/__init__.py +11 -2
- datachain/cli/commands/datasets.py +28 -43
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +35 -1
- datachain/client/fsspec.py +3 -5
- datachain/client/hf.py +0 -10
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +37 -405
- datachain/data_storage/sqlite.py +7 -136
- datachain/data_storage/warehouse.py +7 -26
- datachain/dataset.py +12 -126
- datachain/delta.py +7 -11
- datachain/error.py +0 -36
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +0 -4
- datachain/lib/dc/datachain.py +92 -260
- datachain/lib/dc/datasets.py +50 -104
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +0 -1
- datachain/lib/dc/storage.py +40 -38
- datachain/lib/file.py +23 -77
- datachain/lib/listing.py +1 -3
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +0 -10
- datachain/lib/tar.py +2 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +20 -30
- datachain/listing.py +1 -3
- datachain/query/dataset.py +46 -71
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +26 -61
- datachain/studio.py +20 -27
- {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/METADATA +2 -2
- {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/RECORD +43 -47
- datachain/lib/namespaces.py +0 -71
- datachain/lib/projects.py +0 -86
- datachain/namespace.py +0 -65
- datachain/project.py +0 -78
- {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/WHEEL +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/top_level.txt +0 -0
datachain/func/func.py
CHANGED
datachain/lib/arrow.py
CHANGED
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
             fs_path = file.path
             fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-            fs, fs_path = file.get_fs(), file.
+            fs, fs_path = file.get_fs(), file.get_path()
 
         kwargs = self.kwargs
         if format := kwargs.get("format"):
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
     kwargs["format"] = fix_pyarrow_format(format, parse_options)
 
     schemas = []
-    for
-    ds = dataset(file.
+    for file in chain.collect("file"):
+        ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
         schemas.append(ds.schema)
     if not schemas:
         raise ValueError(
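For context on the new call shape: pyarrow's `dataset()` accepts a path together with an explicit `filesystem`, which is exactly what `file.get_path()` and `file.get_fs()` now supply. A minimal standalone sketch of that pattern (the temp-file path is illustrative, not taken from the diff):

```py
# Sketch of the path + filesystem pattern the new infer_schema code relies on.
import pyarrow as pa
import pyarrow.dataset as pa_ds
import pyarrow.parquet as pq
from pyarrow.fs import LocalFileSystem

pq.write_table(pa.table({"x": [1, 2, 3]}), "/tmp/example.parquet")
ds = pa_ds.dataset("/tmp/example.parquet", filesystem=LocalFileSystem())
print(ds.schema)  # one schema per file, appended to `schemas` in the hunk above
```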
datachain/lib/dataset_info.py
CHANGED
@@ -22,8 +22,6 @@ if TYPE_CHECKING:
 
 class DatasetInfo(DataModel):
     name: str
-    namespace: str
-    project: str
     uuid: str = Field(default=str(uuid4()))
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
@@ -93,8 +91,6 @@ class DatasetInfo(DataModel):
         return cls(
             uuid=version.uuid,
             name=dataset.name,
-            namespace=dataset.project.namespace.name,
-            project=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
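The removed fields are a small but breaking model change for anything that constructed `DatasetInfo` directly. A hedged sketch of the new shape; `DataModel`, the remaining fields, and the class name here are simplified stand-ins:

```py
# Standalone approximation of the 0.21.1 shape: namespace/project are gone.
from uuid import uuid4
from pydantic import BaseModel, Field

class DatasetInfoSketch(BaseModel):  # the real class derives from DataModel
    name: str
    uuid: str = Field(default_factory=lambda: str(uuid4()))
    version: str = "1.0.0"  # stands in for DEFAULT_DATASET_VERSION

info = DatasetInfoSketch(name="cats")  # 0.20.4 also required namespace/project
print(info.model_dump())
```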
datachain/lib/dc/datachain.py
CHANGED
@@ -24,9 +24,8 @@ from pydantic import BaseModel
 from tqdm import tqdm
 
 from datachain import semver
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord
 from datachain.delta import delta_disabled
-from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -262,7 +261,7 @@ class DataChain:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
-        return self.session.catalog.get_dataset(self.name
+        return self.session.catalog.get_dataset(self.name)
 
     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
@@ -313,8 +312,6 @@ class DataChain:
         min_task_size=None,
         prefetch: Optional[int] = None,
         sys: Optional[bool] = None,
-        namespace: Optional[str] = None,
-        project: Optional[str] = None,
     ) -> "Self":
         """Change settings for chain.
 
@@ -330,8 +327,6 @@
             prefetch: number of workers to use for downloading files in advance.
                 This is enabled by default and uses 2 workers.
                 To disable prefetching, set it to 0.
-            namespace: namespace name.
-            project: project name.
 
         Example:
             ```py
@@ -345,11 +340,7 @@
         if sys is None:
             sys = self._sys
         settings = copy.copy(self._settings)
-        settings.add(
-            Settings(
-                cache, parallel, workers, min_task_size, prefetch, namespace, project
-            )
-        )
+        settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
        return self._evolve(settings=settings, _sys=sys)
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
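In practice the 0.21.1 call site looks like the sketch below; the storage URI is illustrative, and `read_storage` is assumed to be the chain constructor as in current datachain releases:

```py
import datachain as dc

chain = (
    dc.read_storage("s3://my-bucket/images/")  # illustrative source
    .settings(cache=True, parallel=4, prefetch=2)
)
# Passing namespace=... or project=... now raises a TypeError, since the
# keywords were removed from the signature above.
```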
@@ -439,10 +430,10 @@
 
         from datachain.lib.arrow import schema_to_output
 
-        json_values = self.limit(schema_sample_size).
+        json_values = list(self.limit(schema_sample_size).collect(col))
         json_dicts = [
             json.loads(json_value) if isinstance(json_value, str) else json_value
-            for
+            for json_value in json_values
         ]
 
         if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
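The sampling logic in this hunk is compact enough to restate standalone; a sketch with a hypothetical helper name:

```py
import json

def sample_json_dicts(values, schema_sample_size=100):
    # Mirrors the hunk: take a bounded sample, parse strings as JSON,
    # and pass through values that are already dicts.
    sampled = list(values)[:schema_sample_size]
    return [json.loads(v) if isinstance(v, str) else v for v in sampled]

print(sample_json_dicts(['{"a": 1}', {"b": 2}]))  # [{'a': 1}, {'b': 2}]
```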
@@ -499,22 +490,6 @@
         )
         return listings(*args, **kwargs)
 
-    @property
-    def namespace_name(self) -> str:
-        """Current namespace name in which the chain is running"""
-        return (
-            self._settings.namespace
-            or self.session.catalog.metastore.default_namespace_name
-        )
-
-    @property
-    def project_name(self) -> str:
-        """Current project name in which the chain is running"""
-        return (
-            self._settings.project
-            or self.session.catalog.metastore.default_project_name
-        )
-
     def persist(self) -> "Self":
         """Saves temporary chain that will be removed after the process ends.
         Temporary datasets are useful for optimization, for example when we have
@@ -524,14 +499,7 @@
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-
-            self.project_name,
-            self.namespace_name,
-            create=True,
-        )
-        return self._evolve(
-            query=self._query.save(project=project, feature_schema=schema)
-        )
+        return self._evolve(query=self._query.save(feature_schema=schema))
 
     def save(  # type: ignore[override]
         self,
@@ -545,10 +513,7 @@
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:
-            name : dataset name.
-                project, but it can also be just a regular dataset name in which
-                case we are taking namespace and project from settings, if they
-                are defined there, or default ones instead.
+            name : dataset name.
             version : version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
@@ -570,29 +535,6 @@
                 " patch"
             )
 
-        namespace_name, project_name, name = parse_dataset_name(name)
-
-        namespace_name = (
-            namespace_name
-            or self._settings.namespace
-            or self.session.catalog.metastore.default_namespace_name
-        )
-        project_name = (
-            project_name
-            or self._settings.project
-            or self.session.catalog.metastore.default_project_name
-        )
-
-        try:
-            project = self.session.catalog.metastore.get_project(
-                project_name,
-                namespace_name,
-                create=self.session.catalog.metastore.project_allowed_to_create,
-            )
-        except ProjectNotFoundError as e:
-            # not being able to create it as creation is not allowed
-            raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
-
         schema = self.signals_schema.clone_without_sys_signals().serialize()
 
         # Handle retry and delta functionality
@@ -616,7 +558,6 @@
             query=result_ds._query.save(
                 name=name,
                 version=version,
-                project=project,
                 feature_schema=schema,
                 dependencies=dependencies,
                 **kwargs,
@@ -636,7 +577,6 @@
             query=self._query.save(
                 name=name,
                 version=version,
-                project=project,
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
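After these hunks, `save()` takes a plain dataset name and no longer parses namespace/project out of it or calls the metastore's project API. A hedged usage sketch (dataset names are illustrative):

```py
import datachain as dc

chain = dc.read_dataset("source-data")
chain.save("filtered-data", description="rows kept after cleanup")
# With no explicit version, an existing dataset gets a patch bump,
# e.g. 1.2.1 -> 1.2.2, per the docstring above.
```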
@@ -903,7 +843,7 @@
         Order is not guaranteed when steps are added after an `order_by` statement.
         I.e. when using `read_dataset` an `order_by` statement should be used if
         the order of the records in the chain is important.
-        Using `order_by` directly before `limit`, `
+        Using `order_by` directly before `limit`, `collect` and `collect_flatten`
         will give expected results.
         See https://github.com/iterative/datachain/issues/477 for further details.
         """
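A concrete shape for the guidance above; the chain and columns are illustrative, and the `descending` keyword is assumed to be supported by `order_by` as in current releases:

```py
# order_by placed immediately before limit/collect, per the note above.
largest = chain.order_by("file.size", descending=True).limit(10)
for path, size in largest.collect("file.path", "file.size"):
    print(path, size)
```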
@@ -1108,32 +1048,32 @@
 
     @property
     def _effective_signals_schema(self) -> "SignalSchema":
-        """Effective schema used for user-facing API like
+        """Effective schema used for user-facing API like collect, to_pandas, etc."""
         signals_schema = self.signals_schema
         if not self._sys:
             return signals_schema.clone_without_sys_signals()
         return signals_schema
 
     @overload
-    def
+    def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def
+    def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
 
     @overload
-    def
+    def collect_flatten(
         self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
     ) -> Iterator[_T]: ...
 
     @overload
-    def
+    def collect_flatten(
         self,
         *,
         row_factory: Callable[[list[str], tuple[Any, ...]], _T],
         include_hidden: bool,
     ) -> Iterator[_T]: ...
 
-    def
+    def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
         """Yields flattened rows of values as a tuple.
 
         Args:
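The `row_factory` hook shown in the restored signature receives the flat column names plus one row tuple; returning a dict reproduces what `to_records()` does through `results(row_factory=to_dict)` in the next hunks. A sketch with an illustrative chain:

```py
from typing import Any

def to_dict(cols: list[str], row: tuple[Any, ...]) -> dict[str, Any]:
    return dict(zip(cols, row))

for record in chain.collect_flatten(row_factory=to_dict):
    print(record)  # e.g. {"file.path": ..., "file.size": ...}
```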
@@ -1161,7 +1101,7 @@
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.
+        results_iter = self.collect_flatten()
 
         def column_chunks() -> Iterator[list[list[Any]]]:
             for chunk_iter in batched_it(results_iter, chunk_size):
@@ -1194,9 +1134,9 @@
 
     def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
-            return list(self.
+            return list(self.collect_flatten(include_hidden=include_hidden))
         return list(
-            self.
+            self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
         )
 
     def to_records(self) -> list[dict[str, Any]]:
@@ -1207,38 +1147,42 @@
 
         return self.results(row_factory=to_dict)
 
-
+    @overload
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
+
+    @overload
+    def collect(self, col: str) -> Iterator[DataValue]: ...
+
+    @overload
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
+
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
         """Yields rows of values, optionally limited to the specified columns.
 
         Args:
             *cols: Limit to the specified columns. By default, all columns are selected.
 
         Yields:
-            (
+            (DataType): Yields a single item if a column is selected.
+            (tuple[DataType, ...]): Yields a tuple of items if multiple columns are
+                selected.
 
         Example:
             Iterating over all rows:
             ```py
-            for row in
-                print(row)
-            ```
-
-            DataChain is iterable and can be used in a for loop directly which is
-            equivalent to `ds.to_iter()`:
-            ```py
-            for row in ds:
+            for row in dc.collect():
                 print(row)
             ```
 
             Iterating over all rows with selected columns:
             ```py
-            for name, size in
+            for name, size in dc.collect("file.path", "file.size"):
                 print(name, size)
             ```
 
             Iterating over a single column:
             ```py
-            for
+            for file in dc.collect("file.path"):
                 print(file)
             ```
         """
@@ -1250,31 +1194,7 @@
             ret = signals_schema.row_to_features(
                 row, catalog=chain.session.catalog, cache=chain._settings.cache
             )
-            yield tuple(ret)
-
-    @overload
-    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
-
-    @overload
-    def collect(self, col: str) -> Iterator[DataValue]: ...
-
-    @overload
-    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
-
-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
-        """
-        Deprecated. Use `to_iter` method instead.
-        """
-        warnings.warn(
-            "Method `collect` is deprecated. Use `to_iter` method instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
-        if len(cols) == 1:
-            yield from [item[0] for item in self.to_iter(*cols)]
-        else:
-            yield from self.to_iter(*cols)
+            yield ret[0] if len(cols) == 1 else tuple(ret)
 
     def to_pytorch(
         self,
@@ -1509,7 +1429,7 @@
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
-    def
+    def compare(
         self,
         other: "DataChain",
         on: Union[str, Sequence[str]],
@@ -1522,33 +1442,41 @@
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
-
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
 
         Parameters:
-            other: Chain to
-            on: Column
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
 
         Example:
             ```py
-            res = persons.
+            res = persons.compare(
                 new_persons,
                 on=["id"],
                 right_on=["other_id"],
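The docstring example above is cut off by the hunk; a hedged completion, with column names invented for illustration and keywords taken from the documented parameters:

```py
res = persons.compare(
    new_persons,
    on=["id"],
    right_on=["other_id"],
    compare=["name"],
    deleted=False,
    same=False,
    status_col="diff",
)
# res keeps persons' schema plus a "diff" column holding A/M values only,
# since deleted and unchanged rows were excluded.
```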
@@ -1577,7 +1505,7 @@
                 status_col=status_col,
             )
 
-    def
+    def diff(
         self,
         other: "DataChain",
         on: str = "file",
@@ -1588,29 +1516,31 @@
         same: bool = False,
         status_col: Optional[str] = None,
     ) -> "DataChain":
-        """
-
-
-
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
 
         Parameters:
-            other: Chain to
-            on: File
-
-
-
-
-
-
-
-
-
-
-
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
 
         Example:
             ```py
-            diff = images.
+            diff = images.diff(
                 new_images,
                 on="file",
                 right_on="other_file",
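Likewise, a hedged completion of the truncated `diff()` example; keeping only added rows means the status column can be omitted entirely, per the docstring note in `.compare()`:

```py
new_only = images.diff(
    new_images,
    on="file",
    right_on="other_file",
    deleted=False,
    modified=False,
)
```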
@@ -1635,7 +1565,7 @@
         compare_cols = get_file_signals(on, compare_file_signals)
         right_compare_cols = get_file_signals(right_on, compare_file_signals)
 
-        return self.
+        return self.compare(
             other,
             on_cols,
             right_on=right_on_cols,
@@ -2047,7 +1977,7 @@
         headers, _ = self._effective_signals_schema.get_headers_with_length()
         column_names = [".".join(filter(None, header)) for header in headers]
 
-        results_iter = self.
+        results_iter = self.collect_flatten()
 
         with opener(path, "w", newline="") as f:
             writer = csv.writer(f, delimiter=delimiter, **kwargs)
@@ -2099,7 +2029,7 @@
         if include_outer_list:
             # This makes the file JSON instead of JSON lines.
             f.write(b"[\n")
-        for row in self.
+        for row in self.collect_flatten():
             if not is_first:
                 if include_outer_list:
                     # This makes the file JSON instead of JSON lines.
@@ -2264,7 +2194,7 @@
             max_threads=num_threads or 1,
             client_config=client_config,
         )
-        file_exporter.run(self.
+        file_exporter.run(self.collect(signal), progress_bar)
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
@@ -2309,45 +2239,16 @@
 
         Combining filters with "or"
         ```py
-        dc.filter(
-            C("file.path").glob("cat*") |
-            C("file.path").glob("dog*")
-        )
-        ```
-
-        ```py
-        dc.filter(dc.func.or_(
-            C("file.path").glob("cat*"),
-            C("file.path").glob("dog*")
-        ))
+        dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*"))
         ```
 
         Combining filters with "and"
         ```py
         dc.filter(
-            C("file.path").glob("*.jpg
-            string.length(C("file.path")) > 5
-        )
-        ```
-
-        ```py
-        dc.filter(
-            C("file.path").glob("*.jpg") &
+            C("file.path").glob("*.jpg") &
             (string.length(C("file.path")) > 5)
         )
         ```
-
-        ```py
-        dc.filter(dc.func.and_(
-            C("file.path").glob("*.jpg"),
-            string.length(C("file.path")) > 5
-        ))
-        ```
-
-        Combining filters with "not"
-        ```py
-        dc.filter(~(C("file.path").glob("*.jpg")))
-        ```
         """
         return self._evolve(query=self._query.filter(*args))
 
@@ -2398,72 +2299,3 @@
         Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
         """
         return self._evolve(query=self._query.chunk(index, total))
-
-    def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
-        """Returns a list of rows of values, optionally limited to the specified
-        columns.
-
-        Args:
-            *cols: Limit to the specified columns. By default, all columns are selected.
-
-        Returns:
-            list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
-
-        Example:
-            Getting all rows as a list:
-            ```py
-            rows = dc.to_list()
-            print(rows)
-            ```
-
-            Getting all rows with selected columns as a list:
-            ```py
-            name_size_pairs = dc.to_list("file.path", "file.size")
-            print(name_size_pairs)
-            ```
-
-            Getting a single column as a list:
-            ```py
-            files = dc.to_list("file.path")
-            print(files)  # Returns list of 1-tuples
-            ```
-        """
-        return list(self.to_iter(*cols))
-
-    def to_values(self, col: str) -> list[DataValue]:
-        """Returns a flat list of values from a single column.
-
-        Args:
-            col: The name of the column to extract values from.
-
-        Returns:
-            list[DataValue]: Returns a flat list of values from the specified column.
-
-        Example:
-            Getting all values from a single column:
-            ```py
-            file_paths = dc.to_values("file.path")
-            print(file_paths)  # Returns list of strings
-            ```
-
-            Getting all file sizes:
-            ```py
-            sizes = dc.to_values("file.size")
-            print(sizes)  # Returns list of integers
-            ```
-        """
-        return [row[0] for row in self.to_list(col)]
-
-    def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
-        """Make DataChain objects iterable.
-
-        Yields:
-            (tuple[DataValue, ...]): Yields tuples of all column values for each row.
-
-        Example:
-            ```py
-            for row in chain:
-                print(row)
-            ```
-        """
-        return self.to_iter()