datachain 0.20.3__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +0 -2
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +65 -180
- datachain/cli/__init__.py +7 -0
- datachain/cli/commands/datasets.py +28 -43
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +35 -1
- datachain/client/fsspec.py +3 -5
- datachain/client/hf.py +0 -10
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +37 -403
- datachain/data_storage/sqlite.py +7 -139
- datachain/data_storage/warehouse.py +7 -26
- datachain/dataset.py +12 -126
- datachain/delta.py +7 -11
- datachain/error.py +0 -36
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +0 -4
- datachain/lib/dc/datachain.py +92 -259
- datachain/lib/dc/datasets.py +49 -87
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +0 -1
- datachain/lib/dc/storage.py +40 -38
- datachain/lib/file.py +23 -77
- datachain/lib/listing.py +1 -3
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +0 -10
- datachain/lib/tar.py +2 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +20 -30
- datachain/listing.py +1 -3
- datachain/query/dataset.py +46 -71
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +26 -61
- datachain/studio.py +7 -23
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
- datachain/lib/namespaces.py +0 -71
- datachain/lib/projects.py +0 -86
- datachain/namespace.py +0 -65
- datachain/project.py +0 -78
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/func/func.py
CHANGED
datachain/lib/arrow.py
CHANGED
|
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
|
|
|
76
76
|
fs_path = file.path
|
|
77
77
|
fs = ReferenceFileSystem({fs_path: [cache_path]})
|
|
78
78
|
else:
|
|
79
|
-
fs, fs_path = file.get_fs(), file.
|
|
79
|
+
fs, fs_path = file.get_fs(), file.get_path()
|
|
80
80
|
|
|
81
81
|
kwargs = self.kwargs
|
|
82
82
|
if format := kwargs.get("format"):
|
|
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
|
|
|
160
160
|
kwargs["format"] = fix_pyarrow_format(format, parse_options)
|
|
161
161
|
|
|
162
162
|
schemas = []
|
|
163
|
-
for
|
|
164
|
-
ds = dataset(file.
|
|
163
|
+
for file in chain.collect("file"):
|
|
164
|
+
ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs) # type: ignore[union-attr]
|
|
165
165
|
schemas.append(ds.schema)
|
|
166
166
|
if not schemas:
|
|
167
167
|
raise ValueError(
|
datachain/lib/dataset_info.py
CHANGED
|
@@ -22,8 +22,6 @@ if TYPE_CHECKING:
|
|
|
22
22
|
|
|
23
23
|
class DatasetInfo(DataModel):
|
|
24
24
|
name: str
|
|
25
|
-
namespace: str
|
|
26
|
-
project: str
|
|
27
25
|
uuid: str = Field(default=str(uuid4()))
|
|
28
26
|
version: str = Field(default=DEFAULT_DATASET_VERSION)
|
|
29
27
|
status: int = Field(default=DatasetStatus.CREATED)
|
|
@@ -93,8 +91,6 @@ class DatasetInfo(DataModel):
|
|
|
93
91
|
return cls(
|
|
94
92
|
uuid=version.uuid,
|
|
95
93
|
name=dataset.name,
|
|
96
|
-
namespace=dataset.project.namespace.name,
|
|
97
|
-
project=dataset.project.name,
|
|
98
94
|
version=version.version,
|
|
99
95
|
status=version.status,
|
|
100
96
|
created_at=version.created_at,
|
datachain/lib/dc/datachain.py
CHANGED
|
@@ -24,9 +24,8 @@ from pydantic import BaseModel
|
|
|
24
24
|
from tqdm import tqdm
|
|
25
25
|
|
|
26
26
|
from datachain import semver
|
|
27
|
-
from datachain.dataset import DatasetRecord
|
|
27
|
+
from datachain.dataset import DatasetRecord
|
|
28
28
|
from datachain.delta import delta_disabled
|
|
29
|
-
from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
|
|
30
29
|
from datachain.func import literal
|
|
31
30
|
from datachain.func.base import Function
|
|
32
31
|
from datachain.func.func import Func
|
|
@@ -38,7 +37,6 @@ from datachain.lib.file import (
|
|
|
38
37
|
FileExporter,
|
|
39
38
|
)
|
|
40
39
|
from datachain.lib.file import ExportPlacement as FileExportPlacement
|
|
41
|
-
from datachain.lib.projects import get as get_project
|
|
42
40
|
from datachain.lib.settings import Settings
|
|
43
41
|
from datachain.lib.signal_schema import SignalSchema
|
|
44
42
|
from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
|
|
@@ -263,7 +261,7 @@ class DataChain:
|
|
|
263
261
|
"""Underlying dataset, if there is one."""
|
|
264
262
|
if not self.name:
|
|
265
263
|
return None
|
|
266
|
-
return self.session.catalog.get_dataset(self.name
|
|
264
|
+
return self.session.catalog.get_dataset(self.name)
|
|
267
265
|
|
|
268
266
|
def __or__(self, other: "Self") -> "Self":
|
|
269
267
|
"""Return `self.union(other)`."""
|
|
@@ -314,8 +312,6 @@ class DataChain:
|
|
|
314
312
|
min_task_size=None,
|
|
315
313
|
prefetch: Optional[int] = None,
|
|
316
314
|
sys: Optional[bool] = None,
|
|
317
|
-
namespace: Optional[str] = None,
|
|
318
|
-
project: Optional[str] = None,
|
|
319
315
|
) -> "Self":
|
|
320
316
|
"""Change settings for chain.
|
|
321
317
|
|
|
@@ -331,8 +327,6 @@ class DataChain:
|
|
|
331
327
|
prefetch: number of workers to use for downloading files in advance.
|
|
332
328
|
This is enabled by default and uses 2 workers.
|
|
333
329
|
To disable prefetching, set it to 0.
|
|
334
|
-
namespace: namespace name.
|
|
335
|
-
project: project name.
|
|
336
330
|
|
|
337
331
|
Example:
|
|
338
332
|
```py
|
|
@@ -346,11 +340,7 @@ class DataChain:
|
|
|
346
340
|
if sys is None:
|
|
347
341
|
sys = self._sys
|
|
348
342
|
settings = copy.copy(self._settings)
|
|
349
|
-
settings.add(
|
|
350
|
-
Settings(
|
|
351
|
-
cache, parallel, workers, min_task_size, prefetch, namespace, project
|
|
352
|
-
)
|
|
353
|
-
)
|
|
343
|
+
settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
|
|
354
344
|
return self._evolve(settings=settings, _sys=sys)
|
|
355
345
|
|
|
356
346
|
def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
|
|
@@ -440,10 +430,10 @@ class DataChain:
|
|
|
440
430
|
|
|
441
431
|
from datachain.lib.arrow import schema_to_output
|
|
442
432
|
|
|
443
|
-
json_values = self.limit(schema_sample_size).
|
|
433
|
+
json_values = list(self.limit(schema_sample_size).collect(col))
|
|
444
434
|
json_dicts = [
|
|
445
435
|
json.loads(json_value) if isinstance(json_value, str) else json_value
|
|
446
|
-
for
|
|
436
|
+
for json_value in json_values
|
|
447
437
|
]
|
|
448
438
|
|
|
449
439
|
if any(not isinstance(json_dict, dict) for json_dict in json_dicts):
|
|
@@ -500,22 +490,6 @@ class DataChain:
|
|
|
500
490
|
)
|
|
501
491
|
return listings(*args, **kwargs)
|
|
502
492
|
|
|
503
|
-
@property
|
|
504
|
-
def namespace_name(self) -> str:
|
|
505
|
-
"""Current namespace name in which the chain is running"""
|
|
506
|
-
return (
|
|
507
|
-
self._settings.namespace
|
|
508
|
-
or self.session.catalog.metastore.default_namespace_name
|
|
509
|
-
)
|
|
510
|
-
|
|
511
|
-
@property
|
|
512
|
-
def project_name(self) -> str:
|
|
513
|
-
"""Current project name in which the chain is running"""
|
|
514
|
-
return (
|
|
515
|
-
self._settings.project
|
|
516
|
-
or self.session.catalog.metastore.default_project_name
|
|
517
|
-
)
|
|
518
|
-
|
|
519
493
|
def persist(self) -> "Self":
|
|
520
494
|
"""Saves temporary chain that will be removed after the process ends.
|
|
521
495
|
Temporary datasets are useful for optimization, for example when we have
|
|
@@ -525,12 +499,7 @@ class DataChain:
|
|
|
525
499
|
It returns the chain itself.
|
|
526
500
|
"""
|
|
527
501
|
schema = self.signals_schema.clone_without_sys_signals().serialize()
|
|
528
|
-
|
|
529
|
-
self.project_name, self.namespace_name, session=self.session
|
|
530
|
-
)
|
|
531
|
-
return self._evolve(
|
|
532
|
-
query=self._query.save(project=project, feature_schema=schema)
|
|
533
|
-
)
|
|
502
|
+
return self._evolve(query=self._query.save(feature_schema=schema))
|
|
534
503
|
|
|
535
504
|
def save( # type: ignore[override]
|
|
536
505
|
self,
|
|
@@ -544,10 +513,7 @@ class DataChain:
|
|
|
544
513
|
"""Save to a Dataset. It returns the chain itself.
|
|
545
514
|
|
|
546
515
|
Parameters:
|
|
547
|
-
name : dataset name.
|
|
548
|
-
project, but it can also be just a regular dataset name in which
|
|
549
|
-
case we are taking namespace and project from settings, if they
|
|
550
|
-
are defined there, or default ones instead.
|
|
516
|
+
name : dataset name.
|
|
551
517
|
version : version of a dataset. If version is not specified and dataset
|
|
552
518
|
already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
|
|
553
519
|
description : description of a dataset.
|
|
@@ -569,29 +535,6 @@ class DataChain:
|
|
|
569
535
|
" patch"
|
|
570
536
|
)
|
|
571
537
|
|
|
572
|
-
namespace_name, project_name, name = parse_dataset_name(name)
|
|
573
|
-
|
|
574
|
-
namespace_name = (
|
|
575
|
-
namespace_name
|
|
576
|
-
or self._settings.namespace
|
|
577
|
-
or self.session.catalog.metastore.default_namespace_name
|
|
578
|
-
)
|
|
579
|
-
project_name = (
|
|
580
|
-
project_name
|
|
581
|
-
or self._settings.project
|
|
582
|
-
or self.session.catalog.metastore.default_project_name
|
|
583
|
-
)
|
|
584
|
-
|
|
585
|
-
try:
|
|
586
|
-
project = self.session.catalog.metastore.get_project(
|
|
587
|
-
project_name,
|
|
588
|
-
namespace_name,
|
|
589
|
-
create=self.session.catalog.metastore.project_allowed_to_create,
|
|
590
|
-
)
|
|
591
|
-
except ProjectNotFoundError as e:
|
|
592
|
-
# not being able to create it as creation is not allowed
|
|
593
|
-
raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
|
|
594
|
-
|
|
595
538
|
schema = self.signals_schema.clone_without_sys_signals().serialize()
|
|
596
539
|
|
|
597
540
|
# Handle retry and delta functionality
|
|
@@ -615,7 +558,6 @@ class DataChain:
|
|
|
615
558
|
query=result_ds._query.save(
|
|
616
559
|
name=name,
|
|
617
560
|
version=version,
|
|
618
|
-
project=project,
|
|
619
561
|
feature_schema=schema,
|
|
620
562
|
dependencies=dependencies,
|
|
621
563
|
**kwargs,
|
|
@@ -635,7 +577,6 @@ class DataChain:
|
|
|
635
577
|
query=self._query.save(
|
|
636
578
|
name=name,
|
|
637
579
|
version=version,
|
|
638
|
-
project=project,
|
|
639
580
|
description=description,
|
|
640
581
|
attrs=attrs,
|
|
641
582
|
feature_schema=schema,
|
|
@@ -902,7 +843,7 @@ class DataChain:
|
|
|
902
843
|
Order is not guaranteed when steps are added after an `order_by` statement.
|
|
903
844
|
I.e. when using `read_dataset` an `order_by` statement should be used if
|
|
904
845
|
the order of the records in the chain is important.
|
|
905
|
-
Using `order_by` directly before `limit`, `
|
|
846
|
+
Using `order_by` directly before `limit`, `collect` and `collect_flatten`
|
|
906
847
|
will give expected results.
|
|
907
848
|
See https://github.com/iterative/datachain/issues/477 for further details.
|
|
908
849
|
"""
|
|
@@ -1107,32 +1048,32 @@ class DataChain:
|
|
|
1107
1048
|
|
|
1108
1049
|
@property
|
|
1109
1050
|
def _effective_signals_schema(self) -> "SignalSchema":
|
|
1110
|
-
"""Effective schema used for user-facing API like
|
|
1051
|
+
"""Effective schema used for user-facing API like collect, to_pandas, etc."""
|
|
1111
1052
|
signals_schema = self.signals_schema
|
|
1112
1053
|
if not self._sys:
|
|
1113
1054
|
return signals_schema.clone_without_sys_signals()
|
|
1114
1055
|
return signals_schema
|
|
1115
1056
|
|
|
1116
1057
|
@overload
|
|
1117
|
-
def
|
|
1058
|
+
def collect_flatten(self) -> Iterator[tuple[Any, ...]]: ...
|
|
1118
1059
|
|
|
1119
1060
|
@overload
|
|
1120
|
-
def
|
|
1061
|
+
def collect_flatten(self, *, include_hidden: bool) -> Iterator[tuple[Any, ...]]: ...
|
|
1121
1062
|
|
|
1122
1063
|
@overload
|
|
1123
|
-
def
|
|
1064
|
+
def collect_flatten(
|
|
1124
1065
|
self, *, row_factory: Callable[[list[str], tuple[Any, ...]], _T]
|
|
1125
1066
|
) -> Iterator[_T]: ...
|
|
1126
1067
|
|
|
1127
1068
|
@overload
|
|
1128
|
-
def
|
|
1069
|
+
def collect_flatten(
|
|
1129
1070
|
self,
|
|
1130
1071
|
*,
|
|
1131
1072
|
row_factory: Callable[[list[str], tuple[Any, ...]], _T],
|
|
1132
1073
|
include_hidden: bool,
|
|
1133
1074
|
) -> Iterator[_T]: ...
|
|
1134
1075
|
|
|
1135
|
-
def
|
|
1076
|
+
def collect_flatten(self, *, row_factory=None, include_hidden: bool = True):
|
|
1136
1077
|
"""Yields flattened rows of values as a tuple.
|
|
1137
1078
|
|
|
1138
1079
|
Args:
|
|
@@ -1160,7 +1101,7 @@ class DataChain:
|
|
|
1160
1101
|
headers, _ = self._effective_signals_schema.get_headers_with_length()
|
|
1161
1102
|
column_names = [".".join(filter(None, header)) for header in headers]
|
|
1162
1103
|
|
|
1163
|
-
results_iter = self.
|
|
1104
|
+
results_iter = self.collect_flatten()
|
|
1164
1105
|
|
|
1165
1106
|
def column_chunks() -> Iterator[list[list[Any]]]:
|
|
1166
1107
|
for chunk_iter in batched_it(results_iter, chunk_size):
|
|
@@ -1193,9 +1134,9 @@ class DataChain:
|
|
|
1193
1134
|
|
|
1194
1135
|
def results(self, *, row_factory=None, include_hidden=True):
|
|
1195
1136
|
if row_factory is None:
|
|
1196
|
-
return list(self.
|
|
1137
|
+
return list(self.collect_flatten(include_hidden=include_hidden))
|
|
1197
1138
|
return list(
|
|
1198
|
-
self.
|
|
1139
|
+
self.collect_flatten(row_factory=row_factory, include_hidden=include_hidden)
|
|
1199
1140
|
)
|
|
1200
1141
|
|
|
1201
1142
|
def to_records(self) -> list[dict[str, Any]]:
|
|
@@ -1206,38 +1147,42 @@ class DataChain:
|
|
|
1206
1147
|
|
|
1207
1148
|
return self.results(row_factory=to_dict)
|
|
1208
1149
|
|
|
1209
|
-
|
|
1150
|
+
@overload
|
|
1151
|
+
def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
|
|
1152
|
+
|
|
1153
|
+
@overload
|
|
1154
|
+
def collect(self, col: str) -> Iterator[DataValue]: ...
|
|
1155
|
+
|
|
1156
|
+
@overload
|
|
1157
|
+
def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
|
|
1158
|
+
|
|
1159
|
+
def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]: # type: ignore[overload-overlap,misc]
|
|
1210
1160
|
"""Yields rows of values, optionally limited to the specified columns.
|
|
1211
1161
|
|
|
1212
1162
|
Args:
|
|
1213
1163
|
*cols: Limit to the specified columns. By default, all columns are selected.
|
|
1214
1164
|
|
|
1215
1165
|
Yields:
|
|
1216
|
-
(
|
|
1166
|
+
(DataType): Yields a single item if a column is selected.
|
|
1167
|
+
(tuple[DataType, ...]): Yields a tuple of items if multiple columns are
|
|
1168
|
+
selected.
|
|
1217
1169
|
|
|
1218
1170
|
Example:
|
|
1219
1171
|
Iterating over all rows:
|
|
1220
1172
|
```py
|
|
1221
|
-
for row in
|
|
1222
|
-
print(row)
|
|
1223
|
-
```
|
|
1224
|
-
|
|
1225
|
-
DataChain is iterable and can be used in a for loop directly which is
|
|
1226
|
-
equivalent to `ds.to_iter()`:
|
|
1227
|
-
```py
|
|
1228
|
-
for row in ds:
|
|
1173
|
+
for row in dc.collect():
|
|
1229
1174
|
print(row)
|
|
1230
1175
|
```
|
|
1231
1176
|
|
|
1232
1177
|
Iterating over all rows with selected columns:
|
|
1233
1178
|
```py
|
|
1234
|
-
for name, size in
|
|
1179
|
+
for name, size in dc.collect("file.path", "file.size"):
|
|
1235
1180
|
print(name, size)
|
|
1236
1181
|
```
|
|
1237
1182
|
|
|
1238
1183
|
Iterating over a single column:
|
|
1239
1184
|
```py
|
|
1240
|
-
for
|
|
1185
|
+
for file in dc.collect("file.path"):
|
|
1241
1186
|
print(file)
|
|
1242
1187
|
```
|
|
1243
1188
|
"""
|
|
@@ -1249,31 +1194,7 @@ class DataChain:
|
|
|
1249
1194
|
ret = signals_schema.row_to_features(
|
|
1250
1195
|
row, catalog=chain.session.catalog, cache=chain._settings.cache
|
|
1251
1196
|
)
|
|
1252
|
-
yield tuple(ret)
|
|
1253
|
-
|
|
1254
|
-
@overload
|
|
1255
|
-
def collect(self) -> Iterator[tuple[DataValue, ...]]: ...
|
|
1256
|
-
|
|
1257
|
-
@overload
|
|
1258
|
-
def collect(self, col: str) -> Iterator[DataValue]: ...
|
|
1259
|
-
|
|
1260
|
-
@overload
|
|
1261
|
-
def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
|
|
1262
|
-
|
|
1263
|
-
def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]: # type: ignore[overload-overlap,misc]
|
|
1264
|
-
"""
|
|
1265
|
-
Deprecated. Use `to_iter` method instead.
|
|
1266
|
-
"""
|
|
1267
|
-
warnings.warn(
|
|
1268
|
-
"Method `collect` is deprecated. Use `to_iter` method instead.",
|
|
1269
|
-
DeprecationWarning,
|
|
1270
|
-
stacklevel=2,
|
|
1271
|
-
)
|
|
1272
|
-
|
|
1273
|
-
if len(cols) == 1:
|
|
1274
|
-
yield from [item[0] for item in self.to_iter(*cols)]
|
|
1275
|
-
else:
|
|
1276
|
-
yield from self.to_iter(*cols)
|
|
1197
|
+
yield ret[0] if len(cols) == 1 else tuple(ret)
|
|
1277
1198
|
|
|
1278
1199
|
def to_pytorch(
|
|
1279
1200
|
self,
|
|
@@ -1508,7 +1429,7 @@ class DataChain:
|
|
|
1508
1429
|
)
|
|
1509
1430
|
return self._evolve(query=self._query.subtract(other._query, signals)) # type: ignore[arg-type]
|
|
1510
1431
|
|
|
1511
|
-
def
|
|
1432
|
+
def compare(
|
|
1512
1433
|
self,
|
|
1513
1434
|
other: "DataChain",
|
|
1514
1435
|
on: Union[str, Sequence[str]],
|
|
@@ -1521,33 +1442,41 @@ class DataChain:
|
|
|
1521
1442
|
same: bool = False,
|
|
1522
1443
|
status_col: Optional[str] = None,
|
|
1523
1444
|
) -> "DataChain":
|
|
1524
|
-
"""
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1445
|
+
"""Comparing two chains by identifying rows that are added, deleted, modified
|
|
1446
|
+
or same. Result is the new chain that has additional column with possible
|
|
1447
|
+
values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
|
|
1448
|
+
rows respectively. Note that if only one "status" is asked, by setting proper
|
|
1449
|
+
flags, this additional column is not created as it would have only one value
|
|
1450
|
+
for all rows. Beside additional diff column, new chain has schema of the chain
|
|
1451
|
+
on which method was called.
|
|
1529
1452
|
|
|
1530
1453
|
Parameters:
|
|
1531
|
-
other: Chain to
|
|
1532
|
-
on: Column
|
|
1533
|
-
|
|
1534
|
-
|
|
1535
|
-
|
|
1536
|
-
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
|
|
1542
|
-
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1546
|
-
|
|
1454
|
+
other: Chain to calculate diff from.
|
|
1455
|
+
on: Column or list of columns to match on. If both chains have the
|
|
1456
|
+
same columns then this column is enough for the match. Otherwise,
|
|
1457
|
+
`right_on` parameter has to specify the columns for the other chain.
|
|
1458
|
+
This value is used to find corresponding row in other dataset. If not
|
|
1459
|
+
found there, row is considered as added (or removed if vice versa), and
|
|
1460
|
+
if found then row can be either modified or same.
|
|
1461
|
+
right_on: Optional column or list of columns
|
|
1462
|
+
for the `other` to match.
|
|
1463
|
+
compare: Column or list of columns to compare on. If both chains have
|
|
1464
|
+
the same columns then this column is enough for the compare. Otherwise,
|
|
1465
|
+
`right_compare` parameter has to specify the columns for the other
|
|
1466
|
+
chain. This value is used to see if row is modified or same. If
|
|
1467
|
+
not set, all columns will be used for comparison
|
|
1468
|
+
right_compare: Optional column or list of columns
|
|
1469
|
+
for the `other` to compare to.
|
|
1470
|
+
added (bool): Whether to return added rows in resulting chain.
|
|
1471
|
+
deleted (bool): Whether to return deleted rows in resulting chain.
|
|
1472
|
+
modified (bool): Whether to return modified rows in resulting chain.
|
|
1473
|
+
same (bool): Whether to return unchanged rows in resulting chain.
|
|
1474
|
+
status_col (str): Name of the new column that is created in resulting chain
|
|
1475
|
+
representing diff status.
|
|
1547
1476
|
|
|
1548
1477
|
Example:
|
|
1549
1478
|
```py
|
|
1550
|
-
res = persons.
|
|
1479
|
+
res = persons.compare(
|
|
1551
1480
|
new_persons,
|
|
1552
1481
|
on=["id"],
|
|
1553
1482
|
right_on=["other_id"],
|
|
@@ -1576,7 +1505,7 @@ class DataChain:
|
|
|
1576
1505
|
status_col=status_col,
|
|
1577
1506
|
)
|
|
1578
1507
|
|
|
1579
|
-
def
|
|
1508
|
+
def diff(
|
|
1580
1509
|
self,
|
|
1581
1510
|
other: "DataChain",
|
|
1582
1511
|
on: str = "file",
|
|
@@ -1587,29 +1516,31 @@ class DataChain:
|
|
|
1587
1516
|
same: bool = False,
|
|
1588
1517
|
status_col: Optional[str] = None,
|
|
1589
1518
|
) -> "DataChain":
|
|
1590
|
-
"""
|
|
1591
|
-
|
|
1592
|
-
|
|
1593
|
-
|
|
1519
|
+
"""Similar to `.compare()`, which is more generic method to calculate difference
|
|
1520
|
+
between two chains. Unlike `.compare()`, this method works only on those chains
|
|
1521
|
+
that have `File` object, or it's derivatives, in it. File `source` and `path`
|
|
1522
|
+
are used for matching, and file `version` and `etag` for comparing, while in
|
|
1523
|
+
`.compare()` user needs to provide arbitrary columns for matching and comparing.
|
|
1594
1524
|
|
|
1595
1525
|
Parameters:
|
|
1596
|
-
other: Chain to
|
|
1597
|
-
on: File
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1526
|
+
other: Chain to calculate diff from.
|
|
1527
|
+
on: File signal to match on. If both chains have the
|
|
1528
|
+
same file signal then this column is enough for the match. Otherwise,
|
|
1529
|
+
`right_on` parameter has to specify the file signal for the other chain.
|
|
1530
|
+
This value is used to find corresponding row in other dataset. If not
|
|
1531
|
+
found there, row is considered as added (or removed if vice versa), and
|
|
1532
|
+
if found then row can be either modified or same.
|
|
1533
|
+
right_on: Optional file signal for the `other` to match.
|
|
1534
|
+
added (bool): Whether to return added rows in resulting chain.
|
|
1535
|
+
deleted (bool): Whether to return deleted rows in resulting chain.
|
|
1536
|
+
modified (bool): Whether to return modified rows in resulting chain.
|
|
1537
|
+
same (bool): Whether to return unchanged rows in resulting chain.
|
|
1538
|
+
status_col (str): Optional name of the new column that is created in
|
|
1539
|
+
resulting chain representing diff status.
|
|
1609
1540
|
|
|
1610
1541
|
Example:
|
|
1611
1542
|
```py
|
|
1612
|
-
diff = images.
|
|
1543
|
+
diff = images.diff(
|
|
1613
1544
|
new_images,
|
|
1614
1545
|
on="file",
|
|
1615
1546
|
right_on="other_file",
|
|
@@ -1634,7 +1565,7 @@ class DataChain:
|
|
|
1634
1565
|
compare_cols = get_file_signals(on, compare_file_signals)
|
|
1635
1566
|
right_compare_cols = get_file_signals(right_on, compare_file_signals)
|
|
1636
1567
|
|
|
1637
|
-
return self.
|
|
1568
|
+
return self.compare(
|
|
1638
1569
|
other,
|
|
1639
1570
|
on_cols,
|
|
1640
1571
|
right_on=right_on_cols,
|
|
@@ -2046,7 +1977,7 @@ class DataChain:
|
|
|
2046
1977
|
headers, _ = self._effective_signals_schema.get_headers_with_length()
|
|
2047
1978
|
column_names = [".".join(filter(None, header)) for header in headers]
|
|
2048
1979
|
|
|
2049
|
-
results_iter = self.
|
|
1980
|
+
results_iter = self.collect_flatten()
|
|
2050
1981
|
|
|
2051
1982
|
with opener(path, "w", newline="") as f:
|
|
2052
1983
|
writer = csv.writer(f, delimiter=delimiter, **kwargs)
|
|
@@ -2098,7 +2029,7 @@ class DataChain:
|
|
|
2098
2029
|
if include_outer_list:
|
|
2099
2030
|
# This makes the file JSON instead of JSON lines.
|
|
2100
2031
|
f.write(b"[\n")
|
|
2101
|
-
for row in self.
|
|
2032
|
+
for row in self.collect_flatten():
|
|
2102
2033
|
if not is_first:
|
|
2103
2034
|
if include_outer_list:
|
|
2104
2035
|
# This makes the file JSON instead of JSON lines.
|
|
@@ -2263,7 +2194,7 @@ class DataChain:
|
|
|
2263
2194
|
max_threads=num_threads or 1,
|
|
2264
2195
|
client_config=client_config,
|
|
2265
2196
|
)
|
|
2266
|
-
file_exporter.run(self.
|
|
2197
|
+
file_exporter.run(self.collect(signal), progress_bar)
|
|
2267
2198
|
|
|
2268
2199
|
def shuffle(self) -> "Self":
|
|
2269
2200
|
"""Shuffle the rows of the chain deterministically."""
|
|
@@ -2308,45 +2239,16 @@ class DataChain:
|
|
|
2308
2239
|
|
|
2309
2240
|
Combining filters with "or"
|
|
2310
2241
|
```py
|
|
2311
|
-
dc.filter(
|
|
2312
|
-
C("file.path").glob("cat*") |
|
|
2313
|
-
C("file.path").glob("dog*")
|
|
2314
|
-
)
|
|
2315
|
-
```
|
|
2316
|
-
|
|
2317
|
-
```py
|
|
2318
|
-
dc.filter(dc.func.or_(
|
|
2319
|
-
C("file.path").glob("cat*"),
|
|
2320
|
-
C("file.path").glob("dog*")
|
|
2321
|
-
))
|
|
2242
|
+
dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*))
|
|
2322
2243
|
```
|
|
2323
2244
|
|
|
2324
2245
|
Combining filters with "and"
|
|
2325
2246
|
```py
|
|
2326
2247
|
dc.filter(
|
|
2327
|
-
C("file.path").glob("*.jpg
|
|
2328
|
-
string.length(C("file.path")) > 5
|
|
2329
|
-
)
|
|
2330
|
-
```
|
|
2331
|
-
|
|
2332
|
-
```py
|
|
2333
|
-
dc.filter(
|
|
2334
|
-
C("file.path").glob("*.jpg") &
|
|
2248
|
+
C("file.path").glob("*.jpg) &
|
|
2335
2249
|
(string.length(C("file.path")) > 5)
|
|
2336
2250
|
)
|
|
2337
2251
|
```
|
|
2338
|
-
|
|
2339
|
-
```py
|
|
2340
|
-
dc.filter(dc.func.and_(
|
|
2341
|
-
C("file.path").glob("*.jpg"),
|
|
2342
|
-
string.length(C("file.path")) > 5
|
|
2343
|
-
))
|
|
2344
|
-
```
|
|
2345
|
-
|
|
2346
|
-
Combining filters with "not"
|
|
2347
|
-
```py
|
|
2348
|
-
dc.filter(~(C("file.path").glob("*.jpg")))
|
|
2349
|
-
```
|
|
2350
2252
|
"""
|
|
2351
2253
|
return self._evolve(query=self._query.filter(*args))
|
|
2352
2254
|
|
|
@@ -2397,72 +2299,3 @@ class DataChain:
|
|
|
2397
2299
|
Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
|
|
2398
2300
|
"""
|
|
2399
2301
|
return self._evolve(query=self._query.chunk(index, total))
|
|
2400
|
-
|
|
2401
|
-
def to_list(self, *cols: str) -> list[tuple[DataValue, ...]]:
|
|
2402
|
-
"""Returns a list of rows of values, optionally limited to the specified
|
|
2403
|
-
columns.
|
|
2404
|
-
|
|
2405
|
-
Args:
|
|
2406
|
-
*cols: Limit to the specified columns. By default, all columns are selected.
|
|
2407
|
-
|
|
2408
|
-
Returns:
|
|
2409
|
-
list[tuple[DataType, ...]]: Returns a list of tuples of items for each row.
|
|
2410
|
-
|
|
2411
|
-
Example:
|
|
2412
|
-
Getting all rows as a list:
|
|
2413
|
-
```py
|
|
2414
|
-
rows = dc.to_list()
|
|
2415
|
-
print(rows)
|
|
2416
|
-
```
|
|
2417
|
-
|
|
2418
|
-
Getting all rows with selected columns as a list:
|
|
2419
|
-
```py
|
|
2420
|
-
name_size_pairs = dc.to_list("file.path", "file.size")
|
|
2421
|
-
print(name_size_pairs)
|
|
2422
|
-
```
|
|
2423
|
-
|
|
2424
|
-
Getting a single column as a list:
|
|
2425
|
-
```py
|
|
2426
|
-
files = dc.to_list("file.path")
|
|
2427
|
-
print(files) # Returns list of 1-tuples
|
|
2428
|
-
```
|
|
2429
|
-
"""
|
|
2430
|
-
return list(self.to_iter(*cols))
|
|
2431
|
-
|
|
2432
|
-
def to_values(self, col: str) -> list[DataValue]:
|
|
2433
|
-
"""Returns a flat list of values from a single column.
|
|
2434
|
-
|
|
2435
|
-
Args:
|
|
2436
|
-
col: The name of the column to extract values from.
|
|
2437
|
-
|
|
2438
|
-
Returns:
|
|
2439
|
-
list[DataValue]: Returns a flat list of values from the specified column.
|
|
2440
|
-
|
|
2441
|
-
Example:
|
|
2442
|
-
Getting all values from a single column:
|
|
2443
|
-
```py
|
|
2444
|
-
file_paths = dc.to_values("file.path")
|
|
2445
|
-
print(file_paths) # Returns list of strings
|
|
2446
|
-
```
|
|
2447
|
-
|
|
2448
|
-
Getting all file sizes:
|
|
2449
|
-
```py
|
|
2450
|
-
sizes = dc.to_values("file.size")
|
|
2451
|
-
print(sizes) # Returns list of integers
|
|
2452
|
-
```
|
|
2453
|
-
"""
|
|
2454
|
-
return [row[0] for row in self.to_list(col)]
|
|
2455
|
-
|
|
2456
|
-
def __iter__(self) -> Iterator[tuple[DataValue, ...]]:
|
|
2457
|
-
"""Make DataChain objects iterable.
|
|
2458
|
-
|
|
2459
|
-
Yields:
|
|
2460
|
-
(tuple[DataValue, ...]): Yields tuples of all column values for each row.
|
|
2461
|
-
|
|
2462
|
-
Example:
|
|
2463
|
-
```py
|
|
2464
|
-
for row in chain:
|
|
2465
|
-
print(row)
|
|
2466
|
-
```
|
|
2467
|
-
"""
|
|
2468
|
-
return self.to_iter()
|