datachain 0.7.2__py3-none-any.whl → 0.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +4 -3
- datachain/cli.py +108 -8
- datachain/data_storage/metastore.py +58 -13
- datachain/dataset.py +156 -17
- datachain/lib/dataset_info.py +7 -3
- datachain/remote/studio.py +40 -8
- datachain/studio.py +29 -0
- {datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/METADATA +2 -2
- {datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/RECORD +13 -13
- {datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/LICENSE +0 -0
- {datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/WHEEL +0 -0
- {datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -38,6 +38,7 @@ from datachain.dataset import (
    DATASET_PREFIX,
    QUERY_DATASET_PREFIX,
    DatasetDependency,
+   DatasetListRecord,
    DatasetRecord,
    DatasetStats,
    DatasetStatus,
@@ -72,7 +73,7 @@ if TYPE_CHECKING:
        AbstractMetastore,
        AbstractWarehouse,
    )
-   from datachain.dataset import
+   from datachain.dataset import DatasetListVersion
    from datachain.job import Job
    from datachain.lib.file import File
    from datachain.listing import Listing
@@ -1135,7 +1136,7 @@ class Catalog:

        return direct_dependencies

-   def ls_datasets(self, include_listing: bool = False) -> Iterator[
+   def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetListRecord]:
        datasets = self.metastore.list_datasets()
        for d in datasets:
            if not d.is_bucket_listing or include_listing:
@@ -1144,7 +1145,7 @@ class Catalog:
    def list_datasets_versions(
        self,
        include_listing: bool = False,
-   ) -> Iterator[tuple[
+   ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
        """Iterate over all dataset versions with related jobs."""
        datasets = list(self.ls_datasets(include_listing=include_listing))

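Illustrative sketch only (not part of this diff): iterating the lighter listing records that Catalog.ls_datasets() now yields. The get_catalog import path is an assumption; the DatasetListRecord fields used here (name, versions) are defined later in this diff in datachain/dataset.py.

from datachain.catalog import get_catalog  # assumed location of the catalog factory

catalog = get_catalog()
for ds in catalog.ls_datasets():
    # Each record carries only the listing-relevant fields plus its versions.
    print(ds.name, [v.version for v in ds.versions])
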
datachain/cli.py
CHANGED
@@ -18,7 +18,12 @@ from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyVa
from datachain.config import Config
from datachain.error import DataChainError
from datachain.lib.dc import DataChain
-from datachain.studio import
+from datachain.studio import (
+    edit_studio_dataset,
+    list_datasets,
+    process_studio_cli_args,
+    remove_studio_dataset,
+)
from datachain.telemetry import telemetry

if TYPE_CHECKING:
@@ -403,21 +408,44 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
    parse_edit_dataset.add_argument(
        "--new-name",
        action="store",
-       default="",
        help="Dataset new name",
    )
    parse_edit_dataset.add_argument(
        "--description",
        action="store",
-       default="",
        help="Dataset description",
    )
    parse_edit_dataset.add_argument(
        "--labels",
-       default=[],
        nargs="+",
        help="Dataset labels",
    )
+   parse_edit_dataset.add_argument(
+       "--studio",
+       action="store_true",
+       default=False,
+       help="Edit dataset from Studio",
+   )
+   parse_edit_dataset.add_argument(
+       "-L",
+       "--local",
+       action="store_true",
+       default=False,
+       help="Edit local dataset only",
+   )
+   parse_edit_dataset.add_argument(
+       "-a",
+       "--all",
+       action="store_true",
+       default=True,
+       help="Edit both datasets from studio and local",
+   )
+   parse_edit_dataset.add_argument(
+       "--team",
+       action="store",
+       default=None,
+       help="The team to edit a dataset. By default, it will use team from config.",
+   )

    datasets_parser = subp.add_parser(
        "datasets", parents=[parent_parser], description="List datasets"
@@ -466,6 +494,32 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
        action=BooleanOptionalAction,
        help="Force delete registered dataset with all of it's versions",
    )
+   rm_dataset_parser.add_argument(
+       "--studio",
+       action="store_true",
+       default=False,
+       help="Remove dataset from Studio",
+   )
+   rm_dataset_parser.add_argument(
+       "-L",
+       "--local",
+       action="store_true",
+       default=False,
+       help="Remove local datasets only",
+   )
+   rm_dataset_parser.add_argument(
+       "-a",
+       "--all",
+       action="store_true",
+       default=True,
+       help="Remove both local and studio",
+   )
+   rm_dataset_parser.add_argument(
+       "--team",
+       action="store",
+       default=None,
+       help="The team to delete a dataset. By default, it will use team from config.",
+   )

    dataset_stats_parser = subp.add_parser(
        "dataset-stats",
@@ -909,8 +963,40 @@ def rm_dataset(
    name: str,
    version: Optional[int] = None,
    force: Optional[bool] = False,
+   studio: bool = False,
+   local: bool = False,
+   all: bool = True,
+   team: Optional[str] = None,
+):
+   token = Config().read().get("studio", {}).get("token")
+   all, local, studio = _determine_flavors(studio, local, all, token)
+
+   if all or local:
+       catalog.remove_dataset(name, version=version, force=force)
+
+   if (all or studio) and token:
+       remove_studio_dataset(team, name, version, force)
+
+
+def edit_dataset(
+   catalog: "Catalog",
+   name: str,
+   new_name: Optional[str] = None,
+   description: Optional[str] = None,
+   labels: Optional[list[str]] = None,
+   studio: bool = False,
+   local: bool = False,
+   all: bool = True,
+   team: Optional[str] = None,
):
-
+   token = Config().read().get("studio", {}).get("token")
+   all, local, studio = _determine_flavors(studio, local, all, token)
+
+   if all or local:
+       catalog.edit_dataset(name, new_name, description, labels)
+
+   if (all or studio) and token:
+       edit_studio_dataset(team, name, new_name, description, labels)


def dataset_stats(
@@ -1127,11 +1213,16 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
            edatachain_file=args.edatachain_file,
        )
    elif args.command == "edit-dataset":
-
+       edit_dataset(
+           catalog,
            args.name,
-           description=args.description,
            new_name=args.new_name,
+           description=args.description,
            labels=args.labels,
+           studio=args.studio,
+           local=args.local,
+           all=args.all,
+           team=args.team,
        )
    elif args.command == "ls":
        ls(
@@ -1164,7 +1255,16 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
            schema=args.schema,
        )
    elif args.command == "rm-dataset":
-       rm_dataset(
+       rm_dataset(
+           catalog,
+           args.name,
+           version=args.version,
+           force=args.force,
+           studio=args.studio,
+           local=args.local,
+           all=args.all,
+           team=args.team,
+       )
    elif args.command == "dataset-stats":
        dataset_stats(
            catalog,
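Note: _determine_flavors() is called by the new rm_dataset() and edit_dataset() helpers above but is not included in this diff. A minimal sketch is shown below; its body is an assumption inferred only from the call sites (an explicit --studio or --local choice narrowing the default "all" mode), not the released implementation.

from typing import Optional


def _determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
    # Assumption: explicitly selecting --studio or --local turns off the "all"
    # default, so only the chosen target is acted on by the callers above.
    if studio or local:
        all = False
    # Token handling stays in the callers, which already gate Studio calls on
    # "(all or studio) and token".
    return all, local, studio
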
datachain/data_storage/metastore.py
CHANGED
@@ -27,6 +27,8 @@ from datachain.data_storage import JobQueryType, JobStatus
from datachain.data_storage.serializer import Serializable
from datachain.dataset import (
    DatasetDependency,
+   DatasetListRecord,
+   DatasetListVersion,
    DatasetRecord,
    DatasetStatus,
    DatasetVersion,
@@ -59,6 +61,8 @@ class AbstractMetastore(ABC, Serializable):

    schema: "schema.Schema"
    dataset_class: type[DatasetRecord] = DatasetRecord
+   dataset_list_class: type[DatasetListRecord] = DatasetListRecord
+   dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
    dependency_class: type[DatasetDependency] = DatasetDependency
    job_class: type[Job] = Job

@@ -166,11 +170,11 @@ class AbstractMetastore(ABC, Serializable):
        """

    @abstractmethod
-   def list_datasets(self) -> Iterator[
+   def list_datasets(self) -> Iterator[DatasetListRecord]:
        """Lists all datasets."""

    @abstractmethod
-   def list_datasets_by_prefix(self, prefix: str) -> Iterator["
+   def list_datasets_by_prefix(self, prefix: str) -> Iterator["DatasetListRecord"]:
        """Lists all datasets which names start with prefix."""

    @abstractmethod
@@ -348,6 +352,14 @@ class AbstractDBMetastore(AbstractMetastore):
            if c.name  # type: ignore [attr-defined]
        ]

+   @cached_property
+   def _dataset_list_fields(self) -> list[str]:
+       return [
+           c.name  # type: ignore [attr-defined]
+           for c in self._datasets_columns()
+           if c.name in self.dataset_list_class.__dataclass_fields__  # type: ignore [attr-defined]
+       ]
+
    @classmethod
    def _datasets_versions_columns(cls) -> list["SchemaItem"]:
        """Datasets versions table columns."""
@@ -390,6 +402,15 @@ class AbstractDBMetastore(AbstractMetastore):
            if c.name  # type: ignore [attr-defined]
        ]

+   @cached_property
+   def _dataset_list_version_fields(self) -> list[str]:
+       return [
+           c.name  # type: ignore [attr-defined]
+           for c in self._datasets_versions_columns()
+           if c.name  # type: ignore [attr-defined]
+           in self.dataset_list_version_class.__dataclass_fields__
+       ]
+
    @classmethod
    def _datasets_dependencies_columns(cls) -> list["SchemaItem"]:
        """Datasets dependencies table columns."""
@@ -664,14 +685,25 @@ class AbstractDBMetastore(AbstractMetastore):
            return None
        return reduce(lambda ds, version: ds.merge_versions(version), versions)

-   def
+   def _parse_list_dataset(self, rows) -> Optional[DatasetListRecord]:
+       versions = [self.dataset_list_class.parse(*r) for r in rows]
+       if not versions:
+           return None
+       return reduce(lambda ds, version: ds.merge_versions(version), versions)
+
+   def _parse_dataset_list(self, rows) -> Iterator["DatasetListRecord"]:
        # grouping rows by dataset id
        for _, g in groupby(rows, lambda r: r[0]):
-           dataset = self.
+           dataset = self._parse_list_dataset(list(g))
            if dataset:
                yield dataset

-   def
+   def _get_dataset_query(
+       self,
+       dataset_fields: list[str],
+       dataset_version_fields: list[str],
+       isouter: bool = True,
+   ):
        if not (
            self.db.has_table(self._datasets.name)
            and self.db.has_table(self._datasets_versions.name)
@@ -680,23 +712,36 @@ class AbstractDBMetastore(AbstractMetastore):

        d = self._datasets
        dv = self._datasets_versions
+
        query = self._datasets_select(
-           *(getattr(d.c, f) for f in
-           *(getattr(dv.c, f) for f in
+           *(getattr(d.c, f) for f in dataset_fields),
+           *(getattr(dv.c, f) for f in dataset_version_fields),
        )
-       j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=
+       j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
        return query.select_from(j)

-   def
+   def _base_dataset_query(self):
+       return self._get_dataset_query(
+           self._dataset_fields, self._dataset_version_fields
+       )
+
+   def _base_list_datasets_query(self):
+       return self._get_dataset_query(
+           self._dataset_list_fields, self._dataset_list_version_fields, isouter=False
+       )
+
+   def list_datasets(self) -> Iterator["DatasetListRecord"]:
        """Lists all datasets."""
-       yield from self.
+       yield from self._parse_dataset_list(
+           self.db.execute(self._base_list_datasets_query())
+       )

    def list_datasets_by_prefix(
        self, prefix: str, conn=None
-   ) -> Iterator["
-       query = self.
+   ) -> Iterator["DatasetListRecord"]:
+       query = self._base_list_datasets_query()
        query = query.where(self._datasets.c.name.startswith(prefix))
-       yield from self.
+       yield from self._parse_dataset_list(self.db.execute(query))

    def get_dataset(self, name: str, conn=None) -> DatasetRecord:
        """Gets a single dataset by name"""
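Toy illustration (invented names and rows, not datachain code) of two idioms the new metastore helpers above rely on: __dataclass_fields__ exposes a dataclass's field names, which is how _dataset_list_fields and _dataset_list_version_fields keep only the columns the slimmer list classes need, and itertools.groupby groups only consecutive rows, so _parse_dataset_list assumes the joined query returns rows ordered by dataset id.

from dataclasses import dataclass
from itertools import groupby


@dataclass
class SlimRecord:
    id: int
    name: str


# Filtering column names against the dataclass schema.
all_columns = ["id", "name", "description", "labels", "status"]
print([c for c in all_columns if c in SlimRecord.__dataclass_fields__])  # ['id', 'name']

# Grouping consecutive rows by the dataset id in the first column.
rows = [
    (1, "cats", 1),  # (dataset_id, name, version)
    (1, "cats", 2),
    (2, "dogs", 1),
]
for dataset_id, group in groupby(rows, key=lambda r: r[0]):
    print(dataset_id, [r[2] for r in group])
# 1 [1, 2]
# 2 [1]
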
datachain/dataset.py
CHANGED
@@ -2,6 +2,7 @@ import builtins
import json
from dataclasses import dataclass, fields
from datetime import datetime
+from functools import cached_property
from typing import (
    Any,
    NewType,
@@ -11,11 +12,15 @@ from typing import (
)
from urllib.parse import urlparse

+import orjson
+
from datachain.error import DatasetVersionNotFoundError
from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

T = TypeVar("T", bound="DatasetRecord")
+LT = TypeVar("LT", bound="DatasetListRecord")
V = TypeVar("V", bound="DatasetVersion")
+LV = TypeVar("LV", bound="DatasetListVersion")
DD = TypeVar("DD", bound="DatasetDependency")

DATASET_PREFIX = "ds://"
@@ -176,7 +181,7 @@ class DatasetVersion:
    schema: dict[str, Union[SQLType, type[SQLType]]]
    num_objects: Optional[int]
    size: Optional[int]
-
+   _preview_data: Optional[Union[str, list[dict]]]
    sources: str = ""
    query_script: str = ""
    job_id: Optional[str] = None
@@ -197,7 +202,7 @@ class DatasetVersion:
        script_output: str,
        num_objects: Optional[int],
        size: Optional[int],
-       preview: Optional[str],
+       preview: Optional[Union[str, list[dict]]],
        schema: dict[str, Union[SQLType, type[SQLType]]],
        sources: str = "",
        query_script: str = "",
@@ -218,7 +223,7 @@ class DatasetVersion:
        schema,
        num_objects,
        size,
-
+       preview,
        sources,
        query_script,
        job_id,
@@ -258,12 +263,73 @@ class DatasetVersion:
            for c_name, c_type in self.schema.items()
        }

+   @cached_property
+   def preview(self) -> Optional[list[dict]]:
+       if isinstance(self._preview_data, str):
+           return orjson.loads(self._preview_data)
+       return self._preview_data if self._preview_data else None
+
    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "DatasetVersion":
        kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
+       if not hasattr(kwargs, "_preview_data"):
+           kwargs["_preview_data"] = d.get("preview")
        return cls(**kwargs)


+@dataclass
+class DatasetListVersion:
+   id: int
+   uuid: str
+   dataset_id: int
+   version: int
+   status: int
+   created_at: datetime
+   finished_at: Optional[datetime]
+   error_message: str
+   error_stack: str
+   num_objects: Optional[int]
+   size: Optional[int]
+   query_script: str = ""
+   job_id: Optional[str] = None
+
+   @classmethod
+   def parse(
+       cls: type[LV],
+       id: int,
+       uuid: str,
+       dataset_id: int,
+       version: int,
+       status: int,
+       created_at: datetime,
+       finished_at: Optional[datetime],
+       error_message: str,
+       error_stack: str,
+       num_objects: Optional[int],
+       size: Optional[int],
+       query_script: str = "",
+       job_id: Optional[str] = None,
+   ):
+       return cls(
+           id,
+           uuid,
+           dataset_id,
+           version,
+           status,
+           created_at,
+           finished_at,
+           error_message,
+           error_stack,
+           num_objects,
+           size,
+           query_script,
+           job_id,
+       )
+
+   def __hash__(self):
+       return hash(f"{self.dataset_id}_{self.version}")
+
+
@dataclass
class DatasetRecord:
    id: int
@@ -447,20 +513,6 @@ class DatasetRecord:
        identifier = self.identifier(version)
        return f"{DATASET_PREFIX}{identifier}"

-   @property
-   def is_bucket_listing(self) -> bool:
-       """
-       For bucket listing we implicitly create underlying dataset to hold data. This
-       method is checking if this is one of those datasets.
-       """
-       from datachain.client import Client
-
-       # TODO refactor and maybe remove method in
-       # https://github.com/iterative/datachain/issues/318
-       return Client.is_data_source_uri(self.name) or self.name.startswith(
-           LISTING_PREFIX
-       )
-
    @property
    def versions_values(self) -> list[int]:
        """
@@ -499,5 +551,92 @@ class DatasetRecord:
        return cls(**kwargs, versions=versions)


+@dataclass
+class DatasetListRecord:
+   id: int
+   name: str
+   description: Optional[str]
+   labels: list[str]
+   versions: list[DatasetListVersion]
+   created_at: Optional[datetime] = None
+
+   @classmethod
+   def parse(  # noqa: PLR0913
+       cls: type[LT],
+       id: int,
+       name: str,
+       description: Optional[str],
+       labels: str,
+       created_at: datetime,
+       version_id: int,
+       version_uuid: str,
+       version_dataset_id: int,
+       version: int,
+       version_status: int,
+       version_created_at: datetime,
+       version_finished_at: Optional[datetime],
+       version_error_message: str,
+       version_error_stack: str,
+       version_num_objects: Optional[int],
+       version_size: Optional[int],
+       version_query_script: Optional[str],
+       version_job_id: Optional[str] = None,
+   ) -> "DatasetListRecord":
+       labels_lst: list[str] = json.loads(labels) if labels else []
+
+       dataset_version = DatasetListVersion.parse(
+           version_id,
+           version_uuid,
+           version_dataset_id,
+           version,
+           version_status,
+           version_created_at,
+           version_finished_at,
+           version_error_message,
+           version_error_stack,
+           version_num_objects,
+           version_size,
+           version_query_script,  # type: ignore[arg-type]
+           version_job_id,
+       )
+
+       return cls(
+           id,
+           name,
+           description,
+           labels_lst,
+           [dataset_version],
+           created_at,
+       )
+
+   def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
+       """Merge versions from another dataset"""
+       if other.id != self.id:
+           raise RuntimeError("Cannot merge versions of datasets with different ids")
+       if not other.versions:
+           # nothing to merge
+           return self
+       if not self.versions:
+           self.versions = []
+
+       self.versions = list(set(self.versions + other.versions))
+       self.versions.sort(key=lambda v: v.version)
+       return self
+
+   @property
+   def is_bucket_listing(self) -> bool:
+       """
+       For bucket listing we implicitly create underlying dataset to hold data. This
+       method is checking if this is one of those datasets.
+       """
+       from datachain.client import Client
+
+       # TODO refactor and maybe remove method in
+       # https://github.com/iterative/datachain/issues/318
+       return Client.is_data_source_uri(self.name) or self.name.startswith(
+           LISTING_PREFIX
+       )
+
+
class RowDict(dict):
    pass
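A hedged usage sketch of the new listing classes defined above: two rows for the same dataset, one per version, are parsed and then merged, mirroring what the metastore does when listing datasets. All values below are invented for illustration and follow the positional order of DatasetListRecord.parse() as shown in this diff.

from datetime import datetime, timezone

from datachain.dataset import DatasetListRecord

now = datetime.now(timezone.utc)


def row(version_id: int, version: int) -> DatasetListRecord:
    # One "joined" row for dataset id 1, carrying a single version.
    return DatasetListRecord.parse(
        1,                   # dataset id
        "cats",              # name
        None,                # description
        '["vision"]',        # labels, stored as JSON text
        now,                 # created_at
        version_id,          # version_id
        f"uuid-{version}",   # version_uuid
        1,                   # version_dataset_id
        version,             # version
        4,                   # version_status (illustrative value)
        now,                 # version_created_at
        now,                 # version_finished_at
        "",                  # version_error_message
        "",                  # version_error_stack
        100,                 # version_num_objects
        1024,                # version_size
        "",                  # version_query_script
    )


ds = row(10, 1).merge_versions(row(11, 2))
print([v.version for v in ds.versions])  # [1, 2]
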
datachain/lib/dataset_info.py
CHANGED
@@ -5,7 +5,11 @@ from uuid import uuid4

from pydantic import Field, field_validator

-from datachain.dataset import
+from datachain.dataset import (
+    DatasetListRecord,
+    DatasetListVersion,
+    DatasetStatus,
+)
from datachain.job import Job
from datachain.lib.data_model import DataModel
from datachain.utils import TIME_ZERO
@@ -57,8 +61,8 @@ class DatasetInfo(DataModel):
    @classmethod
    def from_models(
        cls,
-       dataset:
-       version:
+       dataset: DatasetListRecord,
+       version: DatasetListVersion,
        job: Optional[Job],
    ) -> "Self":
        return cls(
datachain/remote/studio.py
CHANGED
@@ -178,17 +178,9 @@ class StudioClient:
        data = {}

        if not ok:
-           logger.error(
-               "Got bad response from Studio, content is %s",
-               response.content.decode("utf-8"),
-           )
            if response.status_code == 403:
                message = f"Not authorized for the team {self.team}"
            else:
-               logger.error(
-                   "Got bad response from Studio, content is %s",
-                   response.content.decode("utf-8"),
-               )
                message = data.get("message", "")
        else:
            message = ""
@@ -230,6 +222,46 @@ class StudioClient:
    def ls_datasets(self) -> Response[LsData]:
        return self._send_request("datachain/ls-datasets", {})

+   def edit_dataset(
+       self,
+       name: str,
+       new_name: Optional[str] = None,
+       description: Optional[str] = None,
+       labels: Optional[list[str]] = None,
+   ) -> Response[DatasetInfoData]:
+       body = {
+           "dataset_name": name,
+       }
+
+       if new_name is not None:
+           body["new_name"] = new_name
+
+       if description is not None:
+           body["description"] = description
+
+       if labels is not None:
+           body["labels"] = labels  # type: ignore[assignment]
+
+       return self._send_request(
+           "datachain/edit-dataset",
+           body,
+       )
+
+   def rm_dataset(
+       self,
+       name: str,
+       version: Optional[int] = None,
+       force: Optional[bool] = False,
+   ) -> Response[DatasetInfoData]:
+       return self._send_request(
+           "datachain/rm-dataset",
+           {
+               "dataset_name": name,
+               "version": version,
+               "force": force,
+           },
+       )
+
    def dataset_info(self, name: str) -> Response[DatasetInfoData]:
        def _parse_dataset_info(dataset_info):
            _parse_dates(dataset_info, ["created_at", "finished_at"])
datachain/studio.py
CHANGED
@@ -130,6 +130,35 @@ def list_datasets(team: Optional[str] = None):
        yield (name, version)


+def edit_studio_dataset(
+   team_name: Optional[str],
+   name: str,
+   new_name: Optional[str] = None,
+   description: Optional[str] = None,
+   labels: Optional[list[str]] = None,
+):
+   client = StudioClient(team=team_name)
+   response = client.edit_dataset(name, new_name, description, labels)
+   if not response.ok:
+       raise_remote_error(response.message)
+
+   print(f"Dataset {name} updated")
+
+
+def remove_studio_dataset(
+   team_name: Optional[str],
+   name: str,
+   version: Optional[int] = None,
+   force: Optional[bool] = False,
+):
+   client = StudioClient(team=team_name)
+   response = client.rm_dataset(name, version, force)
+   if not response.ok:
+       raise_remote_error(response.message)
+
+   print(f"Dataset {name} removed")
+
+
def save_config(hostname, token):
    config = Config(ConfigLevel.GLOBAL)
    with config.edit() as conf:
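Hypothetical call-through of the helpers defined above (team and dataset names invented); this assumes a Studio token is already configured, since the functions go straight to StudioClient.

from datachain.studio import edit_studio_dataset, remove_studio_dataset

# Update metadata of a Studio dataset for a given team.
edit_studio_dataset("my-team", "my-dataset", description="Training split, v2")

# Remove a single version of a Studio dataset.
remove_studio_dataset("my-team", "old-dataset", version=1)
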
{datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: datachain
-Version: 0.7.2
+Version: 0.7.4
Summary: Wrangle unstructured AI data at scale
Author-email: Dmitry Petrov <support@dvc.org>
License: Apache-2.0
@@ -139,7 +139,7 @@ Key Features
============

📂 **Multimodal Dataset Versioning.**
-   - Version unstructured data without redundant data copies, by
+   - Version unstructured data without redundant data copies, by supporting
     references to S3, GCP, Azure, and local file systems.
   - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
   - Unite files and metadata together into persistent, versioned, columnar datasets.
{datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/RECORD
CHANGED
@@ -2,10 +2,10 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=1hiBClE1kbRyx0DK3uX5KMVa0ktbsG6TsFSNvoT2xxs,39399
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=PKHaEXeYOL2gE5BaEmc9rzPJdDg5O9X8_7FvSh_Q9Vg,18614
datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
datachain/listing.py,sha256=TgKg25ZWAP5enzKgw2_2GUPJVdnQUh6uySHB5SJrUY4,7773
@@ -14,11 +14,11 @@ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,11
datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/studio.py,sha256=
+datachain/studio.py,sha256=w41vgVPrBfJ02XQOaDccLbh-1uSAfq9cAgOmkYUqExE,4845
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=l_HAxor5i_F03VvbmMuwhi4INhsmNrqubyydPhXWo2Y,57980
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -33,7 +33,7 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=VPq-Dl8P-RbZQMzn6vB9aXBPKUWPTwP8ypkaVfE-7PU,37661
datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
datachain/data_storage/sqlite.py,sha256=nF-2B-n8YZh9cJlZv4XnbahAJDW6pvrp1h9L-140M7A,27538
@@ -52,7 +52,7 @@ datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
-datachain/lib/dataset_info.py,sha256=
+datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
datachain/lib/dc.py,sha256=J7liATKQBJCkeHanVLr0s3d1t5wxiiiSJuSbuxKBbLg,89527
datachain/lib/file.py,sha256=-XMkL6ED1sE7TMhWoMRTEuOXswZJw8X6AEmJDONFP74,15019
datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
@@ -96,7 +96,7 @@ datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
datachain/query/session.py,sha256=50SOdLNCjqHHKI-L4xGXyzTVxzMWfANqKqjeYre-c2k,5959
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=
+datachain/remote/studio.py,sha256=z9DTDqfdWKT8MC23wRDTOHvI8hc_OySS1Ce3F617gjA,9906
datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -117,9 +117,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.7.
-datachain-0.7.
-datachain-0.7.
-datachain-0.7.
-datachain-0.7.
-datachain-0.7.
+datachain-0.7.4.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.7.4.dist-info/METADATA,sha256=tr5ReyIE9nUfhvCwuGujJC1MmfO07A10N1sLfvOBcYQ,18006
+datachain-0.7.4.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.7.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.7.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.7.4.dist-info/RECORD,,

{datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/LICENSE
File without changes

{datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/WHEEL
File without changes

{datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/entry_points.txt
File without changes

{datachain-0.7.2.dist-info → datachain-0.7.4.dist-info}/top_level.txt
File without changes