datachain 0.20.3__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of datachain might be problematic.
- datachain/__init__.py +0 -2
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +65 -180
- datachain/cli/__init__.py +7 -0
- datachain/cli/commands/datasets.py +28 -43
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +35 -1
- datachain/client/fsspec.py +3 -5
- datachain/client/hf.py +0 -10
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +37 -403
- datachain/data_storage/sqlite.py +7 -139
- datachain/data_storage/warehouse.py +7 -26
- datachain/dataset.py +12 -126
- datachain/delta.py +7 -11
- datachain/error.py +0 -36
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +0 -4
- datachain/lib/dc/datachain.py +92 -259
- datachain/lib/dc/datasets.py +49 -87
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +0 -1
- datachain/lib/dc/storage.py +40 -38
- datachain/lib/file.py +23 -77
- datachain/lib/listing.py +1 -3
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +0 -10
- datachain/lib/tar.py +2 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +20 -30
- datachain/listing.py +1 -3
- datachain/query/dataset.py +46 -71
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +26 -61
- datachain/studio.py +7 -23
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
- datachain/lib/namespaces.py +0 -71
- datachain/lib/projects.py +0 -86
- datachain/namespace.py +0 -65
- datachain/project.py +0 -78
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -32,7 +32,6 @@ from datachain.lib.file import (
     VideoFrame,
 )
 from datachain.lib.model_store import ModelStore
-from datachain.lib.projects import create as create_project
 from datachain.lib.udf import Aggregator, Generator, Mapper
 from datachain.lib.utils import AbstractUDF, DataChainError
 from datachain.query import metrics, param
@@ -63,7 +62,6 @@ __all__ = [
     "VideoFile",
     "VideoFragment",
     "VideoFrame",
-    "create_project",
     "datasets",
     "delete_dataset",
     "is_chain_type",
datachain/cache.py
CHANGED
@@ -39,7 +39,7 @@ def temporary_cache(
         cache.destroy()


-class Cache:
+class Cache:
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
             LocalFileSystem(),
@@ -76,9 +76,9 @@ class Cache: # noqa: PLW1641
     async def download(
         self, file: "File", client: "Client", callback: Optional[Callback] = None
     ) -> None:
+        from_path = f"{file.source}/{file.path}"
         from dvc_objects.fs.utils import tmp_fname

-        from_path = file.get_uri()
         odb_fs = self.odb.fs
         tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
         size = file.size
datachain/catalog/catalog.py
CHANGED
@@ -41,7 +41,6 @@ from datachain.dataset import (
     DatasetStatus,
     StorageURI,
     create_dataset_uri,
-    parse_dataset_name,
     parse_dataset_uri,
 )
 from datachain.error import (
@@ -49,14 +48,12 @@ from datachain.error import (
     DatasetInvalidVersionError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
-    ProjectNotFoundError,
     QueryScriptCancelError,
     QueryScriptRunError,
 )
 from datachain.lib.listing import get_listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
-from datachain.project import Project
 from datachain.sql.types import DateTime, SQLType
 from datachain.utils import DataChainDir

@@ -158,9 +155,9 @@ class DatasetRowsFetcher(NodesThreadPool):
         self,
         metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
-
+        remote_ds_name: str,
         remote_ds_version: str,
-
+        local_ds_name: str,
         local_ds_version: str,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
@@ -172,9 +169,9 @@ class DatasetRowsFetcher(NodesThreadPool):
         self._check_dependencies()
         self.metastore = metastore
         self.warehouse = warehouse
-        self.
+        self.remote_ds_name = remote_ds_name
         self.remote_ds_version = remote_ds_version
-        self.
+        self.local_ds_name = local_ds_name
         self.local_ds_version = local_ds_version
         self.schema = schema
         self.last_status_check: Optional[float] = None
@@ -210,7 +207,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         Checks are done every PULL_DATASET_CHECK_STATUS_INTERVAL seconds
         """
         export_status_response = self.studio_client.dataset_export_status(
-            self.
+            self.remote_ds_name, self.remote_ds_version
         )
         if not export_status_response.ok:
             raise DataChainError(export_status_response.message)
@@ -257,7 +254,9 @@ class DatasetRowsFetcher(NodesThreadPool):
         import pandas as pd

         # metastore and warehouse are not thread safe
-        with self.warehouse.clone() as warehouse:
+        with self.metastore.clone() as metastore, self.warehouse.clone() as warehouse:
+            local_ds = metastore.get_dataset(self.local_ds_name)
+
             urls = list(urls)

             for url in urls:
@@ -270,7 +269,7 @@ class DatasetRowsFetcher(NodesThreadPool):
                 df = self.fix_columns(df)

                 inserted = warehouse.insert_dataset_rows(
-                    df,
+                    df, local_ds, self.local_ds_version
                 )
                 self.increase_counter(inserted)  # type: ignore [arg-type]
                 # sometimes progress bar doesn't get updated so manually updating it
@@ -676,11 +675,7 @@ class Catalog:
         listing: Optional[Listing]
         if src.startswith("ds://"):
             ds_name, ds_version = parse_dataset_uri(src)
-
-            assert ds_namespace
-            assert ds_project
-            project = self.metastore.get_project(ds_project, ds_namespace)
-            dataset = self.get_dataset(ds_name, project)
+            dataset = self.get_dataset(ds_name)
             if not ds_version:
                 ds_version = dataset.latest_version
             dataset_sources = self.warehouse.get_dataset_sources(
@@ -700,11 +695,7 @@ class Catalog:
                 dataset_name=dataset_name,
             )
             rows = DatasetQuery(
-                name=dataset.name,
-                namespace_name=dataset.project.namespace.name,
-                project_name=dataset.project.name,
-                version=ds_version,
-                catalog=self,
+                name=dataset.name, version=ds_version, catalog=self
             ).to_db_records()
             indexed_sources.append(
                 (
@@ -778,7 +769,6 @@ class Catalog:
     def create_dataset(
         self,
         name: str,
-        project: Optional[Project] = None,
         version: Optional[str] = None,
         *,
         columns: Sequence[Column],
@@ -798,7 +788,6 @@ class Catalog:
         If version is None, then next unused version is created.
         If version is given, then it must be an unused version.
         """
-        DatasetRecord.validate_name(name)
         assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
         if not listing and Client.is_data_source_uri(name):
             raise RuntimeError(
@@ -806,7 +795,7 @@ class Catalog:
             )
         default_version = DEFAULT_DATASET_VERSION
         try:
-            dataset = self.get_dataset(name
+            dataset = self.get_dataset(name)
             default_version = dataset.next_version_patch
             if update_version == "major":
                 default_version = dataset.next_version_major
@@ -831,7 +820,6 @@ class Catalog:
         }
         dataset = self.metastore.create_dataset(
             name,
-            project.id if project else None,
             feature_schema=feature_schema,
             query_script=query_script,
             schema=schema,
@@ -904,7 +892,7 @@ class Catalog:
         )

         if create_rows_table:
-            table_name = self.warehouse.dataset_table_name(dataset, version)
+            table_name = self.warehouse.dataset_table_name(dataset.name, version)
             self.warehouse.create_dataset_rows_table(table_name, columns=columns)
             self.update_dataset_version_with_warehouse_info(dataset, version)

@@ -935,13 +923,7 @@ class Catalog:

         if not dataset_version.preview:
             values["preview"] = (
-                DatasetQuery(
-                    name=dataset.name,
-                    namespace_name=dataset.project.namespace.name,
-                    project_name=dataset.project.name,
-                    version=version,
-                    catalog=self,
-                )
+                DatasetQuery(name=dataset.name, version=version, catalog=self)
                 .limit(20)
                 .to_db_records()
             )
@@ -967,7 +949,6 @@ class Catalog:
         # updating name must result in updating dataset table names as well
         for version in [v.version for v in dataset.versions]:
             self.warehouse.rename_dataset_table(
-                dataset,
                 old_name,
                 new_name,
                 old_version=version,
@@ -1005,7 +986,6 @@ class Catalog:
         self,
         name: str,
         sources: list[str],
-        project: Optional[Project] = None,
         client_config=None,
         recursive=False,
     ) -> DatasetRecord:
@@ -1014,8 +994,6 @@ class Catalog:

         from datachain import read_dataset, read_storage

-        project = project or self.metastore.default_project
-
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
@@ -1028,11 +1006,10 @@ class Catalog:
         # create union of all dataset queries created from sources
         dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
         try:
-            dc = dc.settings(project=project.name, namespace=project.namespace.name)
             dc.save(name)
         except Exception as e:  # noqa: BLE001
             try:
-                ds = self.get_dataset(name
+                ds = self.get_dataset(name)
                 self.metastore.update_dataset_status(
                     ds,
                     DatasetStatus.FAILED,
@@ -1049,7 +1026,7 @@ class Catalog:
             except DatasetNotFoundError:
                 raise e from None

-        ds = self.get_dataset(name
+        ds = self.get_dataset(name)

         self.update_dataset_version_with_warehouse_info(
             ds,
@@ -1057,67 +1034,49 @@ class Catalog:
             sources="\n".join(sources),
         )

-        return self.get_dataset(name
+        return self.get_dataset(name)

-    def get_dataset(
-        self
-    ) -> DatasetRecord:
-        from datachain.lib.listing import is_listing_dataset
-
-        if is_listing_dataset(name):
-            project = self.metastore.listing_project
-        return self.metastore.get_dataset(name, project.id if project else None)
+    def get_dataset(self, name: str) -> DatasetRecord:
+        return self.metastore.get_dataset(name)

     def get_dataset_with_remote_fallback(
-        self,
-        name: str,
-        namespace_name: str,
-        project_name: str,
-        version: Optional[str] = None,
+        self, name: str, version: Optional[str] = None
     ) -> DatasetRecord:
         try:
-
-            ds = self.get_dataset(name, project)
+            ds = self.get_dataset(name)
             if version and not ds.has_version(version):
                 raise DatasetVersionNotFoundError(
                     f"Dataset {name} does not have version {version}"
                 )
             return ds

-        except (
-            ProjectNotFoundError,
-            DatasetNotFoundError,
-            DatasetVersionNotFoundError,
-        ):
+        except (DatasetNotFoundError, DatasetVersionNotFoundError):
             print("Dataset not found in local catalog, trying to get from studio")
-
-
-
+
+            remote_ds_uri = f"{DATASET_PREFIX}{name}"
+            if version:
+                remote_ds_uri += f"@v{version}"

             self.pull_dataset(
                 remote_ds_uri=remote_ds_uri,
                 local_ds_name=name,
                 local_ds_version=version,
             )
-            return self.get_dataset(
-                name, self.metastore.get_project(project_name, namespace_name)
-            )
+            return self.get_dataset(name)

     def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
         """Returns dataset that contains version with specific uuid"""
         for dataset in self.ls_datasets():
             if dataset.has_version_with_uuid(uuid):
-                return self.get_dataset(dataset.name
+                return self.get_dataset(dataset.name)
         raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")

-    def get_remote_dataset(
-        self, namespace: str, project: str, name: str
-    ) -> DatasetRecord:
+    def get_remote_dataset(self, name: str) -> DatasetRecord:
         from datachain.remote.studio import StudioClient

         studio_client = StudioClient()

-        info_response = studio_client.dataset_info(
+        info_response = studio_client.dataset_info(name)
         if not info_response.ok:
             raise DataChainError(info_response.message)

@@ -1126,9 +1085,9 @@ class Catalog:
         return DatasetRecord.from_dict(dataset_info)

     def get_dataset_dependencies(
-        self, name: str, version: str,
+        self, name: str, version: str, indirect=False
     ) -> list[Optional[DatasetDependency]]:
-        dataset = self.get_dataset(name
+        dataset = self.get_dataset(name)

         direct_dependencies = self.metastore.get_direct_dataset_dependencies(
             dataset, version
@@ -1142,10 +1101,9 @@ class Catalog:
                 # dependency has been removed
                 continue
             if d.is_dataset:
-                project = self.metastore.get_project(d.project, d.namespace)
                 # only datasets can have dependencies
                 d.dependencies = self.get_dataset_dependencies(
-                    d.name, d.version,
+                    d.name, d.version, indirect=indirect
                 )

         return direct_dependencies
@@ -1155,12 +1113,9 @@ class Catalog:
         prefix: Optional[str] = None,
         include_listing: bool = False,
         studio: bool = False,
-        project: Optional[Project] = None,
     ) -> Iterator[DatasetListRecord]:
         from datachain.remote.studio import StudioClient

-        project_id = project.id if project else None
-
         if studio:
             client = StudioClient()
             response = client.ls_datasets(prefix=prefix)
@@ -1175,11 +1130,9 @@ class Catalog:
                 if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
             )
         elif prefix:
-            datasets = self.metastore.list_datasets_by_prefix(
-                prefix, project_id=project_id
-            )
+            datasets = self.metastore.list_datasets_by_prefix(prefix)
         else:
-            datasets = self.metastore.list_datasets(
+            datasets = self.metastore.list_datasets()

         for d in datasets:
             if not d.is_bucket_listing or include_listing:
@@ -1191,15 +1144,11 @@ class Catalog:
         include_listing: bool = False,
         with_job: bool = True,
         studio: bool = False,
-        project: Optional[Project] = None,
     ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
         datasets = list(
             self.ls_datasets(
-                prefix=prefix,
-                include_listing=include_listing,
-                studio=studio,
-                project=project,
+                prefix=prefix, include_listing=include_listing, studio=studio
             )
         )
@@ -1235,7 +1184,6 @@ class Catalog:
             prefix=prefix,
             include_listing=True,
             with_job=False,
-            project=self.metastore.listing_project,
         )

         return [
@@ -1245,21 +1193,13 @@ class Catalog:
         ]

     def ls_dataset_rows(
-        self,
-        dataset: DatasetRecord,
-        version: str,
-        offset=None,
-        limit=None,
+        self, name: str, version: str, offset=None, limit=None
     ) -> list[dict]:
         from datachain.query.dataset import DatasetQuery

-
-
-
-            project_name=dataset.project.name,
-            version=version,
-            catalog=self,
-        )
+        dataset = self.get_dataset(name)
+
+        q = DatasetQuery(name=dataset.name, version=version, catalog=self)
         if limit:
             q = q.limit(limit)
         if offset:
@@ -1292,29 +1232,35 @@ class Catalog:
         bucket_uri: str,
         name: str,
         version: str,
-        project: Optional[Project] = None,
         client_config=None,
     ) -> list[str]:
-        dataset = self.get_dataset(name
+        dataset = self.get_dataset(name)

         return self.warehouse.export_dataset_table(
             bucket_uri, dataset, version, client_config
         )

-    def dataset_table_export_file_names(
-
-    ) -> list[str]:
-        dataset = self.get_dataset(name, project)
+    def dataset_table_export_file_names(self, name: str, version: str) -> list[str]:
+        dataset = self.get_dataset(name)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

     def remove_dataset(
         self,
         name: str,
-        project: Optional[Project] = None,
         version: Optional[str] = None,
         force: Optional[bool] = False,
+        studio: Optional[bool] = False,
     ):
-
+        from datachain.remote.studio import StudioClient
+
+        if studio:
+            client = StudioClient()
+            response = client.rm_dataset(name, version=version, force=force)
+            if not response.ok:
+                raise DataChainError(response.message)
+            return
+
+        dataset = self.get_dataset(name)
         if not version and not force:
             raise ValueError(f"Missing dataset version from input for dataset {name}")
         if version and not dataset.has_version(version):
@@ -1336,21 +1282,19 @@ class Catalog:
     def edit_dataset(
         self,
         name: str,
-        project: Optional[Project] = None,
         new_name: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
     ) -> DatasetRecord:
         update_data = {}
         if new_name:
-            DatasetRecord.validate_name(new_name)
             update_data["name"] = new_name
         if description is not None:
             update_data["description"] = description
         if attrs is not None:
             update_data["attrs"] = attrs  # type: ignore[assignment]

-        dataset = self.get_dataset(name
+        dataset = self.get_dataset(name)
         return self.update_dataset(dataset, **update_data)

     def ls(
@@ -1407,29 +1351,7 @@ class Catalog:
         except Exception as e:
             raise DataChainError("Error when parsing dataset uri") from e

-
-            remote_ds_name
-        )
-        if not remote_namespace or not remote_project:
-            raise DataChainError(
-                f"Invalid fully qualified dataset name {remote_ds_name}, namespace"
-                f" or project missing"
-            )
-
-        if local_ds_name:
-            local_namespace, local_project, local_ds_name = parse_dataset_name(
-                local_ds_name
-            )
-            if local_namespace and local_namespace != remote_namespace:
-                raise DataChainError(
-                    "Local namespace must be the same to remote namespace"
-                )
-            if local_project and local_project != remote_project:
-                raise DataChainError("Local project must be the same to remote project")
-
-        remote_ds = self.get_remote_dataset(
-            remote_namespace, remote_project, remote_ds_name
-        )
+        remote_ds = self.get_remote_dataset(remote_ds_name)

         try:
             # if version is not specified in uri, take the latest one
@@ -1437,12 +1359,7 @@ class Catalog:
                 version = remote_ds.latest_version
                 print(f"Version not specified, pulling the latest one (v{version})")
                 # updating dataset uri with latest version
-                remote_ds_uri = create_dataset_uri(
-                    remote_ds.name,
-                    remote_ds.project.namespace.name,
-                    remote_ds.project.name,
-                    version,
-                )
+                remote_ds_uri = create_dataset_uri(remote_ds_name, version)
             remote_ds_version = remote_ds.get_version(version)
         except (DatasetVersionNotFoundError, StopIteration) as exc:
             raise DataChainError(
@@ -1451,13 +1368,7 @@ class Catalog:

         local_ds_name = local_ds_name or remote_ds.name
         local_ds_version = local_ds_version or remote_ds_version.version
-
-        local_ds_uri = create_dataset_uri(
-            local_ds_name,
-            remote_ds.project.namespace.name,
-            remote_ds.project.name,
-            local_ds_version,
-        )
+        local_ds_uri = create_dataset_uri(local_ds_name, local_ds_version)

         try:
             # try to find existing dataset with the same uuid to avoid pulling again
@@ -1466,10 +1377,7 @@ class Catalog:
                 remote_ds_version.uuid
             )
             existing_ds_uri = create_dataset_uri(
-                existing_ds.name,
-                existing_ds.project.namespace.name,
-                existing_ds.project.name,
-                existing_ds_version.version,
+                existing_ds.name, existing_ds_version.version
             )
             if existing_ds_uri == remote_ds_uri:
                 print(f"Local copy of dataset {remote_ds_uri} already present")
@@ -1483,26 +1391,8 @@ class Catalog:
         except DatasetNotFoundError:
             pass

-        # Create namespace and project if doesn't exist
-        print(
-            f"Creating namespace {remote_ds.project.namespace.name} and project"
-            f" {remote_ds.project.name}"
-        )
-
-        namespace = self.metastore.create_namespace(
-            remote_ds.project.namespace.name,
-            description=remote_ds.project.namespace.descr,
-            uuid=remote_ds.project.namespace.uuid,
-        )
-        project = self.metastore.create_project(
-            namespace.name,
-            remote_ds.project.name,
-            description=remote_ds.project.descr,
-            uuid=remote_ds.project.uuid,
-        )
-
         try:
-            local_dataset = self.get_dataset(local_ds_name
+            local_dataset = self.get_dataset(local_ds_name)
             if local_dataset and local_dataset.has_version(local_ds_version):
                 raise DataChainError(
                     f"Local dataset {local_ds_uri} already exists with different uuid,"
@@ -1524,7 +1414,6 @@ class Catalog:

         local_ds = self.create_dataset(
             local_ds_name,
-            project,
             local_ds_version,
             query_script=remote_ds_version.query_script,
             create_rows=True,
@@ -1537,7 +1426,7 @@ class Catalog:
         # asking remote to export dataset rows table to s3 and to return signed
         # urls of exported parts, which are in parquet format
         export_response = studio_client.export_dataset_table(
-
+            remote_ds_name, remote_ds_version.version
         )
         if not export_response.ok:
             raise DataChainError(export_response.message)
@@ -1568,9 +1457,9 @@ class Catalog:
         rows_fetcher = DatasetRowsFetcher(
             metastore,
             warehouse,
-
+            remote_ds_name,
             remote_ds_version.version,
-
+            local_ds_name,
             local_ds_version,
             schema,
             progress_bar=dataset_save_progress_bar,
@@ -1580,7 +1469,7 @@ class Catalog:
                 iter(batch(signed_urls)), dataset_save_progress_bar
             )
         except:
-            self.remove_dataset(local_ds_name,
+            self.remove_dataset(local_ds_name, local_ds_version)
             raise

         local_ds = self.metastore.update_dataset_status(
@@ -1637,11 +1526,7 @@ class Catalog:
         )

         self.create_dataset_from_sources(
-            output,
-            sources,
-            self.metastore.default_project,
-            client_config=client_config,
-            recursive=recursive,
+            output, sources, client_config=client_config, recursive=recursive
        )

     def query(
datachain/cli/__init__.py
CHANGED
@@ -152,6 +152,9 @@ def handle_dataset_command(args, catalog):
             new_name=args.new_name,
             description=args.description,
             attrs=args.attrs,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
             team=args.team,
         ),
         "ls": lambda: list_datasets(
@@ -169,6 +172,8 @@ def handle_dataset_command(args, catalog):
             version=args.version,
             force=args.force,
             studio=args.studio,
+            local=args.local,
+            all=args.all,
             team=args.team,
         ),
         "remove": lambda: rm_dataset(
@@ -177,6 +182,8 @@ def handle_dataset_command(args, catalog):
             version=args.version,
             force=args.force,
             studio=args.studio,
+            local=args.local,
+            all=args.all,
             team=args.team,
         ),
     }