datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +213 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +35 -26
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +433 -37
- datachain/data_storage/sqlite.py +140 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +128 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +253 -91
- datachain/lib/dc/datasets.py +103 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +2 -1
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/signal_schema.py +8 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +23 -6
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -32,6 +32,7 @@ from datachain.lib.file import (
     VideoFrame,
 )
 from datachain.lib.model_store import ModelStore
+from datachain.lib.projects import create as create_project
 from datachain.lib.udf import Aggregator, Generator, Mapper
 from datachain.lib.utils import AbstractUDF, DataChainError
 from datachain.query import metrics, param
@@ -62,6 +63,7 @@ __all__ = [
     "VideoFile",
     "VideoFragment",
     "VideoFrame",
+    "create_project",
     "datasets",
     "delete_dataset",
     "is_chain_type",
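The new top-level export makes project creation available straight from the package namespace. A minimal sketch of how that export might be used, assuming the helper takes a namespace name followed by a project name (the same order used by the metastore calls later in this diff); the authoritative signature lives in the new datachain/lib/projects.py:

    import datachain as dc

    # Hypothetical call; argument names and defaults may differ from the
    # actual helper defined in datachain/lib/projects.py.
    project = dc.create_project("dev", "chatbot")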
datachain/cache.py
CHANGED
@@ -39,7 +39,7 @@ def temporary_cache(
         cache.destroy()


-class Cache:
+class Cache:  # noqa: PLW1641
     def __init__(self, cache_dir: str, tmp_dir: str):
         self.odb = LocalHashFileDB(
             LocalFileSystem(),
@@ -76,9 +76,9 @@ class Cache:
     async def download(
         self, file: "File", client: "Client", callback: Optional[Callback] = None
     ) -> None:
-        from_path = f"{file.source}/{file.path}"
         from dvc_objects.fs.utils import tmp_fname

+        from_path = file.get_uri()
         odb_fs = self.odb.fs
         tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
         size = file.size
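For reference, the removed f-string shows what `File.get_uri()` now encapsulates. A rough sketch of the equivalent behavior, assuming the helper simply joins `source` and `path` the way the old inline expression did (the real implementation is part of the reworked datachain/lib/file.py and may normalize the URI further):

    def get_uri(source: str, path: str) -> str:
        # Equivalent of the removed inline expression f"{file.source}/{file.path}".
        return f"{source}/{path}"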
datachain/catalog/catalog.py
CHANGED
@@ -41,6 +41,7 @@ from datachain.dataset import (
     DatasetStatus,
     StorageURI,
     create_dataset_uri,
+    parse_dataset_name,
     parse_dataset_uri,
 )
 from datachain.error import (
@@ -48,12 +49,14 @@ from datachain.error import (
     DatasetInvalidVersionError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
+    ProjectNotFoundError,
     QueryScriptCancelError,
     QueryScriptRunError,
 )
 from datachain.lib.listing import get_listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
+from datachain.project import Project
 from datachain.sql.types import DateTime, SQLType
 from datachain.utils import DataChainDir

@@ -155,9 +158,9 @@ class DatasetRowsFetcher(NodesThreadPool):
         self,
         metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
-
+        remote_ds: DatasetRecord,
         remote_ds_version: str,
-
+        local_ds: DatasetRecord,
         local_ds_version: str,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
@@ -169,9 +172,9 @@ class DatasetRowsFetcher(NodesThreadPool):
         self._check_dependencies()
         self.metastore = metastore
         self.warehouse = warehouse
-        self.
+        self.remote_ds = remote_ds
         self.remote_ds_version = remote_ds_version
-        self.
+        self.local_ds = local_ds
         self.local_ds_version = local_ds_version
         self.schema = schema
         self.last_status_check: Optional[float] = None
@@ -207,7 +210,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         Checks are done every PULL_DATASET_CHECK_STATUS_INTERVAL seconds
         """
         export_status_response = self.studio_client.dataset_export_status(
-            self.
+            self.remote_ds, self.remote_ds_version
         )
         if not export_status_response.ok:
             raise DataChainError(export_status_response.message)
@@ -254,9 +257,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         import pandas as pd

         # metastore and warehouse are not thread safe
-        with self.
-            local_ds = metastore.get_dataset(self.local_ds_name)
-
+        with self.warehouse.clone() as warehouse:
             urls = list(urls)

             for url in urls:
@@ -269,7 +270,7 @@
                 df = self.fix_columns(df)

                 inserted = warehouse.insert_dataset_rows(
-                    df, local_ds, self.local_ds_version
+                    df, self.local_ds, self.local_ds_version
                 )
                 self.increase_counter(inserted)  # type: ignore [arg-type]
                 # sometimes progress bar doesn't get updated so manually updating it
@@ -675,7 +676,11 @@ class Catalog:
         listing: Optional[Listing]
         if src.startswith("ds://"):
             ds_name, ds_version = parse_dataset_uri(src)
-
+            ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
+            assert ds_namespace
+            assert ds_project
+            project = self.metastore.get_project(ds_project, ds_namespace)
+            dataset = self.get_dataset(ds_name, project)
             if not ds_version:
                 ds_version = dataset.latest_version
             dataset_sources = self.warehouse.get_dataset_sources(
@@ -695,7 +700,11 @@ class Catalog:
                 dataset_name=dataset_name,
             )
             rows = DatasetQuery(
-                name=dataset.name,
+                name=dataset.name,
+                namespace_name=dataset.project.namespace.name,
+                project_name=dataset.project.name,
+                version=ds_version,
+                catalog=self,
             ).to_db_records()
             indexed_sources.append(
                 (
@@ -769,6 +778,7 @@ class Catalog:
     def create_dataset(
         self,
         name: str,
+        project: Optional[Project] = None,
         version: Optional[str] = None,
         *,
         columns: Sequence[Column],
@@ -788,6 +798,7 @@
         If version is None, then next unused version is created.
         If version is given, then it must be an unused version.
         """
+        DatasetRecord.validate_name(name)
         assert [c.name for c in columns if c.name != "sys__id"], f"got {columns=}"
         if not listing and Client.is_data_source_uri(name):
             raise RuntimeError(
@@ -795,7 +806,7 @@
             )
         default_version = DEFAULT_DATASET_VERSION
         try:
-            dataset = self.get_dataset(name)
+            dataset = self.get_dataset(name, project)
             default_version = dataset.next_version_patch
             if update_version == "major":
                 default_version = dataset.next_version_major
@@ -820,6 +831,7 @@
         }
         dataset = self.metastore.create_dataset(
             name,
+            project.id if project else None,
             feature_schema=feature_schema,
             query_script=query_script,
             schema=schema,
@@ -892,7 +904,7 @@
         )

         if create_rows_table:
-            table_name = self.warehouse.dataset_table_name(dataset
+            table_name = self.warehouse.dataset_table_name(dataset, version)
             self.warehouse.create_dataset_rows_table(table_name, columns=columns)
             self.update_dataset_version_with_warehouse_info(dataset, version)

@@ -923,7 +935,13 @@

         if not dataset_version.preview:
             values["preview"] = (
-                DatasetQuery(
+                DatasetQuery(
+                    name=dataset.name,
+                    namespace_name=dataset.project.namespace.name,
+                    project_name=dataset.project.name,
+                    version=version,
+                    catalog=self,
+                )
                 .limit(20)
                 .to_db_records()
             )
@@ -949,6 +967,7 @@
         # updating name must result in updating dataset table names as well
         for version in [v.version for v in dataset.versions]:
             self.warehouse.rename_dataset_table(
+                dataset,
                 old_name,
                 new_name,
                 old_version=version,
@@ -986,6 +1005,7 @@
         self,
         name: str,
         sources: list[str],
+        project: Optional[Project] = None,
         client_config=None,
         recursive=False,
     ) -> DatasetRecord:
@@ -994,6 +1014,8 @@

         from datachain import read_dataset, read_storage

+        project = project or self.metastore.default_project
+
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
@@ -1006,10 +1028,11 @@
         # create union of all dataset queries created from sources
         dc = reduce(lambda dc1, dc2: dc1.union(dc2), chains)
         try:
+            dc = dc.settings(project=project.name, namespace=project.namespace.name)
             dc.save(name)
         except Exception as e:  # noqa: BLE001
             try:
-                ds = self.get_dataset(name)
+                ds = self.get_dataset(name, project)
                 self.metastore.update_dataset_status(
                     ds,
                     DatasetStatus.FAILED,
@@ -1026,7 +1049,7 @@
             except DatasetNotFoundError:
                 raise e from None

-        ds = self.get_dataset(name)
+        ds = self.get_dataset(name, project)

         self.update_dataset_version_with_warehouse_info(
             ds,
@@ -1034,49 +1057,100 @@
             sources="\n".join(sources),
         )

-        return self.get_dataset(name)
+        return self.get_dataset(name, project)

-    def
-
+    def get_full_dataset_name(
+        self,
+        name: str,
+        project_name: Optional[str] = None,
+        namespace_name: Optional[str] = None,
+    ) -> tuple[str, str, str]:
+        """
+        Returns dataset name together with separated namespace and project name.
+        It takes into account all the ways namespace and project can be added.
+        """
+        parsed_namespace_name, parsed_project_name, name = parse_dataset_name(name)
+
+        namespace_env = os.environ.get("DATACHAIN_NAMESPACE")
+        project_env = os.environ.get("DATACHAIN_PROJECT")
+        if project_env and len(project_env.split(".")) == 2:
+            # we allow setting both namespace and project in DATACHAIN_PROJECT
+            namespace_env, project_env = project_env.split(".")
+
+        namespace_name = (
+            parsed_namespace_name
+            or namespace_name
+            or namespace_env
+            or self.metastore.default_namespace_name
+        )
+        project_name = (
+            parsed_project_name
+            or project_name
+            or project_env
+            or self.metastore.default_project_name
+        )
+
+        return namespace_name, project_name, name
+
+    def get_dataset(
+        self, name: str, project: Optional[Project] = None
+    ) -> DatasetRecord:
+        from datachain.lib.listing import is_listing_dataset
+
+        if is_listing_dataset(name):
+            project = self.metastore.listing_project
+        return self.metastore.get_dataset(name, project.id if project else None)

     def get_dataset_with_remote_fallback(
-        self,
+        self,
+        name: str,
+        namespace_name: str,
+        project_name: str,
+        version: Optional[str] = None,
     ) -> DatasetRecord:
         try:
-
+            project = self.metastore.get_project(project_name, namespace_name)
+            ds = self.get_dataset(name, project)
             if version and not ds.has_version(version):
                 raise DatasetVersionNotFoundError(
                     f"Dataset {name} does not have version {version}"
                 )
             return ds

-        except (
+        except (
+            ProjectNotFoundError,
+            DatasetNotFoundError,
+            DatasetVersionNotFoundError,
+        ):
             print("Dataset not found in local catalog, trying to get from studio")
-
-
-
-            remote_ds_uri += f"@v{version}"
+            remote_ds_uri = create_dataset_uri(
+                name, namespace_name, project_name, version
+            )

             self.pull_dataset(
                 remote_ds_uri=remote_ds_uri,
                 local_ds_name=name,
                 local_ds_version=version,
             )
-            return self.get_dataset(
+            return self.get_dataset(
+                name, self.metastore.get_project(project_name, namespace_name)
+            )

     def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
         """Returns dataset that contains version with specific uuid"""
         for dataset in self.ls_datasets():
             if dataset.has_version_with_uuid(uuid):
-                return self.get_dataset(dataset.name)
+                return self.get_dataset(dataset.name, dataset.project)
         raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")

-    def get_remote_dataset(
+    def get_remote_dataset(
+        self, namespace: str, project: str, name: str
+    ) -> DatasetRecord:
         from datachain.remote.studio import StudioClient

         studio_client = StudioClient()

-        info_response = studio_client.dataset_info(name)
+        info_response = studio_client.dataset_info(namespace, project, name)
         if not info_response.ok:
             raise DataChainError(info_response.message)

@@ -1085,9 +1159,9 @@
         return DatasetRecord.from_dict(dataset_info)

     def get_dataset_dependencies(
-        self, name: str, version: str, indirect=False
+        self, name: str, version: str, project: Optional[Project] = None, indirect=False
     ) -> list[Optional[DatasetDependency]]:
-        dataset = self.get_dataset(name)
+        dataset = self.get_dataset(name, project)

         direct_dependencies = self.metastore.get_direct_dataset_dependencies(
             dataset, version
@@ -1101,9 +1175,10 @@
                 # dependency has been removed
                 continue
             if d.is_dataset:
+                project = self.metastore.get_project(d.project, d.namespace)
                 # only datasets can have dependencies
                 d.dependencies = self.get_dataset_dependencies(
-                    d.name, d.version, indirect=indirect
+                    d.name, d.version, project, indirect=indirect
                 )

         return direct_dependencies
@@ -1113,9 +1188,12 @@
         prefix: Optional[str] = None,
         include_listing: bool = False,
         studio: bool = False,
+        project: Optional[Project] = None,
     ) -> Iterator[DatasetListRecord]:
         from datachain.remote.studio import StudioClient

+        project_id = project.id if project else None
+
         if studio:
             client = StudioClient()
             response = client.ls_datasets(prefix=prefix)
@@ -1130,9 +1208,11 @@
                 if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
             )
         elif prefix:
-            datasets = self.metastore.list_datasets_by_prefix(
+            datasets = self.metastore.list_datasets_by_prefix(
+                prefix, project_id=project_id
+            )
         else:
-            datasets = self.metastore.list_datasets()
+            datasets = self.metastore.list_datasets(project_id=project_id)

         for d in datasets:
             if not d.is_bucket_listing or include_listing:
@@ -1144,11 +1224,15 @@
         include_listing: bool = False,
         with_job: bool = True,
         studio: bool = False,
+        project: Optional[Project] = None,
     ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
         datasets = list(
             self.ls_datasets(
-                prefix=prefix,
+                prefix=prefix,
+                include_listing=include_listing,
+                studio=studio,
+                project=project,
             )
         )

@@ -1184,6 +1268,7 @@
             prefix=prefix,
             include_listing=True,
             with_job=False,
+            project=self.metastore.listing_project,
         )

         return [
@@ -1193,13 +1278,21 @@
         ]

     def ls_dataset_rows(
-        self,
+        self,
+        dataset: DatasetRecord,
+        version: str,
+        offset=None,
+        limit=None,
     ) -> list[dict]:
         from datachain.query.dataset import DatasetQuery

-
-
-
+        q = DatasetQuery(
+            name=dataset.name,
+            namespace_name=dataset.project.namespace.name,
+            project_name=dataset.project.name,
+            version=version,
+            catalog=self,
+        )
         if limit:
             q = q.limit(limit)
         if offset:
@@ -1232,35 +1325,29 @@
         bucket_uri: str,
         name: str,
         version: str,
+        project: Optional[Project] = None,
         client_config=None,
     ) -> list[str]:
-        dataset = self.get_dataset(name)
+        dataset = self.get_dataset(name, project)

         return self.warehouse.export_dataset_table(
             bucket_uri, dataset, version, client_config
         )

-    def dataset_table_export_file_names(
-
+    def dataset_table_export_file_names(
+        self, name: str, version: str, project: Optional[Project] = None
+    ) -> list[str]:
+        dataset = self.get_dataset(name, project)
         return self.warehouse.dataset_table_export_file_names(dataset, version)

     def remove_dataset(
         self,
         name: str,
+        project: Optional[Project] = None,
         version: Optional[str] = None,
         force: Optional[bool] = False,
-        studio: Optional[bool] = False,
     ):
-
-
-        if studio:
-            client = StudioClient()
-            response = client.rm_dataset(name, version=version, force=force)
-            if not response.ok:
-                raise DataChainError(response.message)
-            return
-
-        dataset = self.get_dataset(name)
+        dataset = self.get_dataset(name, project)
         if not version and not force:
             raise ValueError(f"Missing dataset version from input for dataset {name}")
         if version and not dataset.has_version(version):
@@ -1282,19 +1369,21 @@
     def edit_dataset(
         self,
         name: str,
+        project: Optional[Project] = None,
         new_name: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
     ) -> DatasetRecord:
         update_data = {}
         if new_name:
+            DatasetRecord.validate_name(new_name)
             update_data["name"] = new_name
         if description is not None:
             update_data["description"] = description
         if attrs is not None:
             update_data["attrs"] = attrs  # type: ignore[assignment]

-        dataset = self.get_dataset(name)
+        dataset = self.get_dataset(name, project)
         return self.update_dataset(dataset, **update_data)

     def ls(
@@ -1351,7 +1440,29 @@
         except Exception as e:
             raise DataChainError("Error when parsing dataset uri") from e

-
+        remote_namespace, remote_project, remote_ds_name = parse_dataset_name(
+            remote_ds_name
+        )
+        if not remote_namespace or not remote_project:
+            raise DataChainError(
+                f"Invalid fully qualified dataset name {remote_ds_name}, namespace"
+                f" or project missing"
+            )
+
+        if local_ds_name:
+            local_namespace, local_project, local_ds_name = parse_dataset_name(
+                local_ds_name
+            )
+            if local_namespace and local_namespace != remote_namespace:
+                raise DataChainError(
+                    "Local namespace must be the same to remote namespace"
+                )
+            if local_project and local_project != remote_project:
+                raise DataChainError("Local project must be the same to remote project")
+
+        remote_ds = self.get_remote_dataset(
+            remote_namespace, remote_project, remote_ds_name
+        )

         try:
             # if version is not specified in uri, take the latest one
@@ -1359,7 +1470,12 @@
                 version = remote_ds.latest_version
                 print(f"Version not specified, pulling the latest one (v{version})")
                 # updating dataset uri with latest version
-                remote_ds_uri = create_dataset_uri(
+                remote_ds_uri = create_dataset_uri(
+                    remote_ds.name,
+                    remote_ds.project.namespace.name,
+                    remote_ds.project.name,
+                    version,
+                )
             remote_ds_version = remote_ds.get_version(version)
         except (DatasetVersionNotFoundError, StopIteration) as exc:
             raise DataChainError(
@@ -1368,7 +1484,13 @@

         local_ds_name = local_ds_name or remote_ds.name
         local_ds_version = local_ds_version or remote_ds_version.version
-
+
+        local_ds_uri = create_dataset_uri(
+            local_ds_name,
+            remote_ds.project.namespace.name,
+            remote_ds.project.name,
+            local_ds_version,
+        )

         try:
             # try to find existing dataset with the same uuid to avoid pulling again
@@ -1377,7 +1499,10 @@
                 remote_ds_version.uuid
             )
             existing_ds_uri = create_dataset_uri(
-                existing_ds.name,
+                existing_ds.name,
+                existing_ds.project.namespace.name,
+                existing_ds.project.name,
+                existing_ds_version.version,
             )
             if existing_ds_uri == remote_ds_uri:
                 print(f"Local copy of dataset {remote_ds_uri} already present")
@@ -1391,8 +1516,26 @@
         except DatasetNotFoundError:
             pass

+        # Create namespace and project if doesn't exist
+        print(
+            f"Creating namespace {remote_ds.project.namespace.name} and project"
+            f" {remote_ds.project.name}"
+        )
+
+        namespace = self.metastore.create_namespace(
+            remote_ds.project.namespace.name,
+            description=remote_ds.project.namespace.descr,
+            uuid=remote_ds.project.namespace.uuid,
+        )
+        project = self.metastore.create_project(
+            namespace.name,
+            remote_ds.project.name,
+            description=remote_ds.project.descr,
+            uuid=remote_ds.project.uuid,
+        )
+
         try:
-            local_dataset = self.get_dataset(local_ds_name)
+            local_dataset = self.get_dataset(local_ds_name, project=project)
             if local_dataset and local_dataset.has_version(local_ds_version):
                 raise DataChainError(
                     f"Local dataset {local_ds_uri} already exists with different uuid,"
@@ -1414,6 +1557,7 @@

         local_ds = self.create_dataset(
             local_ds_name,
+            project,
             local_ds_version,
             query_script=remote_ds_version.query_script,
             create_rows=True,
@@ -1426,7 +1570,7 @@
         # asking remote to export dataset rows table to s3 and to return signed
         # urls of exported parts, which are in parquet format
         export_response = studio_client.export_dataset_table(
-
+            remote_ds, remote_ds_version.version
         )
         if not export_response.ok:
             raise DataChainError(export_response.message)
@@ -1457,9 +1601,9 @@
         rows_fetcher = DatasetRowsFetcher(
             metastore,
             warehouse,
-
+            remote_ds,
             remote_ds_version.version,
-
+            local_ds,
             local_ds_version,
             schema,
             progress_bar=dataset_save_progress_bar,
@@ -1469,7 +1613,7 @@
                 iter(batch(signed_urls)), dataset_save_progress_bar
             )
         except:
-            self.remove_dataset(local_ds_name, local_ds_version)
+            self.remove_dataset(local_ds_name, project, local_ds_version)
             raise

         local_ds = self.metastore.update_dataset_status(
@@ -1526,7 +1670,11 @@
         )

         self.create_dataset_from_sources(
-            output,
+            output,
+            sources,
+            self.metastore.default_project,
+            client_config=client_config,
+            recursive=recursive,
         )

     def query(
datachain/cli/__init__.py
CHANGED
@@ -154,9 +154,6 @@ def handle_dataset_command(args, catalog):
            new_name=args.new_name,
            description=args.description,
            attrs=args.attrs,
-            studio=args.studio,
-            local=args.local,
-            all=args.all,
            team=args.team,
        ),
        "ls": lambda: list_datasets(
@@ -174,8 +171,6 @@ def handle_dataset_command(args, catalog):
            version=args.version,
            force=args.force,
            studio=args.studio,
-            local=args.local,
-            all=args.all,
            team=args.team,
        ),
        "remove": lambda: rm_dataset(
@@ -184,8 +179,6 @@ def handle_dataset_command(args, catalog):
            version=args.version,
            force=args.force,
            studio=args.studio,
-            local=args.local,
-            all=args.all,
            team=args.team,
        ),
    }