datachain 0.30.2__py3-none-any.whl → 0.30.4__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release: this version of datachain might be problematic.
- datachain/__init__.py +2 -0
- datachain/catalog/__init__.py +2 -0
- datachain/catalog/catalog.py +100 -31
- datachain/catalog/loader.py +4 -2
- datachain/cli/__init__.py +1 -0
- datachain/cli/commands/datasets.py +19 -12
- datachain/data_storage/metastore.py +34 -30
- datachain/data_storage/sqlite.py +0 -4
- datachain/delta.py +23 -12
- datachain/func/string.py +8 -0
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/database.py +50 -6
- datachain/lib/dc/datachain.py +48 -20
- datachain/lib/dc/datasets.py +12 -7
- datachain/lib/dc/utils.py +5 -0
- datachain/lib/namespaces.py +3 -1
- datachain/lib/projects.py +3 -1
- datachain/lib/signal_schema.py +28 -17
- datachain/listing.py +5 -9
- datachain/model/ultralytics/bbox.py +14 -12
- datachain/model/ultralytics/pose.py +14 -12
- datachain/model/ultralytics/segment.py +14 -12
- datachain/query/dataset.py +42 -28
- datachain/query/schema.py +4 -0
- datachain/utils.py +7 -0
- {datachain-0.30.2.dist-info → datachain-0.30.4.dist-info}/METADATA +2 -2
- {datachain-0.30.2.dist-info → datachain-0.30.4.dist-info}/RECORD +31 -31
- {datachain-0.30.2.dist-info → datachain-0.30.4.dist-info}/WHEEL +0 -0
- {datachain-0.30.2.dist-info → datachain-0.30.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.2.dist-info → datachain-0.30.4.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.2.dist-info → datachain-0.30.4.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from datachain.lib.dc import (
     Sys,
     datasets,
     delete_dataset,
+    is_studio,
     listings,
     move_dataset,
     read_csv,
@@ -74,6 +75,7 @@ __all__ = [
     "datasets",
     "delete_dataset",
     "is_chain_type",
+    "is_studio",
     "listings",
     "metrics",
     "move_dataset",
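The newly exported is_studio helper reports whether the current process is running inside Studio. A minimal usage sketch (the branch bodies are illustrative, not from the package):

    import datachain as dc

    if dc.is_studio():
        # running inside Studio; no remote fallback is needed
        ...
    else:
        # running locally (CLI or script)
        ...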
datachain/catalog/__init__.py
CHANGED
@@ -3,6 +3,7 @@ from .catalog import (
     QUERY_SCRIPT_CANCELED_EXIT_CODE,
     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
     Catalog,
+    is_namespace_local,
 )
 from .loader import get_catalog
 
@@ -12,4 +13,5 @@ __all__ = [
     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
     "Catalog",
     "get_catalog",
+    "is_namespace_local",
 ]
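is_namespace_local is now part of the public datachain.catalog surface. Per its definition in catalog.py below, it simply checks for the reserved "local" namespace name; a sketch:

    from datachain.catalog import is_namespace_local

    assert is_namespace_local("local") is True
    assert is_namespace_local("dev") is False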
datachain/catalog/catalog.py
CHANGED
@@ -113,6 +113,11 @@ else:
     SIGINT = signal.SIGINT
 
 
+def is_namespace_local(namespace_name) -> bool:
+    """Checks if namespace is from local environment, i.e. is `local`"""
+    return namespace_name == "local"
+
+
 def shutdown_process(
     proc: subprocess.Popen,
     interrupt_timeout: Optional[int] = None,
@@ -680,8 +685,9 @@ class Catalog:
         ds_namespace, ds_project, ds_name = parse_dataset_name(ds_name)
         assert ds_namespace
         assert ds_project
-
-
+        dataset = self.get_dataset(
+            ds_name, namespace_name=ds_namespace, project_name=ds_project
+        )
         if not ds_version:
             ds_version = dataset.latest_version
         dataset_sources = self.warehouse.get_dataset_sources(
@@ -807,7 +813,11 @@ class Catalog:
         )
         default_version = DEFAULT_DATASET_VERSION
         try:
-            dataset = self.get_dataset(
+            dataset = self.get_dataset(
+                name,
+                namespace_name=project.namespace.name if project else None,
+                project_name=project.name if project else None,
+            )
             default_version = dataset.next_version_patch
             if update_version == "major":
                 default_version = dataset.next_version_major
@@ -1016,7 +1026,11 @@ class Catalog:
             dc.save(name)
         except Exception as e:  # noqa: BLE001
             try:
-                ds = self.get_dataset(
+                ds = self.get_dataset(
+                    name,
+                    namespace_name=project.namespace.name,
+                    project_name=project.name,
+                )
                 self.metastore.update_dataset_status(
                     ds,
                     DatasetStatus.FAILED,
@@ -1033,7 +1047,11 @@ class Catalog:
         except DatasetNotFoundError:
             raise e from None
 
-        ds = self.get_dataset(
+        ds = self.get_dataset(
+            name,
+            namespace_name=project.namespace.name,
+            project_name=project.name,
+        )
 
         self.update_dataset_version_with_warehouse_info(
             ds,
@@ -1041,7 +1059,11 @@ class Catalog:
             sources="\n".join(sources),
         )
 
-        return self.get_dataset(
+        return self.get_dataset(
+            name,
+            namespace_name=project.namespace.name,
+            project_name=project.name,
+        )
 
     def get_full_dataset_name(
         self,
@@ -1077,22 +1099,23 @@ class Catalog:
         return namespace_name, project_name, name
 
     def get_dataset(
-        self,
+        self,
+        name: str,
+        namespace_name: Optional[str] = None,
+        project_name: Optional[str] = None,
     ) -> DatasetRecord:
         from datachain.lib.listing import is_listing_dataset
 
-
+        namespace_name = namespace_name or self.metastore.default_namespace_name
+        project_name = project_name or self.metastore.default_project_name
 
         if is_listing_dataset(name):
-
+            namespace_name = self.metastore.system_namespace_name
+            project_name = self.metastore.listing_project_name
 
-
-
-
-            raise DatasetNotFoundError(
-                f"Dataset {name} not found in namespace {project.namespace.name}"
-                f" and project {project.name}"
-            ) from None
+        return self.metastore.get_dataset(
+            name, namespace_name=namespace_name, project_name=project_name
+        )
 
     def get_dataset_with_remote_fallback(
         self,
@@ -1103,6 +1126,8 @@ class Catalog:
         pull_dataset: bool = False,
         update: bool = False,
     ) -> DatasetRecord:
+        from datachain.lib.dc.utils import is_studio
+
         # Intentionally ignore update flag is version is provided. Here only exact
         # version can be provided and update then doesn't make sense.
         # It corresponds to a query like this for example:
@@ -1111,16 +1136,24 @@ class Catalog:
         if version:
             update = False
 
-
+        # we don't do Studio fallback is script is already ran in Studio, or if we try
+        # to fetch dataset with local namespace as that one cannot
+        # exist in Studio in the first place
+        no_fallback = is_studio() or is_namespace_local(namespace_name)
+
+        if no_fallback or not update:
             try:
-
-
+                ds = self.get_dataset(
+                    name,
+                    namespace_name=namespace_name,
+                    project_name=project_name,
+                )
                 if not version or ds.has_version(version):
                     return ds
             except (NamespaceNotFoundError, ProjectNotFoundError, DatasetNotFoundError):
                 pass
 
-        if
+        if no_fallback:
             raise DatasetNotFoundError(
                 f"Dataset {name}"
                 + (f" version {version} " if version else " ")
@@ -1139,7 +1172,9 @@ class Catalog:
                 local_ds_version=version,
             )
             return self.get_dataset(
-                name,
+                name,
+                namespace_name=namespace_name,
+                project_name=project_name,
             )
 
         return self.get_remote_dataset(namespace_name, project_name, name)
@@ -1148,7 +1183,11 @@ class Catalog:
         """Returns dataset that contains version with specific uuid"""
         for dataset in self.ls_datasets():
             if dataset.has_version_with_uuid(uuid):
-                return self.get_dataset(
+                return self.get_dataset(
+                    dataset.name,
+                    namespace_name=dataset.project.namespace.name,
+                    project_name=dataset.project.name,
+                )
         raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")
 
     def get_remote_dataset(
@@ -1171,9 +1210,18 @@ class Catalog:
         return DatasetRecord.from_dict(dataset_info)
 
     def get_dataset_dependencies(
-        self,
+        self,
+        name: str,
+        version: str,
+        namespace_name: Optional[str] = None,
+        project_name: Optional[str] = None,
+        indirect=False,
     ) -> list[Optional[DatasetDependency]]:
-        dataset = self.get_dataset(
+        dataset = self.get_dataset(
+            name,
+            namespace_name=namespace_name,
+            project_name=project_name,
+        )
 
         direct_dependencies = self.metastore.get_direct_dataset_dependencies(
             dataset, version
@@ -1187,10 +1235,13 @@ class Catalog:
                 # dependency has been removed
                 continue
             if d.is_dataset:
-                project = self.metastore.get_project(d.project, d.namespace)
                 # only datasets can have dependencies
                 d.dependencies = self.get_dataset_dependencies(
-                    d.name,
+                    d.name,
+                    d.version,
+                    namespace_name=d.namespace,
+                    project_name=d.project,
+                    indirect=indirect,
                 )
 
         return direct_dependencies
@@ -1340,7 +1391,11 @@ class Catalog:
         project: Optional[Project] = None,
         client_config=None,
     ) -> list[str]:
-        dataset = self.get_dataset(
+        dataset = self.get_dataset(
+            name,
+            namespace_name=project.namespace.name if project else None,
+            project_name=project.name if project else None,
+        )
 
         return self.warehouse.export_dataset_table(
             bucket_uri, dataset, version, client_config
@@ -1349,7 +1404,11 @@ class Catalog:
     def dataset_table_export_file_names(
         self, name: str, version: str, project: Optional[Project] = None
     ) -> list[str]:
-        dataset = self.get_dataset(
+        dataset = self.get_dataset(
+            name,
+            namespace_name=project.namespace.name if project else None,
+            project_name=project.name if project else None,
+        )
         return self.warehouse.dataset_table_export_file_names(dataset, version)
 
     def remove_dataset(
@@ -1359,7 +1418,11 @@ class Catalog:
         version: Optional[str] = None,
         force: Optional[bool] = False,
     ):
-        dataset = self.get_dataset(
+        dataset = self.get_dataset(
+            name,
+            namespace_name=project.namespace.name if project else None,
+            project_name=project.name if project else None,
+        )
         if not version and not force:
             raise ValueError(f"Missing dataset version from input for dataset {name}")
         if version and not dataset.has_version(version):
@@ -1395,7 +1458,11 @@ class Catalog:
         if attrs is not None:
             update_data["attrs"] = attrs  # type: ignore[assignment]
 
-        dataset = self.get_dataset(
+        dataset = self.get_dataset(
+            name,
+            namespace_name=project.namespace.name if project else None,
+            project_name=project.name if project else None,
+        )
         return self.update_dataset(dataset, **update_data)
 
     def ls(
@@ -1549,7 +1616,9 @@ class Catalog:
         )
 
         try:
-            local_dataset = self.get_dataset(
+            local_dataset = self.get_dataset(
+                local_ds_name, namespace_name=namespace.name, project_name=project.name
+            )
             if local_dataset and local_dataset.has_version(local_ds_version):
                 raise DataChainError(
                     f"Local dataset {local_ds_uri} already exists with different uuid,
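Across catalog.py, get_dataset now resolves datasets by explicit namespace_name and project_name keywords, falling back to the metastore defaults when they are omitted. A sketch of the new call shape (the dataset, namespace, and project names here are placeholders):

    from datachain.catalog import get_catalog

    catalog = get_catalog()

    # Falls back to the default namespace and project.
    ds = catalog.get_dataset("my-dataset")

    # Or address a dataset in a specific namespace and project.
    ds = catalog.get_dataset(
        "my-dataset",
        namespace_name="dev",
        project_name="analytics",
    )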
datachain/catalog/loader.py
CHANGED
@@ -127,7 +127,8 @@ def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
 
 
 def get_catalog(
-    client_config: Optional[dict[str, Any]] = None,
+    client_config: Optional[dict[str, Any]] = None,
+    in_memory: bool = False,
 ) -> "Catalog":
     """
     Function that creates Catalog instance with appropriate metastore
@@ -142,8 +143,9 @@ def get_catalog(
     """
     from datachain.catalog import Catalog
 
+    metastore = get_metastore(in_memory=in_memory)
     return Catalog(
-        metastore=
+        metastore=metastore,
         warehouse=get_warehouse(in_memory=in_memory),
         client_config=client_config,
         in_memory=in_memory,
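get_catalog gains an in_memory flag, which it now forwards to get_metastore as well as get_warehouse. A minimal sketch (assuming the flag selects in-memory backends end to end, as the loader change suggests):

    from datachain.catalog import get_catalog

    # Both the metastore and the warehouse are created in memory,
    # useful for tests or throwaway sessions.
    catalog = get_catalog(in_memory=True)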
datachain/cli/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from tabulate import tabulate
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
 
+from datachain.catalog import is_namespace_local
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
 from datachain.error import DataChainError, DatasetNotFoundError
@@ -107,8 +108,9 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
 def list_datasets_local_versions(catalog: "Catalog", name: str):
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)
 
-
-
+    ds = catalog.get_dataset(
+        name, namespace_name=namespace_name, project_name=project_name
+    )
     for v in ds.versions:
         yield (name, v.version)
 
@@ -137,15 +139,18 @@ def rm_dataset(
 ):
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)
 
-    if
+    if studio:
+        # removing Studio dataset from CLI
         from datachain.studio import remove_studio_dataset
 
-
-
+        if Config().read().get("studio", {}).get("token"):
+            remove_studio_dataset(
+                team, name, namespace_name, project_name, version, force
+            )
+        else:
             raise DataChainError(
                 "Not logged in to Studio. Log in with 'datachain auth login'."
             )
-        remove_studio_dataset(team, name, namespace_name, project_name, version, force)
     else:
         try:
             project = catalog.metastore.get_project(project_name, namespace_name)
@@ -162,9 +167,11 @@ def edit_dataset(
     attrs: Optional[list[str]] = None,
     team: Optional[str] = None,
 ):
+    from datachain.lib.dc.utils import is_studio
+
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)
 
-    if
+    if is_studio() or is_namespace_local(namespace_name):
         try:
             catalog.edit_dataset(
                 name, catalog.metastore.default_project, new_name, description, attrs
@@ -174,11 +181,11 @@
     else:
         from datachain.studio import edit_studio_dataset
 
-
-
+        if Config().read().get("studio", {}).get("token"):
+            edit_studio_dataset(
+                team, name, namespace_name, project_name, new_name, description, attrs
+            )
+        else:
             raise DataChainError(
                 "Not logged in to Studio. Log in with 'datachain auth login'."
             )
-        edit_studio_dataset(
-            team, name, namespace_name, project_name, new_name, description, attrs
-        )
datachain/data_storage/metastore.py
CHANGED
@@ -145,23 +145,6 @@ class AbstractMetastore(ABC, Serializable):
     def list_namespaces(self, conn=None) -> list[Namespace]:
         """Gets a list of all namespaces"""
 
-    @property
-    @abstractmethod
-    def is_studio(self) -> bool:
-        """Returns True if this code is ran in Studio"""
-
-    def is_local_dataset(self, dataset_namespace: str) -> bool:
-        """
-        Returns True if this is local dataset i.e. not pulled from Studio but
-        created locally. This is False if we ran code in CLI mode but using dataset
-        names that are present in Studio.
-        """
-        return self.is_studio or dataset_namespace == Namespace.default()
-
-    @property
-    def namespace_allowed_to_create(self):
-        return self.is_studio
-
     #
     # Projects
     #
@@ -215,10 +198,6 @@ class AbstractMetastore(ABC, Serializable):
     def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
         """Gets list of projects in some namespace or in general (in all namespaces)"""
 
-    @property
-    def project_allowed_to_create(self):
-        return self.is_studio
-
     #
     # Datasets
     #
@@ -301,7 +280,13 @@ class AbstractMetastore(ABC, Serializable):
         """
 
     @abstractmethod
-    def get_dataset(
+    def get_dataset(
+        self,
+        name: str,  # normal, not full dataset name
+        namespace_name: Optional[str] = None,
+        project_name: Optional[str] = None,
+        conn=None,
+    ) -> DatasetRecord:
         """Gets a single dataset by name."""
 
     @abstractmethod
@@ -912,11 +897,14 @@ class AbstractDBMetastore(AbstractMetastore):
         **kwargs,  # TODO registered = True / False
     ) -> DatasetRecord:
         """Creates new dataset."""
-
+        if not project_id:
+            project = self.default_project
+        else:
+            project = self.get_project_by_id(project_id)
 
         query = self._datasets_insert().values(
             name=name,
-            project_id=
+            project_id=project.id,
             status=status,
             feature_schema=json.dumps(feature_schema or {}),
             created_at=datetime.now(timezone.utc),
@@ -935,7 +923,9 @@ class AbstractDBMetastore(AbstractMetastore):
         query = query.on_conflict_do_nothing(index_elements=["project_id", "name"])
         self.db.execute(query)
 
-        return self.get_dataset(
+        return self.get_dataset(
+            name, namespace_name=project.namespace.name, project_name=project.name
+        )
 
     def create_dataset_version(  # noqa: PLR0913
         self,
@@ -992,7 +982,12 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         self.db.execute(query, conn=conn)
 
-        return self.get_dataset(
+        return self.get_dataset(
+            dataset.name,
+            namespace_name=dataset.project.namespace.name,
+            project_name=dataset.project.name,
+            conn=conn,
+        )
 
     def remove_dataset(self, dataset: DatasetRecord) -> None:
         """Removes dataset."""
@@ -1216,21 +1211,30 @@ class AbstractDBMetastore(AbstractMetastore):
     def get_dataset(
         self,
         name: str,  # normal, not full dataset name
-
+        namespace_name: Optional[str] = None,
+        project_name: Optional[str] = None,
         conn=None,
     ) -> DatasetRecord:
         """
         Gets a single dataset in project by dataset name.
         """
-
+        namespace_name = namespace_name or self.default_namespace_name
+        project_name = project_name or self.default_project_name
 
         d = self._datasets
+        n = self._namespaces
+        p = self._projects
         query = self._base_dataset_query()
-        query = query.where(
+        query = query.where(
+            d.c.name == name,
+            n.c.name == namespace_name,
+            p.c.name == project_name,
+        )  # type: ignore [attr-defined]
         ds = self._parse_dataset(self.db.execute(query, conn=conn))
         if not ds:
             raise DatasetNotFoundError(
-                f"Dataset {name} not found in
+                f"Dataset {name} not found in namespace {namespace_name}"
+                f" and project {project_name}"
             )
 
         return ds
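At the metastore level, get_dataset now filters on the dataset, namespace, and project names together and names both in its error. A sketch of the resulting lookup behavior (the dataset, namespace, and project names are placeholders):

    from datachain.catalog import get_catalog
    from datachain.error import DatasetNotFoundError

    metastore = get_catalog().metastore

    try:
        # Omitted kwargs default to default_namespace_name / default_project_name.
        ds = metastore.get_dataset(
            "my-dataset", namespace_name="dev", project_name="analytics"
        )
    except DatasetNotFoundError:
        # The error message now reports both the namespace and the project.
        raise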
datachain/data_storage/sqlite.py
CHANGED
datachain/delta.py
CHANGED
@@ -77,7 +77,8 @@ def _get_delta_chain(
 
 def _get_retry_chain(
     name: str,
-
+    namespace_name: str,
+    project_name: str,
     latest_version: str,
     source_ds_name: str,
     source_ds_project: Project,
@@ -96,8 +97,8 @@ def _get_retry_chain(
     # Read the latest version of the result dataset for retry logic
     result_dataset = datachain.read_dataset(
         name,
-        namespace=
-        project=
+        namespace=namespace_name,
+        project=project_name,
         version=latest_version,
     )
     source_dc = datachain.read_dataset(
@@ -128,7 +129,8 @@
 
 def _get_source_info(
     name: str,
-
+    namespace_name: str,
+    project_name: str,
     latest_version: str,
     catalog,
 ) -> tuple[
@@ -145,7 +147,11 @@
     Returns (None, None, None, None) if source dataset was removed.
     """
     dependencies = catalog.get_dataset_dependencies(
-        name,
+        name,
+        latest_version,
+        namespace_name=namespace_name,
+        project_name=project_name,
+        indirect=False,
     )
 
     dep = dependencies[0]
@@ -157,7 +163,9 @@
     source_ds_name = dep.name
     source_ds_version = dep.version
     source_ds_latest_version = catalog.get_dataset(
-        source_ds_name,
+        source_ds_name,
+        namespace_name=source_ds_project.namespace.name,
+        project_name=source_ds_project.name,
     ).latest_version
 
     return (
@@ -211,12 +219,14 @@
     """
 
     catalog = dc.session.catalog
-    project = catalog.metastore.get_project(project_name, namespace_name)
+    # project = catalog.metastore.get_project(project_name, namespace_name)
     dc._query.apply_listing_pre_step()
 
     # Check if dataset exists
     try:
-        dataset = catalog.get_dataset(
+        dataset = catalog.get_dataset(
+            name, namespace_name=namespace_name, project_name=project_name
+        )
         latest_version = dataset.latest_version
     except DatasetNotFoundError:
         # First creation of result dataset
@@ -234,7 +244,7 @@
         source_ds_version,
         source_ds_latest_version,
         dependencies,
-    ) = _get_source_info(name,
+    ) = _get_source_info(name, namespace_name, project_name, latest_version, catalog)
 
     # If source_ds_name is None, starting dataset was removed
     if source_ds_name is None:
@@ -264,7 +274,8 @@
     if delta_retry:
         retry_chain = _get_retry_chain(
             name,
-
+            namespace_name,
+            project_name,
             latest_version,
             source_ds_name,
             source_ds_project,
@@ -290,8 +301,8 @@
 
     latest_dataset = datachain.read_dataset(
         name,
-        namespace=
-        project=
+        namespace=namespace_name,
+        project=project_name,
        version=latest_version,
    )
    compared_chain = latest_dataset.diff(
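delta.py now threads namespace_name and project_name strings through the retry helpers instead of a Project object, matching the public read_dataset keywords used in the diff. For reference, the read_dataset shape those helpers rely on (the dataset, namespace, project, and version values are placeholders):

    import datachain

    # Read a specific version of a dataset, addressed by namespace and project.
    chain = datachain.read_dataset(
        "my-dataset",
        namespace="dev",
        project="analytics",
        version="1.0.0",
    )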
datachain/func/string.py
CHANGED
datachain/lib/dc/__init__.py
CHANGED
@@ -9,7 +9,7 @@ from .pandas import read_pandas
 from .parquet import read_parquet
 from .records import read_records
 from .storage import read_storage
-from .utils import DatasetMergeError, DatasetPrepareError, Sys
+from .utils import DatasetMergeError, DatasetPrepareError, Sys, is_studio
 from .values import read_values
 
 __all__ = [
@@ -21,6 +21,7 @@ __all__ = [
     "Sys",
     "datasets",
     "delete_dataset",
+    "is_studio",
     "listings",
     "move_dataset",
     "read_csv",