datachain 0.21.1__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +180 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +43 -28
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +422 -37
- datachain/data_storage/sqlite.py +136 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +126 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +260 -92
- datachain/lib/dc/datasets.py +104 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +1 -0
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +23 -6
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
datachain/query/dataset.py
CHANGED

@@ -41,12 +41,13 @@ from datachain.data_storage.schema import (
     partition_col_names,
     partition_columns,
 )
-from datachain.dataset import
+from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
+from datachain.project import Project
 from datachain.query.schema import C, UDFParamSpec, normalize_param
 from datachain.query.session import Session
 from datachain.query.udf import UdfInfo

@@ -83,7 +84,7 @@ PartitionByType = Union[
     Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
 ]
 JoinPredicateType = Union[str, ColumnClause, ColumnElement]
-DatasetDependencyType = tuple[
+DatasetDependencyType = tuple["DatasetRecord", str]

 logger = logging.getLogger("datachain")


@@ -169,18 +170,17 @@ class QueryStep:
     """A query that returns all rows from specific dataset version"""

     catalog: "Catalog"
-
+    dataset: "DatasetRecord"
     dataset_version: str

     def apply(self) -> "StepResult":
         def q(*columns):
             return sqlalchemy.select(*columns)

-
-        dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)
+        dr = self.catalog.warehouse.dataset_rows(self.dataset, self.dataset_version)

         return step_result(
-            q, dr.columns, dependencies=[(self.
+            q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
         )


@@ -1095,6 +1095,8 @@ class DatasetQuery:
         self,
         name: str,
         version: Optional[str] = None,
+        project_name: Optional[str] = None,
+        namespace_name: Optional[str] = None,
         catalog: Optional["Catalog"] = None,
         session: Optional[Session] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,

@@ -1128,33 +1130,38 @@
         if version:
             self.version = version

-
-
-
-
-
-
-
-            # this point
-            self.list_ds_name = name
+        namespace_name = namespace_name or self.catalog.metastore.default_namespace_name
+        project_name = project_name or self.catalog.metastore.default_project_name
+
+        if is_listing_dataset(name) and not version:
+            # not setting query step yet as listing dataset might not exist at
+            # this point
+            self.list_ds_name = name
         elif fallback_to_studio and is_token_set():
             self._set_starting_step(
-                self.catalog.get_dataset_with_remote_fallback(
+                self.catalog.get_dataset_with_remote_fallback(
+                    name,
+                    namespace_name=namespace_name,
+                    project_name=project_name,
+                    version=version,
+                )
             )
         else:
-            self.
+            project = self.catalog.metastore.get_project(project_name, namespace_name)
+            self._set_starting_step(self.catalog.get_dataset(name, project=project))

     def _set_starting_step(self, ds: "DatasetRecord") -> None:
         if not self.version:
             self.version = ds.latest_version

-        self.starting_step = QueryStep(self.catalog, ds
+        self.starting_step = QueryStep(self.catalog, ds, self.version)

         # at this point we know our starting dataset so setting up schemas
         self.feature_schema = ds.get_version(self.version).feature_schema
         self.column_types = copy(ds.schema)
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
+        self.project = ds.project

     def __iter__(self):
         return iter(self.db_results())

@@ -1162,21 +1169,6 @@
     def __or__(self, other):
         return self.union(other)

-    def pull_dataset(self, name: str, version: Optional[str] = None) -> "DatasetRecord":
-        print("Dataset not found in local catalog, trying to get from studio")
-
-        remote_ds_uri = f"{DATASET_PREFIX}{name}"
-        if version:
-            remote_ds_uri += f"@v{version}"
-
-        self.catalog.pull_dataset(
-            remote_ds_uri=remote_ds_uri,
-            local_ds_name=name,
-            local_ds_version=version,
-        )
-
-        return self.catalog.get_dataset(name)
-
     @staticmethod
     def get_table() -> "TableClause":
         table_name = "".join(

@@ -1657,6 +1649,8 @@
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
         partition_by: Optional[PartitionByType] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
         cache: bool = False,
     ) -> "Self":
         query = self.clone()

@@ -1676,26 +1670,36 @@

     def _add_dependencies(self, dataset: "DatasetRecord", version: str):
         dependencies: set[DatasetDependencyType] = set()
-        for
-            if Session.is_temp_dataset(
+        for dep_dataset, dep_dataset_version in self.dependencies:
+            if Session.is_temp_dataset(dep_dataset.name):
                 # temp dataset are created for optimization and they will be removed
                 # afterwards. Therefore, we should not put them as dependencies, but
                 # their own direct dependencies
                 for dep in self.catalog.get_dataset_dependencies(
-
+                    dep_dataset.name,
+                    dep_dataset_version,
+                    dep_dataset.project,
+                    indirect=False,
                 ):
                     if dep:
-
+                        dep_project = self.catalog.metastore.get_project(
+                            dep.project, dep.namespace
+                        )
+                        dependencies.add(
+                            (
+                                self.catalog.get_dataset(dep.name, dep_project),
+                                dep.version,
+                            )
+                        )
             else:
-                dependencies.add((
+                dependencies.add((dep_dataset, dep_dataset_version))

-        for
-            # ds_dependency_name, ds_dependency_version = dependency
+        for dep_dataset, dep_dataset_version in dependencies:
             self.catalog.metastore.add_dataset_dependency(
-                dataset
+                dataset,
                 version,
-
-
+                dep_dataset,
+                dep_dataset_version,
             )

     def exec(self) -> "Self":

@@ -1711,6 +1715,7 @@
         self,
         name: Optional[str] = None,
         version: Optional[str] = None,
+        project: Optional[Project] = None,
         feature_schema: Optional[dict] = None,
         dependencies: Optional[list[DatasetDependency]] = None,
         description: Optional[str] = None,

@@ -1719,8 +1724,13 @@
         **kwargs,
     ) -> "Self":
         """Save the query as a dataset."""
+        project = project or self.catalog.metastore.default_project
         try:
-            if
+            if (
+                name
+                and version
+                and self.catalog.get_dataset(name, project).has_version(version)
+            ):
                 raise RuntimeError(f"Dataset {name} already has version {version}")
         except DatasetNotFoundError:
             pass

@@ -1745,6 +1755,7 @@

         dataset = self.catalog.create_dataset(
             name,
+            project,
             version=version,
             feature_schema=feature_schema,
             columns=columns,

@@ -1770,11 +1781,25 @@

             if dependencies:
                 # overriding dependencies
-                self.dependencies =
+                self.dependencies = set()
+                for dep in dependencies:
+                    dep_project = self.catalog.metastore.get_project(
+                        dep.project, dep.namespace
+                    )
+                    self.dependencies.add(
+                        (self.catalog.get_dataset(dep.name, dep_project), dep.version)
+                    )
+
             self._add_dependencies(dataset, version)  # type: ignore [arg-type]
         finally:
             self.cleanup()
-        return self.__class__(
+        return self.__class__(
+            name=name,
+            namespace_name=project.namespace.name,
+            project_name=project.name,
+            version=version,
+            catalog=self.catalog,
+        )

     @property
     def is_ordered(self) -> bool:
datachain/query/session.py
CHANGED

@@ -108,7 +108,7 @@ class Session:
         prefix = self.get_temp_prefix()
         try:
             for dataset in list(self.catalog.metastore.list_datasets_by_prefix(prefix)):
-                self.catalog.remove_dataset(dataset.name, force=True)
+                self.catalog.remove_dataset(dataset.name, dataset.project, force=True)
         # suppress error when metastore has been reset during testing
         except TableMissingError:
             pass
datachain/remote/studio.py
CHANGED

@@ -17,6 +17,7 @@ import websockets
 from requests.exceptions import HTTPError, Timeout

 from datachain.config import Config
+from datachain.dataset import DatasetRecord
 from datachain.error import DataChainError
 from datachain.utils import STUDIO_URL, retry_with_backoff

@@ -36,13 +37,33 @@ logger = logging.getLogger("datachain")
 DATASET_ROWS_CHUNK_SIZE = 8192


+def get_studio_env_variable(name: str) -> Any:
+    """
+    Get the value of a DataChain Studio environment variable.
+    It first checks for the variable prefixed with 'DATACHAIN_STUDIO_',
+    then checks for the deprecated 'DVC_STUDIO_' prefix.
+    If neither is set, it returns the provided default value.
+    """
+    if (value := os.environ.get(f"DATACHAIN_STUDIO_{name}")) is not None:
+        return value
+    if (value := os.environ.get(f"DVC_STUDIO_{name}")) is not None:  # deprecated
+        logger.warning(
+            "Environment variable 'DVC_STUDIO_%s' is deprecated, "
+            "use 'DATACHAIN_STUDIO_%s' instead.",
+            name,
+            name,
+        )
+        return value
+    return None
+
+
 def _is_server_error(status_code: int) -> bool:
     return str(status_code).startswith("5")


 def is_token_set() -> bool:
     return (
-        bool(
+        bool(get_studio_env_variable("TOKEN"))
         or Config().read().get("studio", {}).get("token") is not None
     )

@@ -78,12 +99,12 @@ class StudioClient:

     @property
     def token(self) -> str:
-        token =
+        token = get_studio_env_variable("TOKEN") or self.config.get("token")

         if not token:
             raise DataChainError(
                 "Studio token is not set. Use `datachain auth login` "
-                "or environment variable `
+                "or environment variable `DATACHAIN_STUDIO_TOKEN` to set it."
             )

         return token

@@ -91,8 +112,8 @@
     @property
     def url(self) -> str:
         return (
-
-        ) + "/api"
+            get_studio_env_variable("URL") or self.config.get("url") or STUDIO_URL
+        ).rstrip("/") + "/api"

     @property
     def config(self) -> dict:

@@ -107,13 +128,13 @@
         return self._team

     def _get_team(self) -> str:
-        team =
+        team = get_studio_env_variable("TEAM") or self.config.get("team")

         if not team:
             raise DataChainError(
                 "Studio team is not set. "
                 "Use `datachain auth team <team_name>` "
-                "or environment variable `
+                "or environment variable `DATACHAIN_STUDIO_TEAM` to set it. "
                 "You can also set `studio.team` in the config file."
             )

@@ -291,13 +312,17 @@
     def edit_dataset(
         self,
         name: str,
+        namespace: str,
+        project: str,
         new_name: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
     ) -> Response[DatasetInfoData]:
         body = {
             "new_name": new_name,
-            "
+            "name": name,
+            "namespace": namespace,
+            "project": project,
             "description": description,
             "attrs": attrs,
         }

@@ -310,44 +335,44 @@
     def rm_dataset(
         self,
         name: str,
+        namespace: str,
+        project: str,
         version: Optional[str] = None,
         force: Optional[bool] = False,
     ) -> Response[DatasetInfoData]:
         return self._send_request(
             "datachain/datasets",
             {
-                "
-                "
+                "name": name,
+                "namespace": namespace,
+                "project": project,
+                "version": version,
                 "force": force,
             },
             method="DELETE",
         )

-    def dataset_info(
+    def dataset_info(
+        self, namespace: str, project: str, name: str
+    ) -> Response[DatasetInfoData]:
         def _parse_dataset_info(dataset_info):
             _parse_dates(dataset_info, ["created_at", "finished_at"])
             for version in dataset_info.get("versions"):
                 _parse_dates(version, ["created_at"])
+            _parse_dates(dataset_info.get("project"), ["created_at"])
+            _parse_dates(dataset_info.get("project").get("namespace"), ["created_at"])

             return dataset_info

         response = self._send_request(
-            "datachain/datasets/info",
+            "datachain/datasets/info",
+            {"namespace": namespace, "project": project, "name": name},
+            method="GET",
         )
         if response.ok:
             response.data = _parse_dataset_info(response.data)
         return response

-    def dataset_rows_chunk(
-        self, name: str, version: str, offset: int
-    ) -> Response[DatasetRowsData]:
-        req_data = {"dataset_name": name, "dataset_version": version}
-        return self._send_request_msgpack(
-            "datachain/datasets/rows",
-            {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
-            method="GET",
-        )
-
     def dataset_job_versions(self, job_id: str) -> Response[DatasetJobVersionsData]:
         return self._send_request(
             "datachain/datasets/dataset_job_versions",

@@ -356,20 +381,30 @@
         )

     def export_dataset_table(
-        self,
+        self, dataset: DatasetRecord, version: str
     ) -> Response[DatasetExportSignedUrls]:
         return self._send_request(
             "datachain/datasets/export",
-            {
+            {
+                "namespace": dataset.project.namespace.name,
+                "project": dataset.project.name,
+                "name": dataset.name,
+                "version": version,
+            },
             method="GET",
         )

     def dataset_export_status(
-        self,
+        self, dataset: DatasetRecord, version: str
     ) -> Response[DatasetExportStatus]:
         return self._send_request(
             "datachain/datasets/export-status",
-            {
+            {
+                "namespace": dataset.project.namespace.name,
+                "project": dataset.project.name,
+                "name": dataset.name,
+                "version": version,
+            },
             method="GET",
         )
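With get_studio_env_variable as the single lookup path, the token, URL, and team settings all honor the new DATACHAIN_STUDIO_* variables while still reading the deprecated DVC_STUDIO_* ones. A small illustration of the precedence the diffed function implements; the variable values here are made up:

import os

from datachain.remote.studio import get_studio_env_variable

os.environ["DVC_STUDIO_TEAM"] = "old-team"        # deprecated prefix
os.environ["DATACHAIN_STUDIO_TEAM"] = "new-team"  # preferred prefix

# The DATACHAIN_STUDIO_ prefix wins when both are set. If only the
# DVC_STUDIO_ variable existed, its value would be returned along with a
# deprecation warning, and None comes back when neither is set.
assert get_studio_env_variable("TEAM") == "new-team"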
datachain/studio.py
CHANGED

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Optional
 import tabulate

 from datachain.config import Config, ConfigLevel
-from datachain.dataset import QUERY_DATASET_PREFIX
+from datachain.dataset import QUERY_DATASET_PREFIX, parse_dataset_name
 from datachain.error import DataChainError
 from datachain.remote.studio import StudioClient
 from datachain.utils import STUDIO_URL

@@ -102,11 +102,13 @@ def set_team(args: "Namespace"):
 def login(args: "Namespace"):
     from dvc_studio_client.auth import StudioAuthError, get_access_token

+    from datachain.remote.studio import get_studio_env_variable
+
     config = Config().read().get("studio", {})
     name = args.name
     hostname = (
         args.hostname
-        or
+        or get_studio_env_variable("URL")
         or config.get("url")
         or STUDIO_URL
     )

@@ -165,6 +167,11 @@ def token():


 def list_datasets(team: Optional[str] = None, name: Optional[str] = None):
+    def ds_full_name(ds: dict) -> str:
+        return (
+            f"{ds['project']['namespace']['name']}.{ds['project']['name']}.{ds['name']}"
+        )
+
     if name:
         yield from list_dataset_versions(team, name)
         return

@@ -181,18 +188,22 @@ def list_datasets(team: Optional[str] = None, name: Optional[str] = None):

     for d in response.data:
         name = d.get("name")
+        full_name = ds_full_name(d)
         if name and name.startswith(QUERY_DATASET_PREFIX):
             continue

         for v in d.get("versions", []):
             version = v.get("version")
-            yield (
+            yield (full_name, version)


 def list_dataset_versions(team: Optional[str] = None, name: str = ""):
     client = StudioClient(team=team)

-
+    namespace_name, project_name, name = parse_dataset_name(name)
+    if not namespace_name or not project_name:
+        raise DataChainError(f"Missing namespace or project form dataset name {name}")
+    response = client.dataset_info(namespace_name, project_name, name)

     if not response.ok:
         raise DataChainError(response.message)

@@ -208,12 +219,16 @@ def list_dataset_versions(team: Optional[str] = None, name: str = ""):
 def edit_studio_dataset(
     team_name: Optional[str],
     name: str,
+    namespace: str,
+    project: str,
     new_name: Optional[str] = None,
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
 ):
     client = StudioClient(team=team_name)
-    response = client.edit_dataset(
+    response = client.edit_dataset(
+        name, namespace, project, new_name, description, attrs
+    )
     if not response.ok:
         raise DataChainError(response.message)

@@ -223,11 +238,13 @@ def edit_studio_dataset(
 def remove_studio_dataset(
     team_name: Optional[str],
     name: str,
+    namespace: str,
+    project: str,
     version: Optional[str] = None,
     force: Optional[bool] = False,
 ):
     client = StudioClient(team=team_name)
-    response = client.rm_dataset(name, version, force)
+    response = client.rm_dataset(name, namespace, project, version, force)
     if not response.ok:
         raise DataChainError(response.message)
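On the CLI side, Studio datasets are now addressed by fully qualified "namespace.project.name" strings, which parse_dataset_name splits into its three parts, and the edit/remove helpers pass the namespace and project explicitly. A hedged sketch of the new calling convention; the team, namespace, project, and dataset names are hypothetical:

from datachain.studio import list_dataset_versions, remove_studio_dataset

# list_dataset_versions requires a fully qualified name and raises a
# DataChainError when the namespace or project part is missing.
for entry in list_dataset_versions(team="my-team", name="dev.animals.cats"):
    print(entry)

# Removal now targets the dataset inside its namespace and project
# (positional order per the diffed signature: name, namespace, project).
remove_studio_dataset("my-team", "cats", "dev", "animals", version="1.0.1", force=True)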
{datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.21.1
+Version: 0.22.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0

@@ -94,7 +94,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.16.
+Requires-Dist: mypy==1.16.1; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"