datachain 0.20.4__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff shows the published contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- datachain/__init__.py +0 -2
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +65 -180
- datachain/cli/__init__.py +7 -0
- datachain/cli/commands/datasets.py +28 -43
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +35 -1
- datachain/client/fsspec.py +3 -5
- datachain/client/hf.py +0 -10
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +37 -405
- datachain/data_storage/sqlite.py +7 -136
- datachain/data_storage/warehouse.py +7 -26
- datachain/dataset.py +12 -126
- datachain/delta.py +7 -11
- datachain/error.py +0 -36
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +0 -4
- datachain/lib/dc/datachain.py +92 -260
- datachain/lib/dc/datasets.py +50 -104
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +0 -1
- datachain/lib/dc/storage.py +40 -38
- datachain/lib/file.py +23 -77
- datachain/lib/listing.py +1 -3
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +0 -10
- datachain/lib/tar.py +2 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +20 -30
- datachain/listing.py +1 -3
- datachain/query/dataset.py +46 -71
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +26 -61
- datachain/studio.py +7 -23
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
- datachain/lib/namespaces.py +0 -71
- datachain/lib/projects.py +0 -86
- datachain/namespace.py +0 -65
- datachain/project.py +0 -78
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/lib/settings.py
CHANGED
@@ -14,16 +14,12 @@ class Settings:
         workers=None,
         min_task_size=None,
         prefetch=None,
-        namespace=None,
-        project=None,
     ):
         self._cache = cache
         self.parallel = parallel
         self._workers = workers
         self.min_task_size = min_task_size
         self.prefetch = prefetch
-        self.namespace = namespace
-        self.project = project
 
         if not isinstance(cache, bool) and cache is not None:
             raise SettingsError(
@@ -71,10 +67,6 @@ class Settings:
             res["workers"] = self.workers
         if self.min_task_size is not None:
             res["min_task_size"] = self.min_task_size
-        if self.namespace is not None:
-            res["namespace"] = self.namespace
-        if self.project is not None:
-            res["project"] = self.project
         return res
 
     def add(self, settings: "Settings"):
@@ -82,7 +74,5 @@ class Settings:
         self.parallel = settings.parallel or self.parallel
         self._workers = settings._workers or self._workers
         self.min_task_size = settings.min_task_size or self.min_task_size
-        self.namespace = settings.namespace or self.namespace
-        self.project = settings.project or self.project
         if settings.prefetch is not None:
             self.prefetch = settings.prefetch
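The net effect of this hunk is that `Settings` no longer carries namespace/project scoping. A minimal sketch of the surviving surface, using only the keyword names visible above (the concrete values are invented; passing the removed `namespace=`/`project=` keywords would now raise a `TypeError`):

```python
# Sketch only: exercises the Settings fields that survive this change.
from datachain.lib.settings import Settings

s = Settings(cache=True, parallel=4, workers=2, min_task_size=100, prefetch=8)
s.add(Settings(prefetch=16))   # per the add() hunk, an explicit prefetch overrides
assert s.prefetch == 16
```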
datachain/lib/tar.py
CHANGED
@@ -6,11 +6,12 @@ from datachain.lib.file import File, TarVFile
 
 
 def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
+    new_parent = parent.get_full_name()
     etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
     etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
     return File(
         source=parent.source,
-        path=f"{
+        path=f"{new_parent}/{info.name}",
         version=parent.version,
         size=info.size,
         etag=etag,
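For reference, a standalone sketch (plain stdlib, invented values) of how the member path and etag come out of `build_tar_member` after this hunk: the member path is now rooted at `parent.get_full_name()`, while the etag derivation is unchanged.

```python
# Illustrative only; mirrors the two derivations visible in the hunk above.
import hashlib

parent_full_name = "archives/shard-000.tar"   # stand-in for parent.get_full_name()
parent_etag = "abc123"                        # stand-in for parent.etag
member_name, member_mtime = "images/0001.jpg", 1712345678  # stand-ins for info.name / info.mtime

path = f"{parent_full_name}/{member_name}"
etag_string = "-".join([parent_etag, member_name, str(member_mtime)])
etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
print(path, etag)
```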
datachain/lib/udf_signature.py
CHANGED
datachain/lib/webdataset.py
CHANGED
@@ -34,29 +34,29 @@ warnings.filterwarnings(
 
 
 class WDSError(DataChainError):
-    def __init__(self,
-        super().__init__(f"WebDataset error '{
+    def __init__(self, tar_stream, message: str):
+        super().__init__(f"WebDataset error '{tar_stream.get_full_name()}': {message}")
 
 
 class CoreFileDuplicationError(WDSError):
-    def __init__(self,
+    def __init__(self, tar_stream, file1: str, file2: str):
         super().__init__(
-
+            tar_stream, f"duplication of files with core extensions: {file1}, {file2}"
         )
 
 
 class CoreFileNotFoundError(WDSError):
-    def __init__(self,
+    def __init__(self, tar_stream, extensions, stem):
         super().__init__(
-
+            tar_stream,
             f"no files with the extensions '{','.join(extensions)}'"
             f" were found for file stem {stem}",
         )
 
 
 class UnknownFileExtensionError(WDSError):
-    def __init__(self,
-        super().__init__(
+    def __init__(self, tar_stream, name, ext):
+        super().__init__(tar_stream, f"unknown extension '{ext}' for file '{name}'")
 
 
 class WDSBasic(DataModel):
@@ -113,10 +113,10 @@ class Builder:
     def __init__(
         self,
         tar_stream: File,
-        core_extensions:
+        core_extensions: list[str],
         wds_class: type[WDSBasic],
-        tar
-        encoding
+        tar,
+        encoding="utf-8",
     ):
         self._core_extensions = core_extensions
         self._tar_stream = tar_stream
@@ -145,20 +145,18 @@ class Builder:
         if ext in self._core_extensions:
             if self.state.core_file is not None:
                 raise CoreFileDuplicationError(
-                    self._tar_stream
+                    self._tar_stream, file.name, self.state.core_file.name
                 )
             self.state.core_file = file
         elif ext in self.state.data:
             raise WDSError(
-                self._tar_stream
+                self._tar_stream,
                 f"file with extension '.{ext}' already exists in the archive",
             )
         else:
             type_ = self._get_type(ext)
             if type_ is None:
-                raise UnknownFileExtensionError(
-                    self._tar_stream.name, fstream.name, ext
-                )
+                raise UnknownFileExtensionError(self._tar_stream, fstream.name, ext)
 
             if issubclass(type_, WDSReadableSubclass):
                 reader = type_._reader
@@ -167,7 +165,7 @@ class Builder:
 
         if reader is None:
             raise WDSError(
-                self._tar_stream
+                self._tar_stream,
                 f"unable to find a reader for type {type_}, extension .{ext}",
             )
         self.state.data[ext] = reader(self, file)
@@ -175,7 +173,7 @@ class Builder:
     def produce(self):
         if self.state.core_file is None:
             raise CoreFileNotFoundError(
-                self._tar_stream
+                self._tar_stream, self._core_extensions, self.state.stem
             )
 
         file = build_tar_member(self._tar_stream, self.state.core_file)
@@ -196,13 +194,7 @@ class Builder:
         return anno
 
 
-def get_tar_groups(
-    stream: File,
-    tar: tarfile.TarFile,
-    core_extensions: Sequence[str],
-    spec: type[WDSBasic],
-    encoding: str = "utf-8",
-) -> Iterator[WDSBasic]:
+def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
     builder = Builder(stream, core_extensions, spec, tar, encoding)
 
     for item in sorted(tar.getmembers(), key=lambda m: Path(m.name).stem):
@@ -218,11 +210,9 @@ def get_tar_groups(
 
 
 def process_webdataset(
-    core_extensions: Sequence[str] = ("jpg", "png"),
-
-
-) -> Callable[[File], Iterator]:
-    def wds_func(file: File) -> Iterator[spec]:  # type: ignore[valid-type]
+    core_extensions: Sequence[str] = ("jpg", "png"), spec=WDSAllFile, encoding="utf-8"
+) -> Callable:
+    def wds_func(file: File) -> Iterator[spec]:
         with file.open() as fd:
             with tarfile.open(fileobj=fd) as tar:
                 yield from get_tar_groups(file, tar, core_extensions, spec, encoding)
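The recurring theme in these hunks is that the WDS error classes now take the tar-stream `File` itself (formatting the message via `get_full_name()`), and `process_webdataset` keeps its `(core_extensions, spec, encoding)` surface while dropping the stricter annotations. A small sketch under those assumptions; the `File` value and member names are invented:

```python
# Sketch only: names come from the hunks above, the File value is invented.
from datachain.lib.file import File
from datachain.lib.webdataset import UnknownFileExtensionError, WDSAllFile, process_webdataset

tar_stream = File(path="shards/shard-000.tar")

# Errors are now constructed from the stream File plus details, e.g.:
err = UnknownFileExtensionError(tar_stream, "sample.xyz", "xyz")
print(err)  # roughly: WebDataset error '<full name>': unknown extension 'xyz' for file 'sample.xyz'

# process_webdataset still returns a callable that yields grouped records per tar File:
wds_func = process_webdataset(spec=WDSAllFile)   # core_extensions defaults to ("jpg", "png")
# for group in wds_func(tar_stream): ...         # requires a real, openable tar file
```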
datachain/listing.py
CHANGED
@@ -66,9 +66,7 @@ class Listing:
     @cached_property
     def dataset(self) -> "DatasetRecord":
         assert self.dataset_name
-        return self.metastore.get_dataset(
-            self.dataset_name, self.metastore.listing_project.id
-        )
+        return self.metastore.get_dataset(self.dataset_name)
 
     @cached_property
     def dataset_rows(self):
datachain/query/dataset.py
CHANGED
@@ -41,13 +41,12 @@ from datachain.data_storage.schema import (
     partition_col_names,
     partition_columns,
 )
-from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
+from datachain.dataset import DATASET_PREFIX, DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
-from datachain.project import Project
 from datachain.query.schema import C, UDFParamSpec, normalize_param
 from datachain.query.session import Session
 from datachain.query.udf import UdfInfo
@@ -84,7 +83,7 @@ PartitionByType = Union[
     Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
 ]
 JoinPredicateType = Union[str, ColumnClause, ColumnElement]
-DatasetDependencyType = tuple[
+DatasetDependencyType = tuple[str, str]
 
 logger = logging.getLogger("datachain")
 
@@ -170,17 +169,18 @@ class QueryStep:
     """A query that returns all rows from specific dataset version"""
 
     catalog: "Catalog"
-
+    dataset_name: str
     dataset_version: str
 
     def apply(self) -> "StepResult":
         def q(*columns):
             return sqlalchemy.select(*columns)
 
-
+        dataset = self.catalog.get_dataset(self.dataset_name)
+        dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)
 
         return step_result(
-            q, dr.columns, dependencies=[(self.
+            q, dr.columns, dependencies=[(self.dataset_name, self.dataset_version)]
         )
 
 
@@ -1095,8 +1095,6 @@ class DatasetQuery:
         self,
         name: str,
         version: Optional[str] = None,
-        project_name: Optional[str] = None,
-        namespace_name: Optional[str] = None,
         catalog: Optional["Catalog"] = None,
         session: Optional[Session] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
@@ -1130,38 +1128,33 @@ class DatasetQuery:
         if version:
             self.version = version
 
-
-
-
-
-
-
-
+        if is_listing_dataset(name):
+            if version:
+                # this listing dataset should already be listed as we specify
+                # exact version
+                self._set_starting_step(self.catalog.get_dataset(name))
+            else:
+                # not setting query step yet as listing dataset might not exist at
+                # this point
+                self.list_ds_name = name
         elif fallback_to_studio and is_token_set():
             self._set_starting_step(
-                self.catalog.get_dataset_with_remote_fallback(
-                    name,
-                    namespace_name=namespace_name,
-                    project_name=project_name,
-                    version=version,
-                )
+                self.catalog.get_dataset_with_remote_fallback(name, version)
             )
         else:
-
-            self._set_starting_step(self.catalog.get_dataset(name, project=project))
+            self._set_starting_step(self.catalog.get_dataset(name))
 
     def _set_starting_step(self, ds: "DatasetRecord") -> None:
         if not self.version:
             self.version = ds.latest_version
 
-        self.starting_step = QueryStep(self.catalog, ds, self.version)
+        self.starting_step = QueryStep(self.catalog, ds.name, self.version)
 
         # at this point we know our starting dataset so setting up schemas
         self.feature_schema = ds.get_version(self.version).feature_schema
         self.column_types = copy(ds.schema)
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
-        self.project = ds.project
 
     def __iter__(self):
         return iter(self.db_results())
@@ -1169,6 +1162,21 @@ class DatasetQuery:
     def __or__(self, other):
         return self.union(other)
 
+    def pull_dataset(self, name: str, version: Optional[str] = None) -> "DatasetRecord":
+        print("Dataset not found in local catalog, trying to get from studio")
+
+        remote_ds_uri = f"{DATASET_PREFIX}{name}"
+        if version:
+            remote_ds_uri += f"@v{version}"
+
+        self.catalog.pull_dataset(
+            remote_ds_uri=remote_ds_uri,
+            local_ds_name=name,
+            local_ds_version=version,
+        )
+
+        return self.catalog.get_dataset(name)
+
     @staticmethod
     def get_table() -> "TableClause":
         table_name = "".join(
@@ -1649,8 +1657,6 @@ class DatasetQuery:
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
         partition_by: Optional[PartitionByType] = None,
-        namespace: Optional[str] = None,
-        project: Optional[str] = None,
         cache: bool = False,
     ) -> "Self":
         query = self.clone()
@@ -1670,36 +1676,26 @@ class DatasetQuery:
 
     def _add_dependencies(self, dataset: "DatasetRecord", version: str):
         dependencies: set[DatasetDependencyType] = set()
-        for
-            if Session.is_temp_dataset(
+        for dep_name, dep_version in self.dependencies:
+            if Session.is_temp_dataset(dep_name):
                 # temp dataset are created for optimization and they will be removed
                 # afterwards. Therefore, we should not put them as dependencies, but
                 # their own direct dependencies
                 for dep in self.catalog.get_dataset_dependencies(
-
-                    dep_dataset_version,
-                    dep_dataset.project,
-                    indirect=False,
+                    dep_name, dep_version, indirect=False
                 ):
                     if dep:
-
-                            dep.project, dep.namespace
-                        )
-                        dependencies.add(
-                            (
-                                self.catalog.get_dataset(dep.name, dep_project),
-                                dep.version,
-                            )
-                        )
+                        dependencies.add((dep.name, dep.version))
             else:
-                dependencies.add((
+                dependencies.add((dep_name, dep_version))
 
-        for
+        for dep_name, dep_version in dependencies:
+            # ds_dependency_name, ds_dependency_version = dependency
             self.catalog.metastore.add_dataset_dependency(
-                dataset,
+                dataset.name,
                 version,
-
-
+                dep_name,
+                dep_version,
             )
 
     def exec(self) -> "Self":
@@ -1715,7 +1711,6 @@ class DatasetQuery:
         self,
         name: Optional[str] = None,
         version: Optional[str] = None,
-        project: Optional[Project] = None,
        feature_schema: Optional[dict] = None,
        dependencies: Optional[list[DatasetDependency]] = None,
        description: Optional[str] = None,
@@ -1724,13 +1719,8 @@ class DatasetQuery:
         **kwargs,
     ) -> "Self":
         """Save the query as a dataset."""
-        project = project or self.catalog.metastore.default_project
         try:
-            if (
-                name
-                and version
-                and self.catalog.get_dataset(name, project).has_version(version)
-            ):
+            if name and version and self.catalog.get_dataset(name).has_version(version):
                 raise RuntimeError(f"Dataset {name} already has version {version}")
         except DatasetNotFoundError:
             pass
@@ -1755,7 +1745,6 @@ class DatasetQuery:
 
         dataset = self.catalog.create_dataset(
             name,
-            project,
             version=version,
             feature_schema=feature_schema,
             columns=columns,
@@ -1781,25 +1770,11 @@ class DatasetQuery:
 
         if dependencies:
             # overriding dependencies
-            self.dependencies =
-            for dep in dependencies:
-                dep_project = self.catalog.metastore.get_project(
-                    dep.project, dep.namespace
-                )
-                self.dependencies.add(
-                    (self.catalog.get_dataset(dep.name, dep_project), dep.version)
-                )
-
+            self.dependencies = {(dep.name, dep.version) for dep in dependencies}
             self._add_dependencies(dataset, version)  # type: ignore [arg-type]
         finally:
             self.cleanup()
-        return self.__class__(
-            name=name,
-            namespace_name=project.namespace.name,
-            project_name=project.name,
-            version=version,
-            catalog=self.catalog,
-        )
+        return self.__class__(name=name, version=version, catalog=self.catalog)
 
     @property
     def is_ordered(self) -> bool:
datachain/query/session.py
CHANGED
@@ -108,7 +108,7 @@ class Session:
         prefix = self.get_temp_prefix()
         try:
             for dataset in list(self.catalog.metastore.list_datasets_by_prefix(prefix)):
-                self.catalog.remove_dataset(dataset.name,
+                self.catalog.remove_dataset(dataset.name, force=True)
         # suppress error when metastore has been reset during testing
         except TableMissingError:
             pass
datachain/remote/studio.py
CHANGED
@@ -17,7 +17,6 @@ import websockets
 from requests.exceptions import HTTPError, Timeout
 
 from datachain.config import Config
-from datachain.dataset import DatasetRecord
 from datachain.error import DataChainError
 from datachain.utils import STUDIO_URL, retry_with_backoff
 
@@ -37,33 +36,13 @@ logger = logging.getLogger("datachain")
 DATASET_ROWS_CHUNK_SIZE = 8192
 
 
-def get_studio_env_variable(name: str) -> Any:
-    """
-    Get the value of a DataChain Studio environment variable.
-    It first checks for the variable prefixed with 'DATACHAIN_STUDIO_',
-    then checks for the deprecated 'DVC_STUDIO_' prefix.
-    If neither is set, it returns the provided default value.
-    """
-    if (value := os.environ.get(f"DATACHAIN_STUDIO_{name}")) is not None:
-        return value
-    if (value := os.environ.get(f"DVC_STUDIO_{name}")) is not None:  # deprecated
-        logger.warning(
-            "Environment variable 'DVC_STUDIO_%s' is deprecated, "
-            "use 'DATACHAIN_STUDIO_%s' instead.",
-            name,
-            name,
-        )
-        return value
-    return None
-
-
 def _is_server_error(status_code: int) -> bool:
     return str(status_code).startswith("5")
 
 
 def is_token_set() -> bool:
     return (
-        bool(
+        bool(os.environ.get("DVC_STUDIO_TOKEN"))
         or Config().read().get("studio", {}).get("token") is not None
     )
 
@@ -99,12 +78,12 @@ class StudioClient:
 
     @property
     def token(self) -> str:
-        token =
+        token = os.environ.get("DVC_STUDIO_TOKEN") or self.config.get("token")
 
         if not token:
             raise DataChainError(
                 "Studio token is not set. Use `datachain auth login` "
-                "or environment variable `
+                "or environment variable `DVC_STUDIO_TOKEN` to set it."
             )
 
         return token
@@ -112,8 +91,8 @@ class StudioClient:
     @property
     def url(self) -> str:
        return (
-
-        )
+            os.environ.get("DVC_STUDIO_URL") or self.config.get("url") or STUDIO_URL
+        ) + "/api"
 
     @property
     def config(self) -> dict:
@@ -128,13 +107,13 @@ class StudioClient:
         return self._team
 
     def _get_team(self) -> str:
-        team =
+        team = os.environ.get("DVC_STUDIO_TEAM") or self.config.get("team")
 
         if not team:
             raise DataChainError(
                 "Studio team is not set. "
                 "Use `datachain auth team <team_name>` "
-                "or environment variable `
+                "or environment variable `DVC_STUDIO_TEAM` to set it. "
                 "You can also set `studio.team` in the config file."
             )
 
@@ -312,17 +291,13 @@ class StudioClient:
     def edit_dataset(
         self,
         name: str,
-        namespace: str,
-        project: str,
         new_name: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
     ) -> Response[DatasetInfoData]:
         body = {
             "new_name": new_name,
-            "
-            "namespace": namespace,
-            "project": project,
+            "dataset_name": name,
             "description": description,
             "attrs": attrs,
         }
@@ -335,44 +310,44 @@ class StudioClient:
     def rm_dataset(
         self,
         name: str,
-        namespace: str,
-        project: str,
         version: Optional[str] = None,
         force: Optional[bool] = False,
     ) -> Response[DatasetInfoData]:
         return self._send_request(
             "datachain/datasets",
             {
-                "
-                "
-                "project": project,
-                "version": version,
+                "dataset_name": name,
+                "dataset_version": version,
                 "force": force,
             },
             method="DELETE",
         )
 
-    def dataset_info(
-        self, namespace: str, project: str, name: str
-    ) -> Response[DatasetInfoData]:
+    def dataset_info(self, name: str) -> Response[DatasetInfoData]:
         def _parse_dataset_info(dataset_info):
             _parse_dates(dataset_info, ["created_at", "finished_at"])
             for version in dataset_info.get("versions"):
                 _parse_dates(version, ["created_at"])
-            _parse_dates(dataset_info.get("project"), ["created_at"])
-            _parse_dates(dataset_info.get("project").get("namespace"), ["created_at"])
 
             return dataset_info
 
         response = self._send_request(
-            "datachain/datasets/info",
-            {"namespace": namespace, "project": project, "name": name},
-            method="GET",
+            "datachain/datasets/info", {"dataset_name": name}, method="GET"
        )
        if response.ok:
            response.data = _parse_dataset_info(response.data)
        return response
 
+    def dataset_rows_chunk(
+        self, name: str, version: str, offset: int
+    ) -> Response[DatasetRowsData]:
+        req_data = {"dataset_name": name, "dataset_version": version}
+        return self._send_request_msgpack(
+            "datachain/datasets/rows",
+            {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
+            method="GET",
+        )
+
     def dataset_job_versions(self, job_id: str) -> Response[DatasetJobVersionsData]:
         return self._send_request(
             "datachain/datasets/dataset_job_versions",
@@ -381,30 +356,20 @@ class StudioClient:
         )
 
     def export_dataset_table(
-        self,
+        self, name: str, version: str
     ) -> Response[DatasetExportSignedUrls]:
         return self._send_request(
             "datachain/datasets/export",
-            {
-                "namespace": dataset.project.namespace.name,
-                "project": dataset.project.name,
-                "name": dataset.name,
-                "version": version,
-            },
+            {"dataset_name": name, "dataset_version": version},
             method="GET",
         )
 
     def dataset_export_status(
-        self,
+        self, name: str, version: str
     ) -> Response[DatasetExportStatus]:
         return self._send_request(
             "datachain/datasets/export-status",
-            {
-                "namespace": dataset.project.namespace.name,
-                "project": dataset.project.name,
-                "name": dataset.name,
-                "version": version,
-            },
+            {"dataset_name": name, "dataset_version": version},
             method="GET",
         )
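Taken together, these hunks flatten the Studio request payloads back to `dataset_name`/`dataset_version` keys and drop the `DATACHAIN_STUDIO_*` fallback, so only the `DVC_STUDIO_TOKEN`/`DVC_STUDIO_URL`/`DVC_STUDIO_TEAM` environment variables (or the config file) are consulted. A hedged sketch of calling the per-name methods follows; client construction details are an assumption, and the dataset name and version are invented:

```python
# Sketch only: method names and signatures come from the hunks above; whether
# StudioClient() takes extra constructor arguments is not shown in this diff.
import os
from datachain.remote.studio import StudioClient, is_token_set

os.environ.setdefault("DVC_STUDIO_TOKEN", "<token>")  # or run `datachain auth login`
os.environ.setdefault("DVC_STUDIO_TEAM", "<team>")

if is_token_set():
    client = StudioClient()                      # assumed default construction
    info = client.dataset_info("cats-and-dogs")  # name only, no namespace/project
    rows = client.dataset_rows_chunk("cats-and-dogs", "1.0.0", offset=0)
    status = client.dataset_export_status("cats-and-dogs", "1.0.0")
```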