datachain 0.20.4__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic; see the registry listing for details.
- datachain/__init__.py +0 -2
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +65 -180
- datachain/cli/__init__.py +7 -0
- datachain/cli/commands/datasets.py +28 -43
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +35 -1
- datachain/client/fsspec.py +3 -5
- datachain/client/hf.py +0 -10
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +37 -405
- datachain/data_storage/sqlite.py +7 -136
- datachain/data_storage/warehouse.py +7 -26
- datachain/dataset.py +12 -126
- datachain/delta.py +7 -11
- datachain/error.py +0 -36
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +0 -4
- datachain/lib/dc/datachain.py +92 -260
- datachain/lib/dc/datasets.py +50 -104
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +0 -1
- datachain/lib/dc/storage.py +40 -38
- datachain/lib/file.py +23 -77
- datachain/lib/listing.py +1 -3
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +0 -10
- datachain/lib/tar.py +2 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +20 -30
- datachain/listing.py +1 -3
- datachain/query/dataset.py +46 -71
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +26 -61
- datachain/studio.py +7 -23
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
- datachain/lib/namespaces.py +0 -71
- datachain/lib/projects.py +0 -86
- datachain/namespace.py +0 -65
- datachain/project.py +0 -78
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.4.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
datachain/cli/commands/datasets.py
CHANGED

@@ -8,8 +8,7 @@ if TYPE_CHECKING:
 
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
-from datachain.
-from datachain.error import DataChainError, DatasetNotFoundError
+from datachain.error import DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio
 
 
@@ -102,15 +101,11 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
 
     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield (d.
+            yield (d.name, v.version)
 
 
 def list_datasets_local_versions(catalog: "Catalog", name: str):
-
-    namespace_name = namespace_name or catalog.metastore.default_namespace_name
-    project_name = project_name or catalog.metastore.default_project_name
-    project = catalog.metastore.get_project(project_name, namespace_name)
-    ds = catalog.get_dataset(name, project)
+    ds = catalog.get_dataset(name)
     for v in ds.versions:
         yield (name, v.version)
 
@@ -134,29 +129,25 @@ def rm_dataset(
     name: str,
     version: Optional[str] = None,
     force: Optional[bool] = False,
-    studio:
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
     team: Optional[str] = None,
 ):
-
-
-
-
-
-
-
-        token = Config().read().get("studio", {}).get("token")
-        if not token:
-            raise DataChainError(
-                "Not logged in to Studio. Log in with 'datachain auth login'."
-            )
-        remove_studio_dataset(team, name, namespace_name, project_name, version, force)
-    else:
+    from datachain.studio import remove_studio_dataset
+
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = determine_flavors(studio, local, all, token)
+
+    if all or local:
         try:
-
-            catalog.remove_dataset(name, project, version=version, force=force)
+            catalog.remove_dataset(name, version=version, force=force)
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
 
+    if (all or studio) and token:
+        remove_studio_dataset(team, name, version, force)
+
 
 def edit_dataset(
     catalog: "Catalog",
@@ -164,27 +155,21 @@ def edit_dataset(
     new_name: Optional[str] = None,
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
     team: Optional[str] = None,
 ):
-
-    namespace_name = namespace_name or catalog.metastore.default_namespace_name
-    project_name = project_name or catalog.metastore.default_project_name
+    from datachain.studio import edit_studio_dataset
 
-
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = determine_flavors(studio, local, all, token)
+
+    if all or local:
         try:
-            catalog.edit_dataset(
-                name, catalog.metastore.default_project, new_name, description, attrs
-            )
+            catalog.edit_dataset(name, new_name, description, attrs)
         except DatasetNotFoundError:
            print("Dataset not found in local", file=sys.stderr)
-
-
-
-    token = Config().read().get("studio", {}).get("token")
-    if not token:
-        raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain auth login'."
-        )
-    edit_studio_dataset(
-        team, name, namespace_name, project_name, new_name, description, attrs
-    )
+
+    if (all or studio) and token:
+        edit_studio_dataset(team, name, new_name, description, attrs)
datachain/cli/commands/ls.py
CHANGED

@@ -63,8 +63,8 @@ def ls_local(
             print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing = listings().
-        for
+        listing = list(listings().collect("listing"))
+        for ls in listing:
             print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
 
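The rewritten ls_local materializes the listings generator with list(...) before printing, so tqdm progress output (emitted while the generator is consumed) cannot interleave with the printed entries. A generic illustration of the pattern, with made-up names:

from tqdm import tqdm

def scan():
    # Imagine each step advances a progress bar while yielding results,
    # as collecting listings may do while scanning storage.
    for i in tqdm(range(3), desc="scan"):
        yield f"entry-{i}"

entries = list(scan())  # drain the generator: the bar finishes first
for entry in entries:
    print(entry)        # clean output, no bar fragments in between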
datachain/cli/parser/__init__.py
CHANGED

@@ -221,6 +221,26 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         nargs="+",
         help="Dataset attributes",
     )
+    parse_edit_dataset.add_argument(
+        "--studio",
+        action="store_true",
+        default=False,
+        help="Edit dataset from Studio",
+    )
+    parse_edit_dataset.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="Edit local dataset only",
+    )
+    parse_edit_dataset.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="Edit both datasets from studio and local",
+    )
     parse_edit_dataset.add_argument(
         "--team",
         action="store",
@@ -295,7 +315,21 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "--studio",
         action="store_true",
         default=False,
-        help="Remove dataset from Studio
+        help="Remove dataset from Studio",
+    )
+    rm_dataset_parser.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="Remove local datasets only",
+    )
+    rm_dataset_parser.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="Remove both local and studio",
     )
     rm_dataset_parser.add_argument(
         "--team",
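The same --studio / -L/--local / -a/--all trio is attached to both the edit and rm subcommands. One subtlety: with action="store_true" and default=True, --all is always truthy from argparse's point of view, so the narrowing has to happen downstream (e.g. in determine_flavors). A self-contained sketch with an illustrative parser, not datachain's real one:

from argparse import ArgumentParser

parser = ArgumentParser(prog="example")  # illustrative, not the datachain CLI
parser.add_argument("--studio", action="store_true", default=False)
parser.add_argument("-L", "--local", action="store_true", default=False)
parser.add_argument("-a", "--all", action="store_true", default=True)

args = parser.parse_args(["-L"])
print(args.studio, args.local, args.all)  # False True True; "all" stays True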
datachain/client/fsspec.py
CHANGED

@@ -207,14 +207,13 @@ class Client(ABC):
         )
 
     async def get_current_etag(self, file: "File") -> str:
-        file_path = file.get_path_normalized()
         kwargs = {}
         if self._is_version_aware():
            kwargs["version_id"] = file.version
         info = await self.fs._info(
-            self.get_full_path(
+            self.get_full_path(file.path, file.version), **kwargs
         )
-        return self.info_to_file(info,
+        return self.info_to_file(info, file.path).etag
 
     def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
         info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
@@ -387,8 +386,7 @@ class Client(ABC):
             return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(
-            self.fs.open(self.get_full_path(file.
-            cb,
+            self.fs.open(self.get_full_path(file.path, file.version)), cb
         )  # type: ignore[return-value]
 
     def upload(self, data: bytes, path: str) -> "File":
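get_current_etag now derives the remote path from file.path and file.version directly, passing version_id only when the filesystem is version-aware. A rough synchronous sketch of that lookup against plain fsspec (the bucket, path, and version id are made up, an S3-like backend is assumed, and _info/info_to_file are datachain internals not shown here):

import fsspec

fs = fsspec.filesystem("s3")      # assumes s3fs and valid credentials
path = "my-bucket/data/file.csv"  # made-up object path
version = None                    # e.g. an S3 version id when versioning is on

kwargs = {}
if version is not None:           # stands in for self._is_version_aware()
    kwargs["version_id"] = version

info = fs.info(path, **kwargs)    # sync counterpart of `await self.fs._info(...)`
etag = info.get("ETag")           # S3-style key; other backends name it differently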
datachain/client/hf.py
CHANGED

@@ -21,9 +21,6 @@ def _wrap_class(sync_fs_class):
     asynchronous to False by default. This is similar to other Async FS
     we initialize. E.g. it means we don't break things in Jupyter where code
     run in async.
-
-    This also fixes write operations by ensuring they are properly forwarded
-    to the underlying filesystem without async buffering issues.
     """
     from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
 
@@ -32,13 +29,6 @@ def _wrap_class(sync_fs_class):
             sync_fs = sync_fs_class(*args, **kwargs)
             super().__init__(sync_fs, asynchronous=False)
 
-        def open(self, path, mode="rb", **kwargs):
-            # Override open to ensure write operations work correctly.
-            # It seems to be a bug in the fsspec wrapper. It avoids
-            # wrapping open() explicitly but also doesn't redirect it to
-            # sync filesystem.
-            return self.sync_fs.open(path, mode, **kwargs)
-
     GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
     return GeneratedAsyncFileSystemWrapper
 
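With the custom open() override removed, the generated class relies entirely on fsspec's AsyncFileSystemWrapper. A minimal sketch of the same wrapping pattern around a local filesystem (the wrapper class name here is illustrative):

from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
from fsspec.implementations.local import LocalFileSystem

class AsyncLocalFileSystemWrapper(AsyncFileSystemWrapper):
    def __init__(self, *args, **kwargs):
        sync_fs = LocalFileSystem(*args, **kwargs)
        # asynchronous=False keeps sync entry points usable, e.g. in
        # Jupyter where an event loop is already running.
        super().__init__(sync_fs, asynchronous=False)

fs = AsyncLocalFileSystemWrapper()
print(fs.ls("."))  # sync methods are mirrored from the wrapped filesystem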
datachain/client/local.py
CHANGED

@@ -99,7 +99,7 @@ class FileClient(Client):
         )
 
     async def get_current_etag(self, file: "File") -> str:
-        info = self.fs.info(self.get_full_path(file.
+        info = self.fs.info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag
 
     async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
@@ -138,8 +138,8 @@ class FileClient(Client):
         if not self.use_symlinks:
             super().fetch_nodes(nodes, shared_progress_bar)
 
-    def do_instantiate_object(self,
+    def do_instantiate_object(self, uid, dst):
         if self.use_symlinks:
-            os.symlink(Path(self.name,
+            os.symlink(Path(self.name, uid.path), dst)
         else:
-            super().do_instantiate_object(
+            super().do_instantiate_object(uid, dst)