datachain 0.21.0__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +180 -65
- datachain/cli/__init__.py +4 -9
- datachain/cli/commands/datasets.py +43 -28
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +422 -37
- datachain/data_storage/sqlite.py +136 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +126 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +260 -92
- datachain/lib/dc/datasets.py +104 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +1 -0
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +36 -10
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
datachain/cli/commands/datasets.py
CHANGED

@@ -8,7 +8,8 @@ if TYPE_CHECKING:
 
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
-from datachain.
+from datachain.dataset import parse_dataset_name
+from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio
 
 
@@ -101,11 +102,15 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
 
     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield (d.
+            yield (d.full_name, v.version)
 
 
 def list_datasets_local_versions(catalog: "Catalog", name: str):
-
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = namespace_name or catalog.metastore.default_namespace_name
+    project_name = project_name or catalog.metastore.default_project_name
+    project = catalog.metastore.get_project(project_name, namespace_name)
+    ds = catalog.get_dataset(name, project)
     for v in ds.versions:
         yield (name, v.version)
 
@@ -129,25 +134,29 @@ def rm_dataset(
     name: str,
     version: Optional[str] = None,
     force: Optional[bool] = False,
-    studio: bool = False,
-    local: bool = False,
-    all: bool = True,
+    studio: Optional[bool] = False,
     team: Optional[str] = None,
 ):
-
-
-
-
-
-
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = namespace_name or catalog.metastore.default_namespace_name
+    project_name = project_name or catalog.metastore.default_project_name
+
+    if not catalog.metastore.is_local_dataset(namespace_name) and studio:
+        from datachain.studio import remove_studio_dataset
+
+        token = Config().read().get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain auth login'."
+            )
+        remove_studio_dataset(team, name, namespace_name, project_name, version, force)
+    else:
         try:
-            catalog.
+            project = catalog.metastore.get_project(project_name, namespace_name)
+            catalog.remove_dataset(name, project, version=version, force=force)
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
 
-    if (all or studio) and token:
-        remove_studio_dataset(team, name, version, force)
-
 
 def edit_dataset(
     catalog: "Catalog",
@@ -155,21 +164,27 @@ def edit_dataset(
     new_name: Optional[str] = None,
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
-    studio: bool = False,
-    local: bool = False,
-    all: bool = True,
     team: Optional[str] = None,
 ):
-
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = namespace_name or catalog.metastore.default_namespace_name
+    project_name = project_name or catalog.metastore.default_project_name
 
-
-    all, local, studio = determine_flavors(studio, local, all, token)
-
-    if all or local:
+    if catalog.metastore.is_local_dataset(namespace_name):
         try:
-            catalog.edit_dataset(
+            catalog.edit_dataset(
+                name, catalog.metastore.default_project, new_name, description, attrs
+            )
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
-
-
-
+    else:
+        from datachain.studio import edit_studio_dataset
+
+        token = Config().read().get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain auth login'."
+            )
+        edit_studio_dataset(
+            team, name, namespace_name, project_name, new_name, description, attrs
+        )
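
Note: the recurring pattern in datasets.py above — parse an optionally fully-qualified dataset name, fall back to the metastore defaults, then resolve the project — can be read as a single helper. A minimal sketch using only calls that appear in this diff; the helper name resolve_dataset is hypothetical, and it assumes parse_dataset_name splits a dotted "namespace.project.name" string, returning empty parts when omitted:

    from datachain.dataset import parse_dataset_name

    def resolve_dataset(catalog, fq_name: str):
        # Split "namespace.project.name"; omitted parts fall back to defaults.
        namespace_name, project_name, name = parse_dataset_name(fq_name)
        namespace_name = namespace_name or catalog.metastore.default_namespace_name
        project_name = project_name or catalog.metastore.default_project_name
        project = catalog.metastore.get_project(project_name, namespace_name)
        return catalog.get_dataset(name, project)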
datachain/cli/commands/ls.py
CHANGED

@@ -63,8 +63,8 @@ def ls_local(
             print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing =
-        for ls in listing:
+        listing = listings().to_list("listing")
+        for (ls,) in listing:
             print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
 
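Note: to_list("listing") returns one tuple per row even when a single column is selected, which is why the new loop unpacks (ls,) rather than iterating values directly. A hedged illustration, assuming listings is the same helper already imported in this module:

    rows = listings().to_list("listing")  # [(listing,), (listing,), ...] — 1-tuples per row
    for (ls,) in rows:
        print(f"{ls.uri}@v{ls.version}")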
datachain/cli/parser/__init__.py
CHANGED

@@ -221,26 +221,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         nargs="+",
         help="Dataset attributes",
     )
-    parse_edit_dataset.add_argument(
-        "--studio",
-        action="store_true",
-        default=False,
-        help="Edit dataset from Studio",
-    )
-    parse_edit_dataset.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Edit local dataset only",
-    )
-    parse_edit_dataset.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Edit both datasets from studio and local",
-    )
     parse_edit_dataset.add_argument(
         "--team",
         action="store",
@@ -315,21 +295,7 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         "--studio",
         action="store_true",
         default=False,
-        help="Remove dataset from Studio",
-    )
-    rm_dataset_parser.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Remove local datasets only",
-    )
-    rm_dataset_parser.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Remove both local and studio",
+        help="Remove dataset from Studio only",
     )
     rm_dataset_parser.add_argument(
         "--team",
datachain/client/fsspec.py
CHANGED

@@ -207,13 +207,14 @@ class Client(ABC):
         )
 
     async def get_current_etag(self, file: "File") -> str:
+        file_path = file.get_path_normalized()
         kwargs = {}
         if self._is_version_aware():
             kwargs["version_id"] = file.version
         info = await self.fs._info(
-            self.get_full_path(
+            self.get_full_path(file_path, file.version), **kwargs
         )
-        return self.info_to_file(info,
+        return self.info_to_file(info, file_path).etag
 
     def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
         info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
@@ -386,7 +387,8 @@ class Client(ABC):
             return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(
-            self.fs.open(self.get_full_path(file.
+            self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
+            cb,
         )  # type: ignore[return-value]
 
     def upload(self, data: bytes, path: str) -> "File":
datachain/client/hf.py
CHANGED

@@ -21,6 +21,9 @@ def _wrap_class(sync_fs_class):
     asynchronous to False by default. This is similar to other Async FS
     we initialize. E.g. it means we don't break things in Jupyter where code
     run in async.
+
+    This also fixes write operations by ensuring they are properly forwarded
+    to the underlying filesystem without async buffering issues.
     """
     from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
 
@@ -29,6 +32,13 @@ def _wrap_class(sync_fs_class):
             sync_fs = sync_fs_class(*args, **kwargs)
             super().__init__(sync_fs, asynchronous=False)
 
+        def open(self, path, mode="rb", **kwargs):
+            # Override open to ensure write operations work correctly.
+            # It seems to be a bug in the fsspec wrapper. It avoids
+            # wrapping open() explicitly but also doesn't redirect it to
+            # sync filesystem.
+            return self.sync_fs.open(path, mode, **kwargs)
+
     GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
     return GeneratedAsyncFileSystemWrapper
 
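Note: the open() override exists because fsspec's AsyncFileSystemWrapper neither wraps open() nor forwards it to the wrapped sync filesystem, so writes could hit a broken code path. A sketch of how the generated wrapper might be used; wrapping huggingface_hub's HfFileSystem is an assumption based on this module's name, and the repo path is illustrative:

    from huggingface_hub import HfFileSystem  # assumed wrapped class

    AsyncHfWrapper = _wrap_class(HfFileSystem)  # _wrap_class as defined above
    fs = AsyncHfWrapper()

    # Writes now delegate straight to HfFileSystem.open() instead of the
    # async wrapper's default buffering path.
    with fs.open("datasets/some-user/some-repo/notes.txt", "wb") as f:
        f.write(b"hello")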
datachain/client/local.py
CHANGED

@@ -99,7 +99,7 @@ class FileClient(Client):
         )
 
     async def get_current_etag(self, file: "File") -> str:
-        info = self.fs.info(self.get_full_path(file.
+        info = self.fs.info(self.get_full_path(file.get_path_normalized()))
         return self.info_to_file(info, "").etag
 
     async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
@@ -138,8 +138,8 @@ class FileClient(Client):
         if not self.use_symlinks:
             super().fetch_nodes(nodes, shared_progress_bar)
 
-    def do_instantiate_object(self,
+    def do_instantiate_object(self, file: File, dst: str) -> None:
         if self.use_symlinks:
-            os.symlink(Path(self.name,
+            os.symlink(Path(self.name, file.path), dst)
         else:
-            super().do_instantiate_object(
+            super().do_instantiate_object(file, dst)