datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registries. It is provided for informational purposes only.
- datachain/__init__.py +2 -0
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +213 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +35 -26
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/parser/__init__.py +1 -35
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +433 -37
- datachain/data_storage/sqlite.py +140 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +128 -12
- datachain/delta.py +11 -7
- datachain/error.py +36 -0
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +253 -91
- datachain/lib/dc/datasets.py +103 -50
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/records.py +2 -1
- datachain/lib/dc/storage.py +38 -40
- datachain/lib/file.py +77 -23
- datachain/lib/listing.py +3 -1
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +71 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/pytorch.py +1 -1
- datachain/lib/settings.py +10 -0
- datachain/lib/signal_schema.py +8 -0
- datachain/lib/tar.py +1 -2
- datachain/lib/udf.py +1 -1
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +61 -26
- datachain/studio.py +23 -6
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
datachain/cli/commands/datasets.py
CHANGED

@@ -8,7 +8,7 @@ if TYPE_CHECKING:
 
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
-from datachain.error import DatasetNotFoundError
+from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio
 
 

@@ -101,11 +101,14 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
 
     for d in catalog.ls_datasets():
        for v in d.versions:
-            yield (d.name, v.version)
+            yield (d.full_name, v.version)
 
 
 def list_datasets_local_versions(catalog: "Catalog", name: str):
-    ds = catalog.get_dataset(name)
+    namespace_name, project_name, name = catalog.get_full_dataset_name(name)
+
+    project = catalog.metastore.get_project(project_name, namespace_name)
+    ds = catalog.get_dataset(name, project)
     for v in ds.versions:
         yield (name, v.version)
 

@@ -129,25 +132,27 @@ rm_dataset(
     name: str,
     version: Optional[str] = None,
     force: Optional[bool] = False,
-    studio: bool = False,
-    local: bool = False,
-    all: bool = True,
+    studio: Optional[bool] = False,
     team: Optional[str] = None,
 ):
-    from datachain.studio import remove_studio_dataset
+    namespace_name, project_name, name = catalog.get_full_dataset_name(name)
 
-    token = Config().read().get("studio", {}).get("token")
-    all, local, studio = determine_flavors(studio, local, all, token)
+    if not catalog.metastore.is_local_dataset(namespace_name) and studio:
+        from datachain.studio import remove_studio_dataset
 
-    if all or local:
+        token = Config().read().get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain auth login'."
+            )
+        remove_studio_dataset(team, name, namespace_name, project_name, version, force)
+    else:
         try:
-            catalog.remove_dataset(name, version=version, force=force)
+            project = catalog.metastore.get_project(project_name, namespace_name)
+            catalog.remove_dataset(name, project, version=version, force=force)
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
 
-    if (all or studio) and token:
-        remove_studio_dataset(team, name, version, force)
-
 
 def edit_dataset(
     catalog: "Catalog",

@@ -155,21 +160,25 @@ def edit_dataset(
     new_name: Optional[str] = None,
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
-    studio: bool = False,
-    local: bool = False,
-    all: bool = True,
     team: Optional[str] = None,
 ):
-    from datachain.studio import edit_studio_dataset
-
-    token = Config().read().get("studio", {}).get("token")
-    all, local, studio = determine_flavors(studio, local, all, token)
+    namespace_name, project_name, name = catalog.get_full_dataset_name(name)
 
-    if all or local:
+    if catalog.metastore.is_local_dataset(namespace_name):
         try:
-            catalog.edit_dataset(name, new_name, description, attrs)
+            catalog.edit_dataset(
+                name, catalog.metastore.default_project, new_name, description, attrs
+            )
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
-
-    if (all or studio) and token:
-        edit_studio_dataset(team, name, new_name, description, attrs)
+    else:
+        from datachain.studio import edit_studio_dataset
+
+        token = Config().read().get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain auth login'."
+            )
+        edit_studio_dataset(
+            team, name, namespace_name, project_name, new_name, description, attrs
+        )
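
Note: taken together, these hunks replace the old local/studio/all flag juggling with namespace- and project-qualified dataset names. A minimal sketch of the new resolution flow, assembled only from the calls visible above (catalog construction via get_catalog() is an assumption for illustration, and the qualified name is made up):

    from datachain.catalog import get_catalog

    catalog = get_catalog()

    # A fully qualified name splits into namespace, project, and base name;
    # how bare names fall back to defaults is an assumption here.
    namespace_name, project_name, name = catalog.get_full_dataset_name(
        "dev.analytics.clothes"
    )
    project = catalog.metastore.get_project(project_name, namespace_name)
    ds = catalog.get_dataset(name, project)
    for v in ds.versions:
        print(name, v.version)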
datachain/cli/commands/ls.py
CHANGED

@@ -63,8 +63,8 @@ def ls_local(
         print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing = list(listings().collect("listing"))
-        for ls in listing:
+        listing = listings().to_list("listing")
+        for (ls,) in listing:
             print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
 
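
Note: the new `for (ls,) in listing` unpacking appears to follow from the switch to to_list(), which returns rows as tuples with one element per requested column. A small illustration with arbitrary values (read_values/to_list are public DataChain APIs; the data is made up):

    import datachain as dc

    chain = dc.read_values(num=[1, 2, 3])
    rows = chain.to_list("num")  # [(1,), (2,), (3,)] -- one-tuples, not bare values
    for (num,) in rows:
        print(num)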
datachain/cli/parser/__init__.py
CHANGED

@@ -221,26 +221,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         nargs="+",
         help="Dataset attributes",
     )
-    parse_edit_dataset.add_argument(
-        "--studio",
-        action="store_true",
-        default=False,
-        help="Edit dataset from Studio",
-    )
-    parse_edit_dataset.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Edit local dataset only",
-    )
-    parse_edit_dataset.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Edit both datasets from studio and local",
-    )
     parse_edit_dataset.add_argument(
         "--team",
         action="store",

@@ -315,21 +295,7 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         "--studio",
         action="store_true",
         default=False,
-        help="Remove dataset from Studio",
-    )
-    rm_dataset_parser.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Remove local datasets only",
-    )
-    rm_dataset_parser.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Remove both local and studio",
+        help="Remove dataset from Studio only",
     )
     rm_dataset_parser.add_argument(
         "--team",
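
Note: with these two hunks, `dataset edit` loses its --studio/--local/--all flags entirely, and `dataset rm` keeps only --studio (help text now "Remove dataset from Studio only"); per the datasets.py changes above, the local/Studio split is now decided by the dataset's namespace rather than by flag combinations.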
datachain/client/fsspec.py
CHANGED

@@ -207,13 +207,14 @@ class Client(ABC):
         )
 
     async def get_current_etag(self, file: "File") -> str:
+        file_path = file.get_path_normalized()
         kwargs = {}
         if self._is_version_aware():
             kwargs["version_id"] = file.version
         info = await self.fs._info(
-            self.get_full_path(file.path, file.version), **kwargs
+            self.get_full_path(file_path, file.version), **kwargs
         )
-        return self.info_to_file(info, file.path).etag
+        return self.info_to_file(info, file_path).etag
 
     def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
         info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)

@@ -386,7 +387,8 @@ class Client(ABC):
             return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(
-            self.fs.open(self.get_full_path(file.path, file.version)), cb
+            self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
+            cb,
         )  # type: ignore[return-value]
 
     def upload(self, data: bytes, path: str) -> "File":
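
Note: both hunks route paths through File.get_path_normalized(), new in this release (see the datachain/lib/file.py entry in the file list). A hedged sketch of calling it directly -- the exact normalization rules are not shown in this diff and are an assumption:

    from datachain.lib.file import File

    f = File(source="s3://bucket", path="images//./cat.jpg")
    # get_path_normalized() returns a canonical form of .path before it is
    # handed to the filesystem; collapsing redundant separators is assumed.
    print(f.get_path_normalized())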
datachain/client/hf.py
CHANGED

@@ -21,6 +21,9 @@ def _wrap_class(sync_fs_class):
     asynchronous to False by default. This is similar to other Async FS
     we initialize. E.g. it means we don't break things in Jupyter where code
     run in async.
+
+    This also fixes write operations by ensuring they are properly forwarded
+    to the underlying filesystem without async buffering issues.
     """
     from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
 

@@ -29,6 +32,13 @@ def _wrap_class(sync_fs_class):
             sync_fs = sync_fs_class(*args, **kwargs)
             super().__init__(sync_fs, asynchronous=False)
 
+        def open(self, path, mode="rb", **kwargs):
+            # Override open to ensure write operations work correctly.
+            # It seems to be a bug in the fsspec wrapper. It avoids
+            # wrapping open() explicitly but also doesn't redirect it to
+            # sync filesystem.
+            return self.sync_fs.open(path, mode, **kwargs)
+
     GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
     return GeneratedAsyncFileSystemWrapper
 
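
Note: a minimal, self-contained sketch of the same forwarding pattern using fsspec's in-memory filesystem; DemoAsyncWrapper is illustrative and not part of datachain:

    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
    from fsspec.implementations.memory import MemoryFileSystem

    class DemoAsyncWrapper(AsyncFileSystemWrapper):
        def __init__(self):
            # Wrap a synchronous filesystem, as _wrap_class does above.
            super().__init__(MemoryFileSystem(), asynchronous=False)

        def open(self, path, mode="rb", **kwargs):
            # Forward open() to the wrapped synchronous filesystem so that
            # writes reach the real backend instead of the async shim.
            return self.sync_fs.open(path, mode, **kwargs)

    fs = DemoAsyncWrapper()
    with fs.open("/demo.txt", "wb") as f:
        f.write(b"hello")
    print(fs.open("/demo.txt", "rb").read())  # b'hello'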
datachain/client/local.py
CHANGED

@@ -99,7 +99,7 @@ class FileClient(Client):
         )
 
     async def get_current_etag(self, file: "File") -> str:
-        info = self.fs.info(self.get_full_path(file.path))
+        info = self.fs.info(self.get_full_path(file.get_path_normalized()))
         return self.info_to_file(info, "").etag
 
     async def get_size(self, path: str, version_id: Optional[str] = None) -> int:

@@ -138,8 +138,8 @@ class FileClient(Client):
         if not self.use_symlinks:
             super().fetch_nodes(nodes, shared_progress_bar)
 
-    def do_instantiate_object(self, uid: "UniqueId", dst: str) -> None:
+    def do_instantiate_object(self, file: File, dst: str) -> None:
         if self.use_symlinks:
-            os.symlink(Path(self.name, uid.path), dst)
+            os.symlink(Path(self.name, file.path), dst)
         else:
-            super().do_instantiate_object(uid, dst)
+            super().do_instantiate_object(file, dst)