datachain 0.21.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datachain has been flagged as possibly problematic.

Files changed (48)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +180 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +43 -28
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +422 -37
  12. datachain/data_storage/sqlite.py +136 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +126 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +260 -92
  21. datachain/lib/dc/datasets.py +104 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +1 -0
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/tar.py +1 -2
  33. datachain/lib/udf.py +1 -1
  34. datachain/lib/udf_signature.py +1 -1
  35. datachain/lib/webdataset.py +30 -20
  36. datachain/listing.py +3 -1
  37. datachain/namespace.py +65 -0
  38. datachain/project.py +78 -0
  39. datachain/query/dataset.py +71 -46
  40. datachain/query/session.py +1 -1
  41. datachain/remote/studio.py +61 -26
  42. datachain/studio.py +23 -6
  43. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
  44. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
  45. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
  46. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
datachain/cli/commands/datasets.py CHANGED
@@ -8,7 +8,8 @@ if TYPE_CHECKING:
 
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
-from datachain.error import DatasetNotFoundError
+from datachain.dataset import parse_dataset_name
+from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio
 
 
@@ -101,11 +102,15 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
 
     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield (d.name, v.version)
+            yield (d.full_name, v.version)
 
 
 def list_datasets_local_versions(catalog: "Catalog", name: str):
-    ds = catalog.get_dataset(name)
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = namespace_name or catalog.metastore.default_namespace_name
+    project_name = project_name or catalog.metastore.default_project_name
+    project = catalog.metastore.get_project(project_name, namespace_name)
+    ds = catalog.get_dataset(name, project)
     for v in ds.versions:
         yield (name, v.version)
 
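For readers following along: `parse_dataset_name` (imported above from `datachain.dataset`) splits an optionally qualified dataset name into namespace, project, and base name, and the call sites fall back to the metastore defaults for missing parts. A minimal sketch of the assumed behavior, taking a dotted `namespace.project.name` convention as given (the authoritative rules live in `datachain/dataset.py`, `+126 -12` in this release):

    from typing import Optional

    def parse_dataset_name_sketch(name: str) -> tuple[Optional[str], Optional[str], str]:
        # Hypothetical re-implementation for illustration only: assumes a
        # dotted "namespace.project.name" layout where namespace and
        # project may be omitted.
        parts = name.split(".")
        if len(parts) == 1:
            return None, None, parts[0]
        if len(parts) == 2:
            return None, parts[0], parts[1]
        namespace, project, *rest = parts
        return namespace, project, ".".join(rest)

    assert parse_dataset_name_sketch("cats") == (None, None, "cats")
    assert parse_dataset_name_sketch("ml.vision.cats") == ("ml", "vision", "cats")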
@@ -129,25 +134,29 @@ def rm_dataset(
     name: str,
     version: Optional[str] = None,
     force: Optional[bool] = False,
-    studio: bool = False,
-    local: bool = False,
-    all: bool = True,
+    studio: Optional[bool] = False,
     team: Optional[str] = None,
 ):
-    from datachain.studio import remove_studio_dataset
-
-    token = Config().read().get("studio", {}).get("token")
-    all, local, studio = determine_flavors(studio, local, all, token)
-
-    if all or local:
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = namespace_name or catalog.metastore.default_namespace_name
+    project_name = project_name or catalog.metastore.default_project_name
+
+    if not catalog.metastore.is_local_dataset(namespace_name) and studio:
+        from datachain.studio import remove_studio_dataset
+
+        token = Config().read().get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain auth login'."
+            )
+        remove_studio_dataset(team, name, namespace_name, project_name, version, force)
+    else:
         try:
-            catalog.remove_dataset(name, version=version, force=force)
+            project = catalog.metastore.get_project(project_name, namespace_name)
+            catalog.remove_dataset(name, project, version=version, force=force)
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
 
-    if (all or studio) and token:
-        remove_studio_dataset(team, name, version, force)
-
 
 def edit_dataset(
     catalog: "Catalog",
@@ -155,21 +164,27 @@ def edit_dataset(
     new_name: Optional[str] = None,
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
-    studio: bool = False,
-    local: bool = False,
-    all: bool = True,
     team: Optional[str] = None,
 ):
-    from datachain.studio import edit_studio_dataset
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = namespace_name or catalog.metastore.default_namespace_name
+    project_name = project_name or catalog.metastore.default_project_name
 
-    token = Config().read().get("studio", {}).get("token")
-    all, local, studio = determine_flavors(studio, local, all, token)
-
-    if all or local:
+    if catalog.metastore.is_local_dataset(namespace_name):
         try:
-            catalog.edit_dataset(name, new_name, description, attrs)
+            catalog.edit_dataset(
+                name, catalog.metastore.default_project, new_name, description, attrs
+            )
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
-
-    if (all or studio) and token:
-        edit_studio_dataset(team, name, new_name, description, attrs)
+    else:
+        from datachain.studio import edit_studio_dataset
+
+        token = Config().read().get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain auth login'."
+            )
+        edit_studio_dataset(
+            team, name, namespace_name, project_name, new_name, description, attrs
+        )
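The net effect of both rewrites above: the `--local`/`--all` flags are gone, and each command routes on the parsed namespace, contacting Studio only for non-local namespaces and failing fast without a saved token. A condensed sketch of the assumed decision flow (names taken from the diff; not the actual implementation):

    from datachain.dataset import parse_dataset_name

    def pick_target(catalog, name: str, studio: bool = False) -> str:
        # Condensed, illustrative version of the routing in rm_dataset;
        # edit_dataset applies the same namespace check with the branches
        # inverted. Not the actual implementation.
        namespace, _project, _name = parse_dataset_name(name)
        namespace = namespace or catalog.metastore.default_namespace_name
        if not catalog.metastore.is_local_dataset(namespace) and studio:
            return "studio"  # requires a saved Studio token
        return "local"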
datachain/cli/commands/ls.py CHANGED
@@ -63,8 +63,8 @@ def ls_local(
             print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing = list(listings().collect("listing"))
-        for ls in listing:
+        listing = listings().to_list("listing")
+        for (ls,) in listing:
             print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
 
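One subtlety in this `ls` change: judging by the new loop, `to_list("listing")` returns a list of row tuples, one element per requested column, so a single-column request still produces one-tuples; that is why the loop unpacks `(ls,)`. A toy illustration with placeholder values:

    # Assumed return shape of listings().to_list("listing"): one tuple per
    # row, with one element per requested column (placeholder values below).
    listing = [("s3://bucket@v1",), ("s3://bucket@v2",)]
    for (ls,) in listing:  # unpack the single "listing" column from each row
        print(ls)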
datachain/cli/parser/__init__.py CHANGED
@@ -221,26 +221,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         nargs="+",
         help="Dataset attributes",
     )
-    parse_edit_dataset.add_argument(
-        "--studio",
-        action="store_true",
-        default=False,
-        help="Edit dataset from Studio",
-    )
-    parse_edit_dataset.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Edit local dataset only",
-    )
-    parse_edit_dataset.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Edit both datasets from studio and local",
-    )
     parse_edit_dataset.add_argument(
         "--team",
         action="store",
@@ -315,21 +295,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "--studio",
         action="store_true",
         default=False,
-        help="Remove dataset from Studio",
-    )
-    rm_dataset_parser.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Remove local datasets only",
-    )
-    rm_dataset_parser.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Remove both local and studio",
+        help="Remove dataset from Studio only",
     )
     rm_dataset_parser.add_argument(
         "--team",
datachain/client/fsspec.py CHANGED
@@ -207,13 +207,14 @@ class Client(ABC):
         )
 
     async def get_current_etag(self, file: "File") -> str:
+        file_path = file.get_path_normalized()
         kwargs = {}
         if self._is_version_aware():
             kwargs["version_id"] = file.version
         info = await self.fs._info(
-            self.get_full_path(file.path, file.version), **kwargs
+            self.get_full_path(file_path, file.version), **kwargs
         )
-        return self.info_to_file(info, file.path).etag
+        return self.info_to_file(info, file_path).etag
 
     def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
         info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
@@ -386,7 +387,8 @@ class Client(ABC):
             return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(
-            self.fs.open(self.get_full_path(file.path, file.version)), cb
+            self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
+            cb,
         )  # type: ignore[return-value]
 
     def upload(self, data: bytes, path: str) -> "File":
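Both `Client` call sites above (and `FileClient.get_current_etag` below) now normalize the file path before hitting the filesystem. The exact semantics of `File.get_path_normalized()` are not shown in this diff (it presumably lands with the `datachain/lib/file.py` changes, `+77 -23`); a hypothetical sketch of what such a helper might do for POSIX-style keys:

    import posixpath

    def get_path_normalized_sketch(path: str) -> str:
        # Hypothetical stand-in for File.get_path_normalized(): collapse
        # redundant separators and "."/".." segments so storage lookups
        # use a canonical key.
        normalized = posixpath.normpath(path)
        if normalized == "." or normalized.startswith(".."):
            raise ValueError(f"invalid file path: {path!r}")
        return normalized

    assert get_path_normalized_sketch("dir//sub/./file.txt") == "dir/sub/file.txt"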
datachain/client/hf.py CHANGED
@@ -21,6 +21,9 @@ def _wrap_class(sync_fs_class):
     asynchronous to False by default. This is similar to other Async FS
     we initialize. E.g. it means we don't break things in Jupyter where code
     run in async.
+
+    This also fixes write operations by ensuring they are properly forwarded
+    to the underlying filesystem without async buffering issues.
     """
     from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
 
@@ -29,6 +32,13 @@ def _wrap_class(sync_fs_class):
             sync_fs = sync_fs_class(*args, **kwargs)
             super().__init__(sync_fs, asynchronous=False)
 
+        def open(self, path, mode="rb", **kwargs):
+            # Override open to ensure write operations work correctly.
+            # It seems to be a bug in the fsspec wrapper. It avoids
+            # wrapping open() explicitly but also doesn't redirect it to
+            # sync filesystem.
+            return self.sync_fs.open(path, mode, **kwargs)
+
     GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
     return GeneratedAsyncFileSystemWrapper
 
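Per the comment in the diff, fsspec's `AsyncFileSystemWrapper` does not wrap `open()` and also does not forward it to the wrapped sync filesystem, which breaks write modes. A minimal standalone version of the same delegation pattern, using `LocalFileSystem` instead of the Hugging Face filesystem for illustration:

    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
    from fsspec.implementations.local import LocalFileSystem

    class PatchedLocalWrapper(AsyncFileSystemWrapper):
        def __init__(self):
            super().__init__(LocalFileSystem(), asynchronous=False)

        def open(self, path, mode="rb", **kwargs):
            # Forward straight to the wrapped sync filesystem so write
            # modes ("wb", "ab") behave like plain LocalFileSystem opens.
            return self.sync_fs.open(path, mode, **kwargs)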
datachain/client/local.py CHANGED
@@ -99,7 +99,7 @@ class FileClient(Client):
         )
 
     async def get_current_etag(self, file: "File") -> str:
-        info = self.fs.info(self.get_full_path(file.path))
+        info = self.fs.info(self.get_full_path(file.get_path_normalized()))
         return self.info_to_file(info, "").etag
 
     async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
@@ -138,8 +138,8 @@ class FileClient(Client):
         if not self.use_symlinks:
             super().fetch_nodes(nodes, shared_progress_bar)
 
-    def do_instantiate_object(self, uid, dst):
+    def do_instantiate_object(self, file: File, dst: str) -> None:
         if self.use_symlinks:
-            os.symlink(Path(self.name, uid.path), dst)
+            os.symlink(Path(self.name, file.path), dst)
         else:
-            super().do_instantiate_object(uid, dst)
+            super().do_instantiate_object(file, dst)
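With `use_symlinks` enabled, "instantiating" an object therefore links the destination back to the original file under the client root instead of copying bytes. A tiny illustration of that branch with hypothetical paths:

    import os
    from pathlib import Path

    def instantiate_via_symlink(root: str, rel_path: str, dst: str) -> None:
        # Illustrative only (hypothetical paths): with use_symlinks=True the
        # client points dst at the original file rather than copying bytes.
        os.symlink(Path(root, rel_path), dst)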