datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

Files changed (49)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +213 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +35 -26
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +433 -37
  12. datachain/data_storage/sqlite.py +140 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +128 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +253 -91
  21. datachain/lib/dc/datasets.py +103 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +2 -1
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/signal_schema.py +8 -0
  33. datachain/lib/tar.py +1 -2
  34. datachain/lib/udf.py +1 -1
  35. datachain/lib/udf_signature.py +1 -1
  36. datachain/lib/webdataset.py +30 -20
  37. datachain/listing.py +3 -1
  38. datachain/namespace.py +65 -0
  39. datachain/project.py +78 -0
  40. datachain/query/dataset.py +71 -46
  41. datachain/query/session.py +1 -1
  42. datachain/remote/studio.py +61 -26
  43. datachain/studio.py +23 -6
  44. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
  45. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
  46. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
  49. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
datachain/cli/commands/datasets.py CHANGED
@@ -8,7 +8,7 @@ if TYPE_CHECKING:
 
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
-from datachain.error import DatasetNotFoundError
+from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio
 
 
@@ -101,11 +101,14 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
 
     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield (d.name, v.version)
+            yield (d.full_name, v.version)
 
 
 def list_datasets_local_versions(catalog: "Catalog", name: str):
-    ds = catalog.get_dataset(name)
+    namespace_name, project_name, name = catalog.get_full_dataset_name(name)
+
+    project = catalog.metastore.get_project(project_name, namespace_name)
+    ds = catalog.get_dataset(name, project)
     for v in ds.versions:
         yield (name, v.version)
 
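The two changes above reflect the new namespace/project model in 0.23.0 (see the new datachain/namespace.py and datachain/project.py modules in the file list): datasets are listed under their full name and looked up via a resolved project. A minimal sketch of how a fully qualified name might split, assuming a dot-separated "namespace.project.name" convention; the real parsing is Catalog.get_full_dataset_name, which is not shown in this diff:

    # Hypothetical illustration only; the separator and fallback defaults
    # are assumptions, not the actual catalog implementation.
    def split_full_name(full_name: str) -> tuple[str, str, str]:
        parts = full_name.split(".")
        if len(parts) == 3:
            return parts[0], parts[1], parts[2]  # namespace, project, name
        return "local", "local", full_name  # assumed fallback for bare names

    assert split_full_name("dev.analytics.images") == ("dev", "analytics", "images")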
@@ -129,25 +132,27 @@ def rm_dataset(
     name: str,
     version: Optional[str] = None,
     force: Optional[bool] = False,
-    studio: bool = False,
-    local: bool = False,
-    all: bool = True,
+    studio: Optional[bool] = False,
     team: Optional[str] = None,
 ):
-    from datachain.studio import remove_studio_dataset
+    namespace_name, project_name, name = catalog.get_full_dataset_name(name)
 
-    token = Config().read().get("studio", {}).get("token")
-    all, local, studio = determine_flavors(studio, local, all, token)
+    if not catalog.metastore.is_local_dataset(namespace_name) and studio:
+        from datachain.studio import remove_studio_dataset
 
-    if all or local:
+        token = Config().read().get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain auth login'."
+            )
+        remove_studio_dataset(team, name, namespace_name, project_name, version, force)
+    else:
         try:
-            catalog.remove_dataset(name, version=version, force=force)
+            project = catalog.metastore.get_project(project_name, namespace_name)
+            catalog.remove_dataset(name, project, version=version, force=force)
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
 
-    if (all or studio) and token:
-        remove_studio_dataset(team, name, version, force)
-
 
 def edit_dataset(
     catalog: "Catalog",
@@ -155,21 +160,25 @@ def edit_dataset(
     new_name: Optional[str] = None,
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
-    studio: bool = False,
-    local: bool = False,
-    all: bool = True,
     team: Optional[str] = None,
 ):
-    from datachain.studio import edit_studio_dataset
-
-    token = Config().read().get("studio", {}).get("token")
-    all, local, studio = determine_flavors(studio, local, all, token)
+    namespace_name, project_name, name = catalog.get_full_dataset_name(name)
 
-    if all or local:
+    if catalog.metastore.is_local_dataset(namespace_name):
         try:
-            catalog.edit_dataset(name, new_name, description, attrs)
+            catalog.edit_dataset(
+                name, catalog.metastore.default_project, new_name, description, attrs
+            )
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
-
-    if (all or studio) and token:
-        edit_studio_dataset(team, name, new_name, description, attrs)
+    else:
+        from datachain.studio import edit_studio_dataset
+
+        token = Config().read().get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain auth login'."
+            )
+        edit_studio_dataset(
+            team, name, namespace_name, project_name, new_name, description, attrs
+        )
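rm_dataset and edit_dataset above now share one routing pattern: resolve the full name, then operate either on the local catalog or on Studio (which requires a token). A condensed sketch of that pattern; dispatch and the two callbacks are illustrative names, while get_full_dataset_name, is_local_dataset, Config, and DataChainError are the identifiers from the diff:

    from datachain.config import Config
    from datachain.error import DataChainError

    def dispatch(catalog, name, local_op, studio_op):
        # Resolve "namespace.project.name" into its parts (as in the diff).
        namespace, project_name, name = catalog.get_full_dataset_name(name)
        if catalog.metastore.is_local_dataset(namespace):
            project = catalog.metastore.get_project(project_name, namespace)
            local_op(name, project)
        else:
            token = Config().read().get("studio", {}).get("token")
            if not token:
                raise DataChainError(
                    "Not logged in to Studio. Log in with 'datachain auth login'."
                )
            studio_op(name, namespace, project_name)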
datachain/cli/commands/ls.py CHANGED
@@ -63,8 +63,8 @@ def ls_local(
             print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing = list(listings().collect("listing"))
-        for ls in listing:
+        listing = listings().to_list("listing")
+        for (ls,) in listing:
             print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
 
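The switch from collect() to to_list() changes the row shape: to_list returns one tuple per row even when a single column is requested, which is why the loop now unpacks (ls,). A short sketch, with a hypothetical chain:

    # `chain` is a hypothetical DataChain with a "listing" column.
    rows = chain.to_list("listing")  # [(ListingInfo(...),), (ListingInfo(...),)]
    for (ls,) in rows:               # each row is a 1-tuple, hence the unpacking
        print(f"{ls.uri}@v{ls.version}")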
datachain/cli/parser/__init__.py CHANGED
@@ -221,26 +221,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         nargs="+",
         help="Dataset attributes",
     )
-    parse_edit_dataset.add_argument(
-        "--studio",
-        action="store_true",
-        default=False,
-        help="Edit dataset from Studio",
-    )
-    parse_edit_dataset.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Edit local dataset only",
-    )
-    parse_edit_dataset.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Edit both datasets from studio and local",
-    )
     parse_edit_dataset.add_argument(
         "--team",
         action="store",
@@ -315,21 +295,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "--studio",
         action="store_true",
         default=False,
-        help="Remove dataset from Studio",
-    )
-    rm_dataset_parser.add_argument(
-        "-L",
-        "--local",
-        action="store_true",
-        default=False,
-        help="Remove local datasets only",
-    )
-    rm_dataset_parser.add_argument(
-        "-a",
-        "--all",
-        action="store_true",
-        default=True,
-        help="Remove both local and studio",
+        help="Remove dataset from Studio only",
     )
     rm_dataset_parser.add_argument(
         "--team",
datachain/client/fsspec.py CHANGED
@@ -207,13 +207,14 @@ class Client(ABC):
         )
 
     async def get_current_etag(self, file: "File") -> str:
+        file_path = file.get_path_normalized()
         kwargs = {}
         if self._is_version_aware():
            kwargs["version_id"] = file.version
         info = await self.fs._info(
-            self.get_full_path(file.path, file.version), **kwargs
+            self.get_full_path(file_path, file.version), **kwargs
         )
-        return self.info_to_file(info, file.path).etag
+        return self.info_to_file(info, file_path).etag
 
     def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
         info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
@@ -386,7 +387,8 @@ class Client(ABC):
             return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(
-            self.fs.open(self.get_full_path(file.path, file.version)), cb
+            self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
+            cb,
         )  # type: ignore[return-value]
 
     def upload(self, data: bytes, path: str) -> "File":
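Both Client changes above replace file.path with file.get_path_normalized() before building a full path. The method itself lives in datachain/lib/file.py (+77 -23 in this release) and is not shown here; as a rough illustration, normalization of this kind typically canonicalizes separators and rejects traversal, along these lines (assumed behavior, not the actual implementation):

    import posixpath

    # Assumed sketch of path normalization; the raise is a guessed guard.
    def get_path_normalized(path: str) -> str:
        normalized = posixpath.normpath(path)
        if normalized == "." or normalized.startswith(".."):
            raise ValueError(f"invalid file path: {path!r}")  # assumed guard
        return normalized

    assert get_path_normalized("dir/./img.png") == "dir/img.png"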
datachain/client/hf.py CHANGED
@@ -21,6 +21,9 @@ def _wrap_class(sync_fs_class):
     asynchronous to False by default. This is similar to other Async FS
     we initialize. E.g. it means we don't break things in Jupyter where code
     run in async.
+
+    This also fixes write operations by ensuring they are properly forwarded
+    to the underlying filesystem without async buffering issues.
     """
     from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
 
@@ -29,6 +32,13 @@ def _wrap_class(sync_fs_class):
             sync_fs = sync_fs_class(*args, **kwargs)
             super().__init__(sync_fs, asynchronous=False)
 
+        def open(self, path, mode="rb", **kwargs):
+            # Override open to ensure write operations work correctly.
+            # It seems to be a bug in the fsspec wrapper. It avoids
+            # wrapping open() explicitly but also doesn't redirect it to
+            # sync filesystem.
+            return self.sync_fs.open(path, mode, **kwargs)
+
     GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
     return GeneratedAsyncFileSystemWrapper
 
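The open() override matters because fsspec's AsyncFileSystemWrapper neither wraps open() nor forwards it to the wrapped filesystem, so write modes could hit a broken path. A usage sketch of the generated wrapper; the file path is hypothetical, and _wrap_class is the private factory shown in the diff above:

    from huggingface_hub import HfFileSystem

    from datachain.client.hf import _wrap_class  # private helper from the diff

    AsyncHfWrapper = _wrap_class(HfFileSystem)
    fs = AsyncHfWrapper()
    # open() now goes straight to the sync filesystem, so read and write
    # modes behave like plain HfFileSystem.
    with fs.open("datasets/some-user/some-repo/notes.txt", "rb") as f:  # hypothetical path
        data = f.read()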
datachain/client/local.py CHANGED
@@ -99,7 +99,7 @@ class FileClient(Client):
         )
 
     async def get_current_etag(self, file: "File") -> str:
-        info = self.fs.info(self.get_full_path(file.path))
+        info = self.fs.info(self.get_full_path(file.get_path_normalized()))
         return self.info_to_file(info, "").etag
 
     async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
@@ -138,8 +138,8 @@ class FileClient(Client):
         if not self.use_symlinks:
             super().fetch_nodes(nodes, shared_progress_bar)
 
-    def do_instantiate_object(self, uid, dst):
+    def do_instantiate_object(self, file: File, dst: str) -> None:
         if self.use_symlinks:
-            os.symlink(Path(self.name, uid.path), dst)
+            os.symlink(Path(self.name, file.path), dst)
         else:
-            super().do_instantiate_object(uid, dst)
+            super().do_instantiate_object(file, dst)
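Finally, do_instantiate_object now receives a File instead of a bare uid, with type annotations added. With use_symlinks enabled, "instantiating" a local object links back into the source tree rather than copying; in isolation, the symlink branch amounts to the following (paths hypothetical):

    import os
    from pathlib import Path

    name = "/data/source"         # hypothetical FileClient root (self.name)
    file_path = "images/cat.jpg"  # hypothetical File.path
    dst = "/tmp/out/cat.jpg"

    os.makedirs(os.path.dirname(dst), exist_ok=True)
    os.symlink(Path(name, file_path), dst)  # dst -> /data/source/images/cat.jpg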