datachain 0.20.4__py3-none-any.whl → 0.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (47)
  1. datachain/__init__.py +0 -2
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +65 -180
  4. datachain/cli/__init__.py +11 -2
  5. datachain/cli/commands/datasets.py +28 -43
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +35 -1
  8. datachain/client/fsspec.py +3 -5
  9. datachain/client/hf.py +0 -10
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +37 -405
  12. datachain/data_storage/sqlite.py +7 -136
  13. datachain/data_storage/warehouse.py +7 -26
  14. datachain/dataset.py +12 -126
  15. datachain/delta.py +7 -11
  16. datachain/error.py +0 -36
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +0 -4
  20. datachain/lib/dc/datachain.py +92 -260
  21. datachain/lib/dc/datasets.py +50 -104
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +0 -1
  24. datachain/lib/dc/storage.py +40 -38
  25. datachain/lib/file.py +23 -77
  26. datachain/lib/listing.py +1 -3
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/pytorch.py +1 -1
  29. datachain/lib/settings.py +0 -10
  30. datachain/lib/tar.py +2 -1
  31. datachain/lib/udf_signature.py +1 -1
  32. datachain/lib/webdataset.py +20 -30
  33. datachain/listing.py +1 -3
  34. datachain/query/dataset.py +46 -71
  35. datachain/query/session.py +1 -1
  36. datachain/remote/studio.py +26 -61
  37. datachain/studio.py +20 -27
  38. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/METADATA +2 -2
  39. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/RECORD +43 -47
  40. datachain/lib/namespaces.py +0 -71
  41. datachain/lib/projects.py +0 -86
  42. datachain/namespace.py +0 -65
  43. datachain/project.py +0 -78
  44. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/WHEEL +0 -0
  45. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/licenses/LICENSE +0 -0
  47. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/top_level.txt +0 -0
datachain/cli/commands/datasets.py CHANGED
@@ -8,8 +8,7 @@ if TYPE_CHECKING:
 
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
-from datachain.dataset import parse_dataset_name
-from datachain.error import DataChainError, DatasetNotFoundError
+from datachain.error import DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio
 
 
@@ -102,15 +101,11 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
 
     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield (d.full_name, v.version)
+            yield (d.name, v.version)
 
 
 def list_datasets_local_versions(catalog: "Catalog", name: str):
-    namespace_name, project_name, name = parse_dataset_name(name)
-    namespace_name = namespace_name or catalog.metastore.default_namespace_name
-    project_name = project_name or catalog.metastore.default_project_name
-    project = catalog.metastore.get_project(project_name, namespace_name)
-    ds = catalog.get_dataset(name, project)
+    ds = catalog.get_dataset(name)
     for v in ds.versions:
         yield (name, v.version)
 
@@ -134,29 +129,25 @@ def rm_dataset(
     name: str,
     version: Optional[str] = None,
     force: Optional[bool] = False,
-    studio: Optional[bool] = False,
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
     team: Optional[str] = None,
 ):
-    namespace_name, project_name, name = parse_dataset_name(name)
-    namespace_name = namespace_name or catalog.metastore.default_namespace_name
-    project_name = project_name or catalog.metastore.default_project_name
-
-    if not catalog.metastore.is_local_dataset(namespace_name) and studio:
-        from datachain.studio import remove_studio_dataset
-
-        token = Config().read().get("studio", {}).get("token")
-        if not token:
-            raise DataChainError(
-                "Not logged in to Studio. Log in with 'datachain auth login'."
-            )
-        remove_studio_dataset(team, name, namespace_name, project_name, version, force)
-    else:
+    from datachain.studio import remove_studio_dataset
+
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = determine_flavors(studio, local, all, token)
+
+    if all or local:
         try:
-            project = catalog.metastore.get_project(project_name, namespace_name)
-            catalog.remove_dataset(name, project, version=version, force=force)
+            catalog.remove_dataset(name, version=version, force=force)
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
 
+    if (all or studio) and token:
+        remove_studio_dataset(team, name, version, force)
+
 
 def edit_dataset(
     catalog: "Catalog",
@@ -164,27 +155,21 @@ def edit_dataset(
     new_name: Optional[str] = None,
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
     team: Optional[str] = None,
 ):
-    namespace_name, project_name, name = parse_dataset_name(name)
-    namespace_name = namespace_name or catalog.metastore.default_namespace_name
-    project_name = project_name or catalog.metastore.default_project_name
+    from datachain.studio import edit_studio_dataset
 
-    if catalog.metastore.is_local_dataset(namespace_name):
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = determine_flavors(studio, local, all, token)
+
+    if all or local:
         try:
-            catalog.edit_dataset(
-                name, catalog.metastore.default_project, new_name, description, attrs
-            )
+            catalog.edit_dataset(name, new_name, description, attrs)
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)
-    else:
-        from datachain.studio import edit_studio_dataset
-
-        token = Config().read().get("studio", {}).get("token")
-        if not token:
-            raise DataChainError(
-                "Not logged in to Studio. Log in with 'datachain auth login'."
-            )
-        edit_studio_dataset(
-            team, name, namespace_name, project_name, new_name, description, attrs
-        )
+
+    if (all or studio) and token:
+        edit_studio_dataset(team, name, new_name, description, attrs)
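Both rm_dataset and edit_dataset now resolve which "flavors" (local, Studio, or both) to act on through determine_flavors, imported from datachain.cli.utils in the first hunk. Only the call shape is confirmed by this diff; the following is a minimal sketch of plausible resolution logic, with the body an assumption rather than the shipped implementation:

from typing import Optional

# Hypothetical body for determine_flavors(); only the signature and the
# (all, local, studio) return order are confirmed by the call sites above.
def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
    if studio or local:
        # An explicit --studio or --local narrows the default --all scope.
        all = False
    if not token:
        # Without a Studio token, Studio-side operations cannot run.
        studio = False
    return all, local, studio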
datachain/cli/commands/ls.py CHANGED
@@ -63,8 +63,8 @@ def ls_local(
         print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing = listings().to_list("listing")
-        for (ls,) in listing:
+        listing = list(listings().collect("listing"))
+        for ls in listing:
             print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
 
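As the loop change above shows, to_list returns one tuple per row while collect yields bare values when a single column is requested, which is why the tuple unpacking disappears. Schematically (listings() as in the diff; the shapes are inferred from the two loop forms):

rows = listings().to_list("listing")          # [(ls1,), (ls2,), ...]
for (ls,) in rows:                            # unpack 1-tuples
    ...

values = list(listings().collect("listing"))  # [ls1, ls2, ...]
for ls in values:                             # bare values, no unpacking
    ...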
datachain/cli/parser/__init__.py CHANGED
@@ -221,6 +221,26 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         nargs="+",
         help="Dataset attributes",
     )
+    parse_edit_dataset.add_argument(
+        "--studio",
+        action="store_true",
+        default=False,
+        help="Edit dataset from Studio",
+    )
+    parse_edit_dataset.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="Edit local dataset only",
+    )
+    parse_edit_dataset.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="Edit both datasets from studio and local",
+    )
     parse_edit_dataset.add_argument(
         "--team",
         action="store",
@@ -295,7 +315,21 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "--studio",
         action="store_true",
         default=False,
-        help="Remove dataset from Studio only",
+        help="Remove dataset from Studio",
+    )
+    rm_dataset_parser.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="Remove local datasets only",
+    )
+    rm_dataset_parser.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="Remove both local and studio",
     )
     rm_dataset_parser.add_argument(
         "--team",
datachain/client/fsspec.py CHANGED
@@ -207,14 +207,13 @@ class Client(ABC):
         )
 
     async def get_current_etag(self, file: "File") -> str:
-        file_path = file.get_path_normalized()
         kwargs = {}
         if self._is_version_aware():
             kwargs["version_id"] = file.version
         info = await self.fs._info(
-            self.get_full_path(file_path, file.version), **kwargs
+            self.get_full_path(file.path, file.version), **kwargs
         )
-        return self.info_to_file(info, file_path).etag
+        return self.info_to_file(info, file.path).etag
 
     def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
         info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
@@ -387,8 +386,7 @@ class Client(ABC):
             return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(
-            self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
-            cb,
+            self.fs.open(self.get_full_path(file.path, file.version)), cb
         )  # type: ignore[return-value]
 
     def upload(self, data: bytes, path: str) -> "File":
datachain/client/hf.py CHANGED
@@ -21,9 +21,6 @@ def _wrap_class(sync_fs_class):
     asynchronous to False by default. This is similar to other Async FS
     we initialize. E.g. it means we don't break things in Jupyter where code
     run in async.
-
-    This also fixes write operations by ensuring they are properly forwarded
-    to the underlying filesystem without async buffering issues.
     """
     from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
 
@@ -32,13 +29,6 @@ def _wrap_class(sync_fs_class):
             sync_fs = sync_fs_class(*args, **kwargs)
             super().__init__(sync_fs, asynchronous=False)
 
-        def open(self, path, mode="rb", **kwargs):
-            # Override open to ensure write operations work correctly.
-            # It seems to be a bug in the fsspec wrapper. It avoids
-            # wrapping open() explicitly but also doesn't redirect it to
-            # sync filesystem.
-            return self.sync_fs.open(path, mode, **kwargs)
-
     GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
     return GeneratedAsyncFileSystemWrapper
 
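For context, _wrap_class builds a subclass of fsspec's AsyncFileSystemWrapper around a synchronous filesystem; the removed open() override suggests the wrapper now forwards open() correctly on its own, and the docstring drops the matching claim. A minimal sketch of the same wrapping pattern, assuming an fsspec release that ships asyn_wrapper (MemoryFileSystem stands in for the Hugging Face filesystem):

from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
from fsspec.implementations.memory import MemoryFileSystem

class AsyncMemoryFileSystemWrapper(AsyncFileSystemWrapper):
    def __init__(self, *args, **kwargs):
        # Same shape as the diff: construct the sync fs eagerly and keep
        # asynchronous=False so it is safe under a running event loop (Jupyter).
        sync_fs = MemoryFileSystem(*args, **kwargs)
        super().__init__(sync_fs, asynchronous=False)

fs = AsyncMemoryFileSystemWrapper()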
datachain/client/local.py CHANGED
@@ -99,7 +99,7 @@ class FileClient(Client):
         )
 
     async def get_current_etag(self, file: "File") -> str:
-        info = self.fs.info(self.get_full_path(file.get_path_normalized()))
+        info = self.fs.info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag
 
     async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
@@ -138,8 +138,8 @@ class FileClient(Client):
         if not self.use_symlinks:
             super().fetch_nodes(nodes, shared_progress_bar)
 
-    def do_instantiate_object(self, file: File, dst: str) -> None:
+    def do_instantiate_object(self, uid, dst):
         if self.use_symlinks:
-            os.symlink(Path(self.name, file.path), dst)
+            os.symlink(Path(self.name, uid.path), dst)
         else:
-            super().do_instantiate_object(file, dst)
+            super().do_instantiate_object(uid, dst)
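The behavior here is unchanged by the rename: with use_symlinks set, FileClient materializes an object as a symlink into the client's root instead of copying bytes. A standalone illustration of that path (all names and paths hypothetical):

import os
from pathlib import Path

name = "/data/source-root"      # hypothetical FileClient.name (client root)
path = "images/cat.jpg"         # hypothetical uid.path (path relative to root)
dst = "/tmp/workspace/cat.jpg"  # destination to "instantiate"

# Same call as the diff: link to the original file rather than copying it.
os.symlink(Path(name, path), dst)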