datachain 0.21.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic; review the change details below.

Files changed (48):
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +180 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +43 -28
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +422 -37
  12. datachain/data_storage/sqlite.py +136 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +126 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +260 -92
  21. datachain/lib/dc/datasets.py +104 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +1 -0
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/tar.py +1 -2
  33. datachain/lib/udf.py +1 -1
  34. datachain/lib/udf_signature.py +1 -1
  35. datachain/lib/webdataset.py +30 -20
  36. datachain/listing.py +3 -1
  37. datachain/namespace.py +65 -0
  38. datachain/project.py +78 -0
  39. datachain/query/dataset.py +71 -46
  40. datachain/query/session.py +1 -1
  41. datachain/remote/studio.py +61 -26
  42. datachain/studio.py +23 -6
  43. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
  44. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
  45. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
  46. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,71 @@
1
+ from typing import Optional
2
+
3
+ from datachain.error import NamespaceCreateNotAllowedError
4
+ from datachain.namespace import Namespace
5
+ from datachain.query import Session
6
+
7
+
8
def create(
    name: str, descr: Optional[str] = None, session: Optional[Session] = None
) -> Namespace:
    """
    Create and return a new namespace.

    Namespaces group projects, which in turn group datasets. A default
    namespace always exists and is used when none is specified. Additional
    namespaces can be created in Studio only; the CLI exposes just the
    default one.

    Parameters:
        name: Name of the namespace to create.
        descr: Optional human-readable description of the namespace.
        session: Optional session to run the operation in.

    Example:
        ```py
        from datachain.lib.namespaces import create as create_namespace
        namespace = create_namespace("dev", "Dev namespace")
        ```
    """
    metastore = Session.get(session).catalog.metastore

    # The metastore decides whether namespace creation is permitted at all
    # (e.g. only allowed in Studio, not locally).
    if not metastore.namespace_allowed_to_create:
        raise NamespaceCreateNotAllowedError("Creating namespace is not allowed")

    Namespace.validate_name(name)
    return metastore.create_namespace(name, descr)
37
+
38
+
39
def get(name: str, session: Optional[Session] = None) -> Namespace:
    """
    Return the namespace with the given name.

    Raises `NamespaceNotFoundError` if no namespace with that name exists.

    Parameters:
        name: The name of the namespace to look up.
        session: Optional session to use for the lookup.

    Example:
        ```py
        import datachain as dc
        namespace = dc.get_namespace("local")
        ```
    """
    metastore = Session.get(session).catalog.metastore
    return metastore.get_namespace(name)
56
+
57
+
58
def ls(session: Optional[Session] = None) -> list[Namespace]:
    """
    Return a list of all namespaces.

    Parameters:
        session: Optional session to use for the lookup.

    Example:
        ```py
        from datachain.lib.namespaces import ls as ls_namespaces
        namespaces = ls_namespaces()
        ```
    """
    metastore = Session.get(session).catalog.metastore
    return metastore.list_namespaces()
@@ -0,0 +1,86 @@
1
+ from typing import Optional
2
+
3
+ from datachain.error import ProjectCreateNotAllowedError
4
+ from datachain.project import Project
5
+ from datachain.query import Session
6
+
7
+
8
def create(
    namespace: str,
    name: str,
    descr: Optional[str] = None,
    session: Optional[Session] = None,
) -> Project:
    """
    Create and return a new project inside a namespace.

    Projects organize datasets. A default project is always available;
    additional projects can be created in Studio only, not via the CLI.

    Parameters:
        namespace: Namespace to create the project in. Created if it
            doesn't exist.
        name: Name of the project to create.
        descr: Optional human-readable description of the project.
        session: Optional session to run the operation in.

    Example:
        ```py
        import datachain as dc
        project = dc.create_project("dev", "my-project", "My personal project")
        ```
    """
    metastore = Session.get(session).catalog.metastore

    # The metastore decides whether project creation is permitted at all
    # (e.g. only allowed in Studio, not locally).
    if not metastore.project_allowed_to_create:
        raise ProjectCreateNotAllowedError("Creating project is not allowed")

    Project.validate_name(name)
    return metastore.create_project(namespace, name, descr)
41
+
42
+
43
def get(name: str, namespace: str, session: Optional[Session] = None) -> Project:
    """
    Gets a project by name in some namespace.

    If the project is not found, a `ProjectNotFoundError` is raised.

    Parameters:
        name: The name of the project.
        namespace: The name of the namespace the project belongs to.
        session: Optional session to use for getting the project.

    Example:
        ```py
        from datachain.lib.projects import get as get_project
        project = get_project("my-project", "local")
        ```
    """
    # Default `session` to None (fix: it previously had no default), making the
    # signature consistent with create()/ls() so callers are not forced to pass
    # an explicit session just to look a project up.
    return Session.get(session).catalog.metastore.get_project(name, namespace)
61
+
62
+
63
def ls(
    namespace: Optional[str] = None, session: Optional[Session] = None
) -> list[Project]:
    """
    List projects, either within one namespace or across all of them.

    Parameters:
        namespace: Optional namespace name; when omitted, projects from
            every namespace are returned.
        session: Optional session to use for the lookup.

    Example:
        ```py
        from datachain.lib.projects import ls as ls_projects
        local_namespace_projects = ls_projects("local")
        all_projects = ls_projects()
        ```
    """
    metastore = Session.get(session).catalog.metastore
    if namespace:
        # Resolve the namespace name to its id; raises if it doesn't exist.
        return metastore.list_projects(metastore.get_namespace(namespace).id)
    return metastore.list_projects(None)
datachain/lib/pytorch.py CHANGED
@@ -130,7 +130,7 @@ class PytorchDataset(IterableDataset):
130
130
  if self.num_samples > 0:
131
131
  ds = ds.sample(self.num_samples)
132
132
  ds = ds.chunk(total_rank, total_workers)
133
- yield from ds.collect()
133
+ yield from ds.to_iter()
134
134
 
135
135
  def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
136
136
  from datachain.lib.udf import _prefetch_inputs
datachain/lib/settings.py CHANGED
@@ -14,12 +14,16 @@ class Settings:
14
14
  workers=None,
15
15
  min_task_size=None,
16
16
  prefetch=None,
17
+ namespace=None,
18
+ project=None,
17
19
  ):
18
20
  self._cache = cache
19
21
  self.parallel = parallel
20
22
  self._workers = workers
21
23
  self.min_task_size = min_task_size
22
24
  self.prefetch = prefetch
25
+ self.namespace = namespace
26
+ self.project = project
23
27
 
24
28
  if not isinstance(cache, bool) and cache is not None:
25
29
  raise SettingsError(
@@ -67,6 +71,10 @@ class Settings:
67
71
  res["workers"] = self.workers
68
72
  if self.min_task_size is not None:
69
73
  res["min_task_size"] = self.min_task_size
74
+ if self.namespace is not None:
75
+ res["namespace"] = self.namespace
76
+ if self.project is not None:
77
+ res["project"] = self.project
70
78
  return res
71
79
 
72
80
  def add(self, settings: "Settings"):
@@ -74,5 +82,7 @@ class Settings:
74
82
  self.parallel = settings.parallel or self.parallel
75
83
  self._workers = settings._workers or self._workers
76
84
  self.min_task_size = settings.min_task_size or self.min_task_size
85
+ self.namespace = settings.namespace or self.namespace
86
+ self.project = settings.project or self.project
77
87
  if settings.prefetch is not None:
78
88
  self.prefetch = settings.prefetch
datachain/lib/tar.py CHANGED
@@ -6,12 +6,11 @@ from datachain.lib.file import File, TarVFile
6
6
 
7
7
 
8
8
  def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
9
- new_parent = parent.get_full_name()
10
9
  etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
11
10
  etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
12
11
  return File(
13
12
  source=parent.source,
14
- path=f"{new_parent}/{info.name}",
13
+ path=f"{parent.path}/{info.name}",
15
14
  version=parent.version,
16
15
  size=info.size,
17
16
  etag=etag,
datachain/lib/udf.py CHANGED
@@ -309,7 +309,7 @@ async def _prefetch_input(
309
309
  after_prefetch: "Callable[[], None]" = noop,
310
310
  ) -> T:
311
311
  for obj in row:
312
- if isinstance(obj, File) and await obj._prefetch(download_cb):
312
+ if isinstance(obj, File) and obj.path and await obj._prefetch(download_cb):
313
313
  after_prefetch()
314
314
  return row
315
315
 
@@ -16,7 +16,7 @@ class UdfSignatureError(DataChainParamsError):
16
16
 
17
17
 
18
18
  @dataclass
19
- class UdfSignature:
19
+ class UdfSignature: # noqa: PLW1641
20
20
  func: Union[Callable, UDFBase]
21
21
  params: dict[str, Union[DataType, Any]]
22
22
  output_schema: SignalSchema
@@ -34,29 +34,29 @@ warnings.filterwarnings(
34
34
 
35
35
 
36
36
  class WDSError(DataChainError):
37
- def __init__(self, tar_stream, message: str):
38
- super().__init__(f"WebDataset error '{tar_stream.get_full_name()}': {message}")
37
+ def __init__(self, tar_name: str, message: str):
38
+ super().__init__(f"WebDataset error '{tar_name}': {message}")
39
39
 
40
40
 
41
41
  class CoreFileDuplicationError(WDSError):
42
- def __init__(self, tar_stream, file1: str, file2: str):
42
+ def __init__(self, tar_name: str, file1: str, file2: str):
43
43
  super().__init__(
44
- tar_stream, f"duplication of files with core extensions: {file1}, {file2}"
44
+ tar_name, f"duplication of files with core extensions: {file1}, {file2}"
45
45
  )
46
46
 
47
47
 
48
48
  class CoreFileNotFoundError(WDSError):
49
- def __init__(self, tar_stream, extensions, stem):
49
+ def __init__(self, tar_name: str, extensions: Sequence[str], stem: str):
50
50
  super().__init__(
51
- tar_stream,
51
+ tar_name,
52
52
  f"no files with the extensions '{','.join(extensions)}'"
53
53
  f" were found for file stem {stem}",
54
54
  )
55
55
 
56
56
 
57
57
  class UnknownFileExtensionError(WDSError):
58
- def __init__(self, tar_stream, name, ext):
59
- super().__init__(tar_stream, f"unknown extension '{ext}' for file '{name}'")
58
+ def __init__(self, tar_name, name: str, ext: str):
59
+ super().__init__(tar_name, f"unknown extension '{ext}' for file '{name}'")
60
60
 
61
61
 
62
62
  class WDSBasic(DataModel):
@@ -113,10 +113,10 @@ class Builder:
113
113
  def __init__(
114
114
  self,
115
115
  tar_stream: File,
116
- core_extensions: list[str],
116
+ core_extensions: Sequence[str],
117
117
  wds_class: type[WDSBasic],
118
- tar,
119
- encoding="utf-8",
118
+ tar: tarfile.TarFile,
119
+ encoding: str = "utf-8",
120
120
  ):
121
121
  self._core_extensions = core_extensions
122
122
  self._tar_stream = tar_stream
@@ -145,18 +145,20 @@ class Builder:
145
145
  if ext in self._core_extensions:
146
146
  if self.state.core_file is not None:
147
147
  raise CoreFileDuplicationError(
148
- self._tar_stream, file.name, self.state.core_file.name
148
+ self._tar_stream.name, file.name, self.state.core_file.name
149
149
  )
150
150
  self.state.core_file = file
151
151
  elif ext in self.state.data:
152
152
  raise WDSError(
153
- self._tar_stream,
153
+ self._tar_stream.name,
154
154
  f"file with extension '.{ext}' already exists in the archive",
155
155
  )
156
156
  else:
157
157
  type_ = self._get_type(ext)
158
158
  if type_ is None:
159
- raise UnknownFileExtensionError(self._tar_stream, fstream.name, ext)
159
+ raise UnknownFileExtensionError(
160
+ self._tar_stream.name, fstream.name, ext
161
+ )
160
162
 
161
163
  if issubclass(type_, WDSReadableSubclass):
162
164
  reader = type_._reader
@@ -165,7 +167,7 @@ class Builder:
165
167
 
166
168
  if reader is None:
167
169
  raise WDSError(
168
- self._tar_stream,
170
+ self._tar_stream.name,
169
171
  f"unable to find a reader for type {type_}, extension .{ext}",
170
172
  )
171
173
  self.state.data[ext] = reader(self, file)
@@ -173,7 +175,7 @@ class Builder:
173
175
  def produce(self):
174
176
  if self.state.core_file is None:
175
177
  raise CoreFileNotFoundError(
176
- self._tar_stream, self._core_extensions, self.state.stem
178
+ self._tar_stream.name, self._core_extensions, self.state.stem
177
179
  )
178
180
 
179
181
  file = build_tar_member(self._tar_stream, self.state.core_file)
@@ -194,7 +196,13 @@ class Builder:
194
196
  return anno
195
197
 
196
198
 
197
- def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
199
+ def get_tar_groups(
200
+ stream: File,
201
+ tar: tarfile.TarFile,
202
+ core_extensions: Sequence[str],
203
+ spec: type[WDSBasic],
204
+ encoding: str = "utf-8",
205
+ ) -> Iterator[WDSBasic]:
198
206
  builder = Builder(stream, core_extensions, spec, tar, encoding)
199
207
 
200
208
  for item in sorted(tar.getmembers(), key=lambda m: Path(m.name).stem):
@@ -210,9 +218,11 @@ def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
210
218
 
211
219
 
212
220
  def process_webdataset(
213
- core_extensions: Sequence[str] = ("jpg", "png"), spec=WDSAllFile, encoding="utf-8"
214
- ) -> Callable:
215
- def wds_func(file: File) -> Iterator[spec]:
221
+ core_extensions: Sequence[str] = ("jpg", "png"),
222
+ spec: type[WDSBasic] = WDSAllFile,
223
+ encoding: str = "utf-8",
224
+ ) -> Callable[[File], Iterator]:
225
+ def wds_func(file: File) -> Iterator[spec]: # type: ignore[valid-type]
216
226
  with file.open() as fd:
217
227
  with tarfile.open(fileobj=fd) as tar:
218
228
  yield from get_tar_groups(file, tar, core_extensions, spec, encoding)
datachain/listing.py CHANGED
@@ -66,7 +66,9 @@ class Listing:
66
66
  @cached_property
67
67
  def dataset(self) -> "DatasetRecord":
68
68
  assert self.dataset_name
69
- return self.metastore.get_dataset(self.dataset_name)
69
+ return self.metastore.get_dataset(
70
+ self.dataset_name, self.metastore.listing_project.id
71
+ )
70
72
 
71
73
  @cached_property
72
74
  def dataset_rows(self):
datachain/namespace.py ADDED
@@ -0,0 +1,65 @@
1
+ import builtins
2
+ from dataclasses import dataclass, fields
3
+ from datetime import datetime
4
+ from typing import Any, Optional, TypeVar
5
+
6
+ from datachain.error import InvalidNamespaceNameError
7
+
8
N = TypeVar("N", bound="Namespace")
NAMESPACE_NAME_RESERVED_CHARS = ["."]


@dataclass(frozen=True)
class Namespace:
    """Immutable record of a namespace: a named container that groups projects."""

    id: int
    uuid: str
    name: str
    descr: Optional[str]
    created_at: datetime

    @staticmethod
    def validate_name(name: str) -> None:
        """Throws exception if name is invalid, otherwise returns None"""
        if not name:
            raise InvalidNamespaceNameError("Namespace name cannot be empty")

        # Reject the first reserved character found in the name.
        for char in NAMESPACE_NAME_RESERVED_CHARS:
            if char in name:
                raise InvalidNamespaceNameError(
                    f"Character {char} is reserved and not allowed in namespace name"
                )

        # The default and system namespace names cannot be taken by users.
        if name in (Namespace.default(), Namespace.system()):
            raise InvalidNamespaceNameError(
                f"Namespace name {name} is reserved and cannot be used."
            )

    @staticmethod
    def default() -> str:
        """Name of default namespace"""
        return "local"

    @staticmethod
    def system() -> str:
        """Name of the system namespace"""
        return "system"

    @property
    def is_system(self):
        # True when this is the reserved system namespace.
        return self.name == Namespace.system()

    @classmethod
    def parse(
        cls: builtins.type[N],
        id: int,
        uuid: str,
        name: str,
        descr: Optional[str],
        created_at: datetime,
    ) -> "Namespace":
        """Build a namespace from positional (e.g. DB row) values."""
        return cls(id, uuid, name, descr, created_at)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "Namespace":
        """Build a namespace from a mapping, ignoring unknown keys."""
        known = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in d.items() if k in known})
datachain/project.py ADDED
@@ -0,0 +1,78 @@
1
+ import builtins
2
+ from dataclasses import dataclass, fields
3
+ from datetime import datetime
4
+ from typing import Any, Optional, TypeVar
5
+
6
+ from datachain.error import InvalidProjectNameError
7
+ from datachain.namespace import Namespace
8
+
9
P = TypeVar("P", bound="Project")
PROJECT_NAME_RESERVED_CHARS = ["."]


@dataclass(frozen=True)
class Project:
    """Immutable record of a project: a dataset container owned by a namespace."""

    id: int
    uuid: str
    name: str
    descr: Optional[str]
    created_at: datetime
    namespace: Namespace

    @staticmethod
    def validate_name(name: str) -> None:
        """Throws exception if name is invalid, otherwise returns None"""
        if not name:
            raise InvalidProjectNameError("Project name cannot be empty")

        # Reject the first reserved character found in the name.
        for char in PROJECT_NAME_RESERVED_CHARS:
            if char in name:
                raise InvalidProjectNameError(
                    f"Character {char} is reserved and not allowed in project name."
                )

        # The default and listing project names cannot be taken by users.
        if name in (Project.default(), Project.listing()):
            raise InvalidProjectNameError(
                f"Project name {name} is reserved and cannot be used."
            )

    @staticmethod
    def default() -> str:
        """Name of default project"""
        return "local"

    @staticmethod
    def listing() -> str:
        """Name of listing project where all listing datasets will be saved"""
        return "listing"

    @classmethod
    def parse(
        cls: builtins.type[P],
        namespace_id: int,
        namespace_uuid: str,
        namespace_name: str,
        namespace_descr: Optional[str],
        namespace_created_at: datetime,
        project_id: int,
        uuid: str,
        name: str,
        descr: Optional[str],
        created_at: datetime,
        project_namespace_id: int,
    ) -> "Project":
        """Build a project (with its owning namespace) from positional values."""
        owner = Namespace.parse(
            namespace_id,
            namespace_uuid,
            namespace_name,
            namespace_descr,
            namespace_created_at,
        )
        return cls(project_id, uuid, name, descr, created_at, owner)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "Project":
        """Build a project from a mapping; pops and parses the "namespace" entry."""
        owner = Namespace.from_dict(d.pop("namespace"))
        known = {f.name for f in fields(cls)}
        kwargs = {k: v for k, v in d.items() if k in known}
        return cls(**kwargs, namespace=owner)