datachain 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (37)
  1. datachain/__init__.py +3 -0
  2. datachain/catalog/catalog.py +180 -65
  3. datachain/cli/__init__.py +0 -7
  4. datachain/cli/commands/datasets.py +43 -28
  5. datachain/cli/parser/__init__.py +1 -35
  6. datachain/cli/parser/job.py +25 -0
  7. datachain/cli/parser/studio.py +11 -4
  8. datachain/data_storage/metastore.py +390 -37
  9. datachain/data_storage/schema.py +23 -1
  10. datachain/data_storage/sqlite.py +139 -7
  11. datachain/data_storage/warehouse.py +26 -7
  12. datachain/dataset.py +125 -12
  13. datachain/delta.py +9 -5
  14. datachain/error.py +36 -0
  15. datachain/lib/dataset_info.py +4 -0
  16. datachain/lib/dc/datachain.py +86 -7
  17. datachain/lib/dc/datasets.py +62 -12
  18. datachain/lib/dc/listings.py +111 -0
  19. datachain/lib/dc/records.py +1 -0
  20. datachain/lib/dc/storage.py +14 -2
  21. datachain/lib/listing.py +3 -1
  22. datachain/lib/namespaces.py +73 -0
  23. datachain/lib/projects.py +86 -0
  24. datachain/lib/settings.py +10 -0
  25. datachain/listing.py +3 -1
  26. datachain/namespace.py +65 -0
  27. datachain/project.py +78 -0
  28. datachain/query/dataset.py +71 -46
  29. datachain/query/session.py +1 -1
  30. datachain/remote/studio.py +67 -26
  31. datachain/studio.py +68 -8
  32. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/METADATA +2 -2
  33. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/RECORD +37 -33
  34. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/WHEEL +0 -0
  35. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/entry_points.txt +0 -0
  36. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/licenses/LICENSE +0 -0
  37. {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/top_level.txt +0 -0
datachain/lib/projects.py ADDED
@@ -0,0 +1,86 @@
+ from typing import Optional
+
+ from datachain.error import ProjectCreateNotAllowedError
+ from datachain.project import Project
+ from datachain.query import Session
+
+
+ def create(
+     name: str,
+     namespace_name: str,
+     description: Optional[str] = None,
+     session: Optional[Session] = None,
+ ) -> Project:
+     """
+     Creates a new custom project.
+     A project is an object used to organize datasets. It is created under a
+     specific namespace and has a list of datasets underneath it.
+     Note that creating projects is allowed in Studio but not in the local
+     environment, where all datasets are created under the default `local`
+     project.
+
+     Parameters:
+         name : The name of the project.
+         namespace_name : The name of the namespace under which the new
+             project is created.
+         description : A description of the project.
+         session : Session to use for creating the project.
+
+     Example:
+         ```py
+         import datachain as dc
+         project = dc.projects.create("my-project", "dev", "My personal project")
+         ```
+     """
+     session = Session.get(session)
+
+     if not session.catalog.metastore.project_allowed_to_create:
+         raise ProjectCreateNotAllowedError("Creating custom project is not allowed")
+
+     Project.validate_name(name)
+
+     return session.catalog.metastore.create_project(name, namespace_name, description)
+
+
+ def get(name: str, namespace_name: str, session: Optional[Session]) -> Project:
+     """
+     Gets a project by name within a namespace.
+     If the project is not found, a `ProjectNotFoundError` is raised.
+
+     Parameters:
+         name : The name of the project.
+         namespace_name : The name of the namespace.
+         session : Session to use for getting the project.
+
+     Example:
+         ```py
+         import datachain as dc
+         project = dc.projects.get("my-project", "local", session=None)
+         ```
+     """
+     return Session.get(session).catalog.metastore.get_project(name, namespace_name)
+
+
+ def ls(
+     namespace_name: Optional[str] = None, session: Optional[Session] = None
+ ) -> list[Project]:
+     """
+     Gets a list of projects in a specific namespace or from all namespaces.
+
+     Parameters:
+         namespace_name : An optional namespace name.
+         session : Session to use for listing projects.
+
+     Example:
+         ```py
+         import datachain as dc
+         local_namespace_projects = dc.projects.ls("local")
+         all_projects = dc.projects.ls()
+         ```
+     """
+     session = Session.get(session)
+     namespace_id = None
+     if namespace_name:
+         namespace_id = session.catalog.metastore.get_namespace(namespace_name).id
+
+     return session.catalog.metastore.list_projects(namespace_id)
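
Taken together, the three functions form a small create/get/list surface for projects. Below is a minimal usage sketch, assuming the module is exposed as `dc.projects` (as its own docstring examples suggest); note that in this version `get` declares `session` without a default, so it is passed explicitly:

```py
import datachain as dc

# Allowed only where the metastore permits it (e.g. Studio); locally
# this raises ProjectCreateNotAllowedError.
project = dc.projects.create("my-project", "dev", "My personal project")

# Raises ProjectNotFoundError if the project does not exist.
project = dc.projects.get("my-project", "dev", session=None)

# List projects within one namespace, or across all namespaces.
dev_projects = dc.projects.ls("dev")
all_projects = dc.projects.ls()
```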
datachain/lib/settings.py CHANGED
@@ -14,12 +14,16 @@ class Settings:
          workers=None,
          min_task_size=None,
          prefetch=None,
+         namespace=None,
+         project=None,
      ):
          self._cache = cache
          self.parallel = parallel
          self._workers = workers
          self.min_task_size = min_task_size
          self.prefetch = prefetch
+         self.namespace = namespace
+         self.project = project

          if not isinstance(cache, bool) and cache is not None:
              raise SettingsError(
@@ -67,6 +71,10 @@ class Settings:
          res["workers"] = self.workers
          if self.min_task_size is not None:
              res["min_task_size"] = self.min_task_size
+         if self.namespace is not None:
+             res["namespace"] = self.namespace
+         if self.project is not None:
+             res["project"] = self.project
          return res

      def add(self, settings: "Settings"):
@@ -74,5 +82,7 @@ class Settings:
          self.parallel = settings.parallel or self.parallel
          self._workers = settings._workers or self._workers
          self.min_task_size = settings.min_task_size or self.min_task_size
+         self.namespace = settings.namespace or self.namespace
+         self.project = settings.project or self.project
          if settings.prefetch is not None:
              self.prefetch = settings.prefetch
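
The new fields follow the same merge rule as the existing ones: `add()` prefers the incoming value and falls back to the current one. A short sketch using only what the hunks show (the `cache`/`parallel` keyword names are inferred from the constructor body, not shown in the hunk):

```py
from datachain.lib.settings import Settings

base = Settings(cache=True, parallel=4)
override = Settings(namespace="dev", project="my-project")

# add() takes namespace/project from `override` since `base` left them unset,
# while keeping base's own cache/parallel values.
base.add(override)

assert base.namespace == "dev"
assert base.project == "my-project"
```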
datachain/listing.py CHANGED
@@ -66,7 +66,9 @@ class Listing:
      @cached_property
      def dataset(self) -> "DatasetRecord":
          assert self.dataset_name
-         return self.metastore.get_dataset(self.dataset_name)
+         return self.metastore.get_dataset(
+             self.dataset_name, self.metastore.listing_project.id
+         )

      @cached_property
      def dataset_rows(self):
datachain/namespace.py ADDED
@@ -0,0 +1,65 @@
+ import builtins
+ from dataclasses import dataclass, fields
+ from datetime import datetime
+ from typing import Any, Optional, TypeVar
+
+ from datachain.error import InvalidNamespaceNameError
+
+ N = TypeVar("N", bound="Namespace")
+ NAMESPACE_NAME_RESERVED_CHARS = ["."]
+
+
+ @dataclass(frozen=True)
+ class Namespace:
+     id: int
+     uuid: str
+     name: str
+     description: Optional[str]
+     created_at: datetime
+
+     @staticmethod
+     def validate_name(name: str) -> None:
+         """Throws exception if name is invalid, otherwise returns None"""
+         if not name:
+             raise InvalidNamespaceNameError("Namespace name cannot be empty")
+
+         for c in NAMESPACE_NAME_RESERVED_CHARS:
+             if c in name:
+                 raise InvalidNamespaceNameError(
+                     f"Character {c} is reserved and not allowed in namespace name"
+                 )
+
+         if name in [Namespace.default(), Namespace.system()]:
+             raise InvalidNamespaceNameError(
+                 f"Namespace name {name} is reserved and cannot be used."
+             )
+
+     @staticmethod
+     def default() -> str:
+         """Name of default namespace"""
+         return "local"
+
+     @staticmethod
+     def system() -> str:
+         """Name of the system namespace"""
+         return "system"
+
+     @property
+     def is_system(self):
+         return self.name == Namespace.system()
+
+     @classmethod
+     def parse(
+         cls: builtins.type[N],
+         id: int,
+         uuid: str,
+         name: str,
+         description: Optional[str],
+         created_at: datetime,
+     ) -> "Namespace":
+         return cls(id, uuid, name, description, created_at)
+
+     @classmethod
+     def from_dict(cls, d: dict[str, Any]) -> "Namespace":
+         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
+         return cls(**kwargs)
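
The validation rules above reject empty names, names containing a reserved character, and the two reserved names. A small sketch exercising `validate_name` exactly as defined in this file:

```py
from datachain.error import InvalidNamespaceNameError
from datachain.namespace import Namespace

for candidate in ("team-a", "", "a.b", "local", "system"):
    try:
        Namespace.validate_name(candidate)
        print(f"{candidate!r}: valid")
    except InvalidNamespaceNameError as exc:
        print(f"{candidate!r}: rejected ({exc})")

# Only "team-a" passes: "" is empty, "a.b" contains the reserved ".",
# and "local"/"system" collide with the default and system namespaces.
```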
datachain/project.py ADDED
@@ -0,0 +1,78 @@
+ import builtins
+ from dataclasses import dataclass, fields
+ from datetime import datetime
+ from typing import Any, Optional, TypeVar
+
+ from datachain.error import InvalidProjectNameError
+ from datachain.namespace import Namespace
+
+ P = TypeVar("P", bound="Project")
+ PROJECT_NAME_RESERVED_CHARS = ["."]
+
+
+ @dataclass(frozen=True)
+ class Project:
+     id: int
+     uuid: str
+     name: str
+     description: Optional[str]
+     created_at: datetime
+     namespace: Namespace
+
+     @staticmethod
+     def validate_name(name: str) -> None:
+         """Throws exception if name is invalid, otherwise returns None"""
+         if not name:
+             raise InvalidProjectNameError("Project name cannot be empty")
+
+         for c in PROJECT_NAME_RESERVED_CHARS:
+             if c in name:
+                 raise InvalidProjectNameError(
+                     f"Character {c} is reserved and not allowed in project name."
+                 )
+
+         if name in [Project.default(), Project.listing()]:
+             raise InvalidProjectNameError(
+                 f"Project name {name} is reserved and cannot be used."
+             )
+
+     @staticmethod
+     def default() -> str:
+         """Name of default project"""
+         return "local"
+
+     @staticmethod
+     def listing() -> str:
+         """Name of listing project where all listing datasets will be saved"""
+         return "listing"
+
+     @classmethod
+     def parse(
+         cls: builtins.type[P],
+         namespace_id: int,
+         namespace_uuid: str,
+         namespace_name: str,
+         namespace_description: Optional[str],
+         namespace_created_at: datetime,
+         project_id: int,
+         uuid: str,
+         name: str,
+         description: Optional[str],
+         created_at: datetime,
+         project_namespace_id: int,
+     ) -> "Project":
+         namespace = Namespace.parse(
+             namespace_id,
+             namespace_uuid,
+             namespace_name,
+             namespace_description,
+             namespace_created_at,
+         )
+
+         return cls(project_id, uuid, name, description, created_at, namespace)
+
+     @classmethod
+     def from_dict(cls, d: dict[str, Any]) -> "Project":
+         namespace = Namespace.from_dict(d.pop("namespace"))
+         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
+         return cls(**kwargs, namespace=namespace)
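
`Project.from_dict` expects the namespace nested as its own dict: it pops that key, parses it via `Namespace.from_dict`, and builds the remaining fields from the flat dict. A sketch with purely illustrative values (the ids, UUIDs, and names below are made up):

```py
from datetime import datetime, timezone

from datachain.project import Project

now = datetime.now(timezone.utc)

record = {
    "id": 7,
    "uuid": "00000000-0000-0000-0000-000000000007",  # illustrative
    "name": "my-project",
    "description": None,
    "created_at": now,
    # Popped by from_dict and parsed with Namespace.from_dict.
    "namespace": {
        "id": 1,
        "uuid": "00000000-0000-0000-0000-000000000001",  # illustrative
        "name": "dev",
        "description": None,
        "created_at": now,
    },
}

project = Project.from_dict(record)
assert project.namespace.name == "dev"
```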
datachain/query/dataset.py CHANGED
@@ -41,12 +41,13 @@ from datachain.data_storage.schema import (
      partition_col_names,
      partition_columns,
  )
- from datachain.dataset import DATASET_PREFIX, DatasetDependency, DatasetStatus, RowDict
+ from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
  from datachain.error import DatasetNotFoundError, QueryScriptCancelError
  from datachain.func.base import Function
  from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
  from datachain.lib.udf import UDFAdapter, _get_cache
  from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
+ from datachain.project import Project
  from datachain.query.schema import C, UDFParamSpec, normalize_param
  from datachain.query.session import Session
  from datachain.query.udf import UdfInfo
@@ -83,7 +84,7 @@ PartitionByType = Union[
      Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
  ]
  JoinPredicateType = Union[str, ColumnClause, ColumnElement]
- DatasetDependencyType = tuple[str, str]
+ DatasetDependencyType = tuple["DatasetRecord", str]

  logger = logging.getLogger("datachain")

@@ -169,18 +170,17 @@ class QueryStep:
      """A query that returns all rows from specific dataset version"""

      catalog: "Catalog"
-     dataset_name: str
+     dataset: "DatasetRecord"
      dataset_version: str

      def apply(self) -> "StepResult":
          def q(*columns):
              return sqlalchemy.select(*columns)

-         dataset = self.catalog.get_dataset(self.dataset_name)
-         dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)
+         dr = self.catalog.warehouse.dataset_rows(self.dataset, self.dataset_version)

          return step_result(
-             q, dr.columns, dependencies=[(self.dataset_name, self.dataset_version)]
+             q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
          )

@@ -1095,6 +1095,8 @@ class DatasetQuery:
          self,
          name: str,
          version: Optional[str] = None,
+         project_name: Optional[str] = None,
+         namespace_name: Optional[str] = None,
          catalog: Optional["Catalog"] = None,
          session: Optional[Session] = None,
          indexing_column_types: Optional[dict[str, Any]] = None,
@@ -1128,33 +1130,38 @@ class DatasetQuery:
          if version:
              self.version = version

-         if is_listing_dataset(name):
-             if version:
-                 # this listing dataset should already be listed as we specify
-                 # exact version
-                 self._set_starting_step(self.catalog.get_dataset(name))
-             else:
-                 # not setting query step yet as listing dataset might not exist at
-                 # this point
-                 self.list_ds_name = name
+         namespace_name = namespace_name or self.catalog.metastore.default_namespace_name
+         project_name = project_name or self.catalog.metastore.default_project_name
+
+         if is_listing_dataset(name) and not version:
+             # not setting query step yet as listing dataset might not exist at
+             # this point
+             self.list_ds_name = name
          elif fallback_to_studio and is_token_set():
              self._set_starting_step(
-                 self.catalog.get_dataset_with_remote_fallback(name, version)
+                 self.catalog.get_dataset_with_remote_fallback(
+                     name,
+                     namespace_name=namespace_name,
+                     project_name=project_name,
+                     version=version,
+                 )
              )
          else:
-             self._set_starting_step(self.catalog.get_dataset(name))
+             project = self.catalog.metastore.get_project(project_name, namespace_name)
+             self._set_starting_step(self.catalog.get_dataset(name, project=project))

      def _set_starting_step(self, ds: "DatasetRecord") -> None:
          if not self.version:
              self.version = ds.latest_version

-         self.starting_step = QueryStep(self.catalog, ds.name, self.version)
+         self.starting_step = QueryStep(self.catalog, ds, self.version)

          # at this point we know our starting dataset so setting up schemas
          self.feature_schema = ds.get_version(self.version).feature_schema
          self.column_types = copy(ds.schema)
          if "sys__id" in self.column_types:
              self.column_types.pop("sys__id")
+         self.project = ds.project

      def __iter__(self):
          return iter(self.db_results())
@@ -1162,21 +1169,6 @@ class DatasetQuery:
      def __or__(self, other):
          return self.union(other)

-     def pull_dataset(self, name: str, version: Optional[str] = None) -> "DatasetRecord":
-         print("Dataset not found in local catalog, trying to get from studio")
-
-         remote_ds_uri = f"{DATASET_PREFIX}{name}"
-         if version:
-             remote_ds_uri += f"@v{version}"
-
-         self.catalog.pull_dataset(
-             remote_ds_uri=remote_ds_uri,
-             local_ds_name=name,
-             local_ds_version=version,
-         )
-
-         return self.catalog.get_dataset(name)
-
      @staticmethod
      def get_table() -> "TableClause":
          table_name = "".join(
@@ -1657,6 +1649,8 @@ class DatasetQuery:
          workers: Union[bool, int] = False,
          min_task_size: Optional[int] = None,
          partition_by: Optional[PartitionByType] = None,
+         namespace: Optional[str] = None,
+         project: Optional[str] = None,
          cache: bool = False,
      ) -> "Self":
          query = self.clone()
@@ -1676,26 +1670,36 @@ class DatasetQuery:

      def _add_dependencies(self, dataset: "DatasetRecord", version: str):
          dependencies: set[DatasetDependencyType] = set()
-         for dep_name, dep_version in self.dependencies:
-             if Session.is_temp_dataset(dep_name):
+         for dep_dataset, dep_dataset_version in self.dependencies:
+             if Session.is_temp_dataset(dep_dataset.name):
                  # temp datasets are created for optimization and will be removed
                  # afterwards. Therefore, we should not put them as dependencies, but
                  # their own direct dependencies
                  for dep in self.catalog.get_dataset_dependencies(
-                     dep_name, dep_version, indirect=False
+                     dep_dataset.name,
+                     dep_dataset_version,
+                     dep_dataset.project,
+                     indirect=False,
                  ):
                      if dep:
-                         dependencies.add((dep.name, dep.version))
+                         dep_project = self.catalog.metastore.get_project(
+                             dep.project, dep.namespace
+                         )
+                         dependencies.add(
+                             (
+                                 self.catalog.get_dataset(dep.name, dep_project),
+                                 dep.version,
+                             )
+                         )
              else:
-                 dependencies.add((dep_name, dep_version))
+                 dependencies.add((dep_dataset, dep_dataset_version))

-         for dep_name, dep_version in dependencies:
-             # ds_dependency_name, ds_dependency_version = dependency
+         for dep_dataset, dep_dataset_version in dependencies:
              self.catalog.metastore.add_dataset_dependency(
-                 dataset.name,
+                 dataset,
                  version,
-                 dep_name,
-                 dep_version,
+                 dep_dataset,
+                 dep_dataset_version,
              )

      def exec(self) -> "Self":
@@ -1711,6 +1715,7 @@ class DatasetQuery:
          self,
          name: Optional[str] = None,
          version: Optional[str] = None,
+         project: Optional[Project] = None,
          feature_schema: Optional[dict] = None,
          dependencies: Optional[list[DatasetDependency]] = None,
          description: Optional[str] = None,
@@ -1719,8 +1724,13 @@ class DatasetQuery:
          **kwargs,
      ) -> "Self":
          """Save the query as a dataset."""
+         project = project or self.catalog.metastore.default_project
          try:
-             if name and version and self.catalog.get_dataset(name).has_version(version):
+             if (
+                 name
+                 and version
+                 and self.catalog.get_dataset(name, project).has_version(version)
+             ):
                  raise RuntimeError(f"Dataset {name} already has version {version}")
          except DatasetNotFoundError:
              pass
@@ -1745,6 +1755,7 @@ class DatasetQuery:

          dataset = self.catalog.create_dataset(
              name,
+             project,
              version=version,
              feature_schema=feature_schema,
              columns=columns,
@@ -1770,11 +1781,25 @@ class DatasetQuery:

          if dependencies:
              # overriding dependencies
-             self.dependencies = {(dep.name, dep.version) for dep in dependencies}
+             self.dependencies = set()
+             for dep in dependencies:
+                 dep_project = self.catalog.metastore.get_project(
+                     dep.project, dep.namespace
+                 )
+                 self.dependencies.add(
+                     (self.catalog.get_dataset(dep.name, dep_project), dep.version)
+                 )
+
              self._add_dependencies(dataset, version) # type: ignore [arg-type]
          finally:
              self.cleanup()
-         return self.__class__(name=name, version=version, catalog=self.catalog)
+         return self.__class__(
+             name=name,
+             namespace_name=project.namespace.name,
+             project_name=project.name,
+             version=version,
+             catalog=self.catalog,
+         )

      @property
      def is_ordered(self) -> bool:
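
With these changes a `DatasetQuery` is always anchored to a concrete project: the constructor resolves `namespace_name`/`project_name` to the metastore defaults when omitted, and `save()` returns a new query re-bound to the saved dataset's project. A hedged construction sketch (the dataset name and version are assumptions, and the dataset must already exist in that project for the lookup to succeed):

```py
from datachain.query.dataset import DatasetQuery

# Falls back to metastore.default_namespace_name / default_project_name
# when the two name arguments are omitted.
q = DatasetQuery(
    "my-dataset",
    version="1.0.0",
    namespace_name="dev",
    project_name="my-project",
)
```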
datachain/query/session.py CHANGED
@@ -108,7 +108,7 @@ class Session:
          prefix = self.get_temp_prefix()
          try:
              for dataset in list(self.catalog.metastore.list_datasets_by_prefix(prefix)):
-                 self.catalog.remove_dataset(dataset.name, force=True)
+                 self.catalog.remove_dataset(dataset.name, dataset.project, force=True)
          # suppress error when metastore has been reset during testing
          except TableMissingError:
              pass