datachain 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +3 -0
- datachain/catalog/catalog.py +180 -65
- datachain/cli/__init__.py +0 -7
- datachain/cli/commands/datasets.py +43 -28
- datachain/cli/parser/__init__.py +1 -35
- datachain/cli/parser/job.py +25 -0
- datachain/cli/parser/studio.py +11 -4
- datachain/data_storage/metastore.py +390 -37
- datachain/data_storage/schema.py +23 -1
- datachain/data_storage/sqlite.py +139 -7
- datachain/data_storage/warehouse.py +26 -7
- datachain/dataset.py +125 -12
- datachain/delta.py +9 -5
- datachain/error.py +36 -0
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc/datachain.py +86 -7
- datachain/lib/dc/datasets.py +62 -12
- datachain/lib/dc/listings.py +111 -0
- datachain/lib/dc/records.py +1 -0
- datachain/lib/dc/storage.py +14 -2
- datachain/lib/listing.py +3 -1
- datachain/lib/namespaces.py +73 -0
- datachain/lib/projects.py +86 -0
- datachain/lib/settings.py +10 -0
- datachain/listing.py +3 -1
- datachain/namespace.py +65 -0
- datachain/project.py +78 -0
- datachain/query/dataset.py +71 -46
- datachain/query/session.py +1 -1
- datachain/remote/studio.py +67 -26
- datachain/studio.py +68 -8
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/METADATA +2 -2
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/RECORD +37 -33
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/WHEEL +0 -0
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from datachain.error import ProjectCreateNotAllowedError
|
|
4
|
+
from datachain.project import Project
|
|
5
|
+
from datachain.query import Session
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create(
    name: str,
    namespace_name: str,
    description: Optional[str] = None,
    session: Optional[Session] = None,
) -> Project:
    """
    Creates a new custom project.
    A Project is an object used to organize datasets. It is created under a
    specific namespace and has a list of datasets underneath it.
    Note that creating projects is not allowed in the local environment, unlike
    in Studio, where it is allowed.
    In local environment all datasets are created under the default `local` project.

    Parameters:
        name : The name of the project.
        namespace_name : The name of the namespace under which the new project
            is being created.
        description : A description of the project.
        session : Session to use for creating project.

    Raises:
        ProjectCreateNotAllowedError : If the metastore of the session's catalog
            reports that creating projects is not allowed.
        InvalidProjectNameError : If `name` is rejected by
            `Project.validate_name` (empty, contains a reserved character, or
            is a reserved project name).

    Example:
        ```py
        import datachain as dc
        project = dc.projects.create("my-project", "dev", "My personal project")
        ```
    """
    session = Session.get(session)
    metastore = session.catalog.metastore

    # The permission check comes first, so a disallowed environment fails with
    # ProjectCreateNotAllowedError even when the name is also invalid.
    if not metastore.project_allowed_to_create:
        raise ProjectCreateNotAllowedError("Creating custom project is not allowed")

    Project.validate_name(name)

    return metastore.create_project(name, namespace_name, description)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get(
    name: str, namespace_name: str, session: Optional[Session] = None
) -> Project:
    """
    Gets a project by name in some namespace.
    If the project is not found, a `ProjectNotFoundError` is expected to be
    raised by the metastore lookup.

    Parameters:
        name : The name of the project.
        namespace_name : The name of the namespace.
        session : Session to use for getting project. Optional; the value
            (including `None`) is handed to `Session.get`, as the other
            helpers in this module do.

    Example:
        ```py
        import datachain as dc
        project = dc.projects.get("my-project", "local")
        ```
    """
    return Session.get(session).catalog.metastore.get_project(name, namespace_name)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def ls(
    namespace_name: Optional[str] = None, session: Optional[Session] = None
) -> list[Project]:
    """
    Lists projects, either from one namespace or from every namespace.

    Parameters:
        namespace_name : Optional name of the namespace to restrict the
            listing to. When it is falsy, no namespace filter is applied.
        session : Session to use for listing projects.

    Example:
        ```py
        import datachain as dc
        local_namespace_projects = dc.projects.ls("local")
        all_projects = dc.projects.ls()
        ```
    """
    metastore = Session.get(session).catalog.metastore

    # No namespace requested: ask the metastore for projects without a filter.
    if not namespace_name:
        return metastore.list_projects(None)

    # Resolve the namespace name to its id, which is what list_projects takes.
    namespace = metastore.get_namespace(namespace_name)
    return metastore.list_projects(namespace.id)
|
datachain/lib/settings.py
CHANGED
|
@@ -14,12 +14,16 @@ class Settings:
|
|
|
14
14
|
workers=None,
|
|
15
15
|
min_task_size=None,
|
|
16
16
|
prefetch=None,
|
|
17
|
+
namespace=None,
|
|
18
|
+
project=None,
|
|
17
19
|
):
|
|
18
20
|
self._cache = cache
|
|
19
21
|
self.parallel = parallel
|
|
20
22
|
self._workers = workers
|
|
21
23
|
self.min_task_size = min_task_size
|
|
22
24
|
self.prefetch = prefetch
|
|
25
|
+
self.namespace = namespace
|
|
26
|
+
self.project = project
|
|
23
27
|
|
|
24
28
|
if not isinstance(cache, bool) and cache is not None:
|
|
25
29
|
raise SettingsError(
|
|
@@ -67,6 +71,10 @@ class Settings:
|
|
|
67
71
|
res["workers"] = self.workers
|
|
68
72
|
if self.min_task_size is not None:
|
|
69
73
|
res["min_task_size"] = self.min_task_size
|
|
74
|
+
if self.namespace is not None:
|
|
75
|
+
res["namespace"] = self.namespace
|
|
76
|
+
if self.project is not None:
|
|
77
|
+
res["project"] = self.project
|
|
70
78
|
return res
|
|
71
79
|
|
|
72
80
|
def add(self, settings: "Settings"):
|
|
@@ -74,5 +82,7 @@ class Settings:
|
|
|
74
82
|
self.parallel = settings.parallel or self.parallel
|
|
75
83
|
self._workers = settings._workers or self._workers
|
|
76
84
|
self.min_task_size = settings.min_task_size or self.min_task_size
|
|
85
|
+
self.namespace = settings.namespace or self.namespace
|
|
86
|
+
self.project = settings.project or self.project
|
|
77
87
|
if settings.prefetch is not None:
|
|
78
88
|
self.prefetch = settings.prefetch
|
datachain/listing.py
CHANGED
|
@@ -66,7 +66,9 @@ class Listing:
|
|
|
66
66
|
@cached_property
|
|
67
67
|
def dataset(self) -> "DatasetRecord":
|
|
68
68
|
assert self.dataset_name
|
|
69
|
-
return self.metastore.get_dataset(
|
|
69
|
+
return self.metastore.get_dataset(
|
|
70
|
+
self.dataset_name, self.metastore.listing_project.id
|
|
71
|
+
)
|
|
70
72
|
|
|
71
73
|
@cached_property
|
|
72
74
|
def dataset_rows(self):
|
datachain/namespace.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import builtins
|
|
2
|
+
from dataclasses import dataclass, fields
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any, Optional, TypeVar
|
|
5
|
+
|
|
6
|
+
from datachain.error import InvalidNamespaceNameError
|
|
7
|
+
|
|
8
|
+
N = TypeVar("N", bound="Namespace")
|
|
9
|
+
NAMESPACE_NAME_RESERVED_CHARS = ["."]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class Namespace:
    """Immutable record describing a namespace."""

    id: int
    uuid: str
    name: str
    description: Optional[str]
    created_at: datetime

    @staticmethod
    def validate_name(name: str) -> None:
        """
        Checks that `name` may be used as a namespace name.

        Raises `InvalidNamespaceNameError` when the name is empty, contains one
        of the reserved characters, or equals the default or system namespace
        name. Returns None when the name is acceptable.
        """
        if not name:
            raise InvalidNamespaceNameError("Namespace name cannot be empty")

        # Report the first reserved character (in declaration order) found.
        reserved = next(
            (ch for ch in NAMESPACE_NAME_RESERVED_CHARS if ch in name), None
        )
        if reserved is not None:
            raise InvalidNamespaceNameError(
                f"Character {reserved} is reserved and not allowed in namespace name"
            )

        reserved_names = (Namespace.default(), Namespace.system())
        if name in reserved_names:
            raise InvalidNamespaceNameError(
                f"Namespace name {name} is reserved and cannot be used."
            )

    @staticmethod
    def default() -> str:
        """Returns the name of the default namespace."""
        return "local"

    @staticmethod
    def system() -> str:
        """Returns the name of the system namespace."""
        return "system"

    @property
    def is_system(self):
        """True when this namespace carries the system namespace name."""
        system_name = Namespace.system()
        return self.name == system_name

    @classmethod
    def parse(
        cls: builtins.type[N],
        id: int,
        uuid: str,
        name: str,
        description: Optional[str],
        created_at: datetime,
    ) -> "Namespace":
        """Builds an instance from positional field values, in field order."""
        values = (id, uuid, name, description, created_at)
        return cls(*values)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "Namespace":
        """
        Builds an instance from a mapping, using only the keys that match
        dataclass field names; any other keys in `d` are ignored.
        """
        kwargs = {}
        for field in fields(cls):
            if field.name in d:
                kwargs[field.name] = d[field.name]
        return cls(**kwargs)
|
datachain/project.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import builtins
|
|
2
|
+
from dataclasses import dataclass, fields
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any, Optional, TypeVar
|
|
5
|
+
|
|
6
|
+
from datachain.error import InvalidProjectNameError
|
|
7
|
+
from datachain.namespace import Namespace
|
|
8
|
+
|
|
9
|
+
P = TypeVar("P", bound="Project")
|
|
10
|
+
PROJECT_NAME_RESERVED_CHARS = ["."]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
class Project:
    """Immutable record describing a project and the namespace it belongs to."""

    id: int
    uuid: str
    name: str
    description: Optional[str]
    created_at: datetime
    namespace: Namespace

    @staticmethod
    def validate_name(name: str) -> None:
        """
        Checks that `name` may be used as a project name.

        Raises `InvalidProjectNameError` when the name is empty, contains one
        of the reserved characters, or equals the default or listing project
        name. Returns None when the name is acceptable.
        """
        if not name:
            raise InvalidProjectNameError("Project name cannot be empty")

        for c in PROJECT_NAME_RESERVED_CHARS:
            if c in name:
                raise InvalidProjectNameError(
                    f"Character {c} is reserved and not allowed in project name."
                )

        if name in [Project.default(), Project.listing()]:
            raise InvalidProjectNameError(
                f"Project name {name} is reserved and cannot be used."
            )

    @staticmethod
    def default() -> str:
        """Name of default project"""
        return "local"

    @staticmethod
    def listing() -> str:
        """Name of listing project where all listing datasets will be saved"""
        return "listing"

    @classmethod
    def parse(
        cls: builtins.type[P],
        namespace_id: int,
        namespace_uuid: str,
        namespace_name: str,
        namespace_description: Optional[str],
        namespace_created_at: datetime,
        project_id: int,
        uuid: str,
        name: str,
        description: Optional[str],
        created_at: datetime,
        project_namespace_id: int,
    ) -> "Project":
        """
        Builds a project, together with its namespace, from a flat sequence of
        values: the five namespace fields first, then the project fields.

        `project_namespace_id` is accepted to keep the positional layout but is
        not used when constructing the instance.
        """
        namespace = Namespace.parse(
            namespace_id,
            namespace_uuid,
            namespace_name,
            namespace_description,
            namespace_created_at,
        )

        return cls(project_id, uuid, name, description, created_at, namespace)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "Project":
        """
        Builds a project from a mapping whose "namespace" entry is itself a
        mapping accepted by `Namespace.from_dict`.

        Only keys matching dataclass field names are used. The input mapping is
        left unmodified, so the same dict can be parsed more than once.

        Raises:
            KeyError : If `d` has no "namespace" entry.
        """
        # Read instead of pop: the caller's dict must not lose its namespace.
        namespace = Namespace.from_dict(d["namespace"])
        kwargs = {
            f.name: d[f.name]
            for f in fields(cls)
            # "namespace" is passed explicitly below as a parsed object.
            if f.name != "namespace" and f.name in d
        }
        return cls(**kwargs, namespace=namespace)
|
datachain/query/dataset.py
CHANGED
|
@@ -41,12 +41,13 @@ from datachain.data_storage.schema import (
|
|
|
41
41
|
partition_col_names,
|
|
42
42
|
partition_columns,
|
|
43
43
|
)
|
|
44
|
-
from datachain.dataset import
|
|
44
|
+
from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
|
|
45
45
|
from datachain.error import DatasetNotFoundError, QueryScriptCancelError
|
|
46
46
|
from datachain.func.base import Function
|
|
47
47
|
from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
|
|
48
48
|
from datachain.lib.udf import UDFAdapter, _get_cache
|
|
49
49
|
from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
|
|
50
|
+
from datachain.project import Project
|
|
50
51
|
from datachain.query.schema import C, UDFParamSpec, normalize_param
|
|
51
52
|
from datachain.query.session import Session
|
|
52
53
|
from datachain.query.udf import UdfInfo
|
|
@@ -83,7 +84,7 @@ PartitionByType = Union[
|
|
|
83
84
|
Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
|
|
84
85
|
]
|
|
85
86
|
JoinPredicateType = Union[str, ColumnClause, ColumnElement]
|
|
86
|
-
DatasetDependencyType = tuple[
|
|
87
|
+
DatasetDependencyType = tuple["DatasetRecord", str]
|
|
87
88
|
|
|
88
89
|
logger = logging.getLogger("datachain")
|
|
89
90
|
|
|
@@ -169,18 +170,17 @@ class QueryStep:
|
|
|
169
170
|
"""A query that returns all rows from specific dataset version"""
|
|
170
171
|
|
|
171
172
|
catalog: "Catalog"
|
|
172
|
-
|
|
173
|
+
dataset: "DatasetRecord"
|
|
173
174
|
dataset_version: str
|
|
174
175
|
|
|
175
176
|
def apply(self) -> "StepResult":
|
|
176
177
|
def q(*columns):
|
|
177
178
|
return sqlalchemy.select(*columns)
|
|
178
179
|
|
|
179
|
-
|
|
180
|
-
dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)
|
|
180
|
+
dr = self.catalog.warehouse.dataset_rows(self.dataset, self.dataset_version)
|
|
181
181
|
|
|
182
182
|
return step_result(
|
|
183
|
-
q, dr.columns, dependencies=[(self.
|
|
183
|
+
q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
|
|
184
184
|
)
|
|
185
185
|
|
|
186
186
|
|
|
@@ -1095,6 +1095,8 @@ class DatasetQuery:
|
|
|
1095
1095
|
self,
|
|
1096
1096
|
name: str,
|
|
1097
1097
|
version: Optional[str] = None,
|
|
1098
|
+
project_name: Optional[str] = None,
|
|
1099
|
+
namespace_name: Optional[str] = None,
|
|
1098
1100
|
catalog: Optional["Catalog"] = None,
|
|
1099
1101
|
session: Optional[Session] = None,
|
|
1100
1102
|
indexing_column_types: Optional[dict[str, Any]] = None,
|
|
@@ -1128,33 +1130,38 @@ class DatasetQuery:
|
|
|
1128
1130
|
if version:
|
|
1129
1131
|
self.version = version
|
|
1130
1132
|
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
# this point
|
|
1139
|
-
self.list_ds_name = name
|
|
1133
|
+
namespace_name = namespace_name or self.catalog.metastore.default_namespace_name
|
|
1134
|
+
project_name = project_name or self.catalog.metastore.default_project_name
|
|
1135
|
+
|
|
1136
|
+
if is_listing_dataset(name) and not version:
|
|
1137
|
+
# not setting query step yet as listing dataset might not exist at
|
|
1138
|
+
# this point
|
|
1139
|
+
self.list_ds_name = name
|
|
1140
1140
|
elif fallback_to_studio and is_token_set():
|
|
1141
1141
|
self._set_starting_step(
|
|
1142
|
-
self.catalog.get_dataset_with_remote_fallback(
|
|
1142
|
+
self.catalog.get_dataset_with_remote_fallback(
|
|
1143
|
+
name,
|
|
1144
|
+
namespace_name=namespace_name,
|
|
1145
|
+
project_name=project_name,
|
|
1146
|
+
version=version,
|
|
1147
|
+
)
|
|
1143
1148
|
)
|
|
1144
1149
|
else:
|
|
1145
|
-
self.
|
|
1150
|
+
project = self.catalog.metastore.get_project(project_name, namespace_name)
|
|
1151
|
+
self._set_starting_step(self.catalog.get_dataset(name, project=project))
|
|
1146
1152
|
|
|
1147
1153
|
def _set_starting_step(self, ds: "DatasetRecord") -> None:
|
|
1148
1154
|
if not self.version:
|
|
1149
1155
|
self.version = ds.latest_version
|
|
1150
1156
|
|
|
1151
|
-
self.starting_step = QueryStep(self.catalog, ds
|
|
1157
|
+
self.starting_step = QueryStep(self.catalog, ds, self.version)
|
|
1152
1158
|
|
|
1153
1159
|
# at this point we know our starting dataset so setting up schemas
|
|
1154
1160
|
self.feature_schema = ds.get_version(self.version).feature_schema
|
|
1155
1161
|
self.column_types = copy(ds.schema)
|
|
1156
1162
|
if "sys__id" in self.column_types:
|
|
1157
1163
|
self.column_types.pop("sys__id")
|
|
1164
|
+
self.project = ds.project
|
|
1158
1165
|
|
|
1159
1166
|
def __iter__(self):
|
|
1160
1167
|
return iter(self.db_results())
|
|
@@ -1162,21 +1169,6 @@ class DatasetQuery:
|
|
|
1162
1169
|
def __or__(self, other):
|
|
1163
1170
|
return self.union(other)
|
|
1164
1171
|
|
|
1165
|
-
def pull_dataset(self, name: str, version: Optional[str] = None) -> "DatasetRecord":
|
|
1166
|
-
print("Dataset not found in local catalog, trying to get from studio")
|
|
1167
|
-
|
|
1168
|
-
remote_ds_uri = f"{DATASET_PREFIX}{name}"
|
|
1169
|
-
if version:
|
|
1170
|
-
remote_ds_uri += f"@v{version}"
|
|
1171
|
-
|
|
1172
|
-
self.catalog.pull_dataset(
|
|
1173
|
-
remote_ds_uri=remote_ds_uri,
|
|
1174
|
-
local_ds_name=name,
|
|
1175
|
-
local_ds_version=version,
|
|
1176
|
-
)
|
|
1177
|
-
|
|
1178
|
-
return self.catalog.get_dataset(name)
|
|
1179
|
-
|
|
1180
1172
|
@staticmethod
|
|
1181
1173
|
def get_table() -> "TableClause":
|
|
1182
1174
|
table_name = "".join(
|
|
@@ -1657,6 +1649,8 @@ class DatasetQuery:
|
|
|
1657
1649
|
workers: Union[bool, int] = False,
|
|
1658
1650
|
min_task_size: Optional[int] = None,
|
|
1659
1651
|
partition_by: Optional[PartitionByType] = None,
|
|
1652
|
+
namespace: Optional[str] = None,
|
|
1653
|
+
project: Optional[str] = None,
|
|
1660
1654
|
cache: bool = False,
|
|
1661
1655
|
) -> "Self":
|
|
1662
1656
|
query = self.clone()
|
|
@@ -1676,26 +1670,36 @@ class DatasetQuery:
|
|
|
1676
1670
|
|
|
1677
1671
|
def _add_dependencies(self, dataset: "DatasetRecord", version: str):
|
|
1678
1672
|
dependencies: set[DatasetDependencyType] = set()
|
|
1679
|
-
for
|
|
1680
|
-
if Session.is_temp_dataset(
|
|
1673
|
+
for dep_dataset, dep_dataset_version in self.dependencies:
|
|
1674
|
+
if Session.is_temp_dataset(dep_dataset.name):
|
|
1681
1675
|
# temp dataset are created for optimization and they will be removed
|
|
1682
1676
|
# afterwards. Therefore, we should not put them as dependencies, but
|
|
1683
1677
|
# their own direct dependencies
|
|
1684
1678
|
for dep in self.catalog.get_dataset_dependencies(
|
|
1685
|
-
|
|
1679
|
+
dep_dataset.name,
|
|
1680
|
+
dep_dataset_version,
|
|
1681
|
+
dep_dataset.project,
|
|
1682
|
+
indirect=False,
|
|
1686
1683
|
):
|
|
1687
1684
|
if dep:
|
|
1688
|
-
|
|
1685
|
+
dep_project = self.catalog.metastore.get_project(
|
|
1686
|
+
dep.project, dep.namespace
|
|
1687
|
+
)
|
|
1688
|
+
dependencies.add(
|
|
1689
|
+
(
|
|
1690
|
+
self.catalog.get_dataset(dep.name, dep_project),
|
|
1691
|
+
dep.version,
|
|
1692
|
+
)
|
|
1693
|
+
)
|
|
1689
1694
|
else:
|
|
1690
|
-
dependencies.add((
|
|
1695
|
+
dependencies.add((dep_dataset, dep_dataset_version))
|
|
1691
1696
|
|
|
1692
|
-
for
|
|
1693
|
-
# ds_dependency_name, ds_dependency_version = dependency
|
|
1697
|
+
for dep_dataset, dep_dataset_version in dependencies:
|
|
1694
1698
|
self.catalog.metastore.add_dataset_dependency(
|
|
1695
|
-
dataset
|
|
1699
|
+
dataset,
|
|
1696
1700
|
version,
|
|
1697
|
-
|
|
1698
|
-
|
|
1701
|
+
dep_dataset,
|
|
1702
|
+
dep_dataset_version,
|
|
1699
1703
|
)
|
|
1700
1704
|
|
|
1701
1705
|
def exec(self) -> "Self":
|
|
@@ -1711,6 +1715,7 @@ class DatasetQuery:
|
|
|
1711
1715
|
self,
|
|
1712
1716
|
name: Optional[str] = None,
|
|
1713
1717
|
version: Optional[str] = None,
|
|
1718
|
+
project: Optional[Project] = None,
|
|
1714
1719
|
feature_schema: Optional[dict] = None,
|
|
1715
1720
|
dependencies: Optional[list[DatasetDependency]] = None,
|
|
1716
1721
|
description: Optional[str] = None,
|
|
@@ -1719,8 +1724,13 @@ class DatasetQuery:
|
|
|
1719
1724
|
**kwargs,
|
|
1720
1725
|
) -> "Self":
|
|
1721
1726
|
"""Save the query as a dataset."""
|
|
1727
|
+
project = project or self.catalog.metastore.default_project
|
|
1722
1728
|
try:
|
|
1723
|
-
if
|
|
1729
|
+
if (
|
|
1730
|
+
name
|
|
1731
|
+
and version
|
|
1732
|
+
and self.catalog.get_dataset(name, project).has_version(version)
|
|
1733
|
+
):
|
|
1724
1734
|
raise RuntimeError(f"Dataset {name} already has version {version}")
|
|
1725
1735
|
except DatasetNotFoundError:
|
|
1726
1736
|
pass
|
|
@@ -1745,6 +1755,7 @@ class DatasetQuery:
|
|
|
1745
1755
|
|
|
1746
1756
|
dataset = self.catalog.create_dataset(
|
|
1747
1757
|
name,
|
|
1758
|
+
project,
|
|
1748
1759
|
version=version,
|
|
1749
1760
|
feature_schema=feature_schema,
|
|
1750
1761
|
columns=columns,
|
|
@@ -1770,11 +1781,25 @@ class DatasetQuery:
|
|
|
1770
1781
|
|
|
1771
1782
|
if dependencies:
|
|
1772
1783
|
# overriding dependencies
|
|
1773
|
-
self.dependencies =
|
|
1784
|
+
self.dependencies = set()
|
|
1785
|
+
for dep in dependencies:
|
|
1786
|
+
dep_project = self.catalog.metastore.get_project(
|
|
1787
|
+
dep.project, dep.namespace
|
|
1788
|
+
)
|
|
1789
|
+
self.dependencies.add(
|
|
1790
|
+
(self.catalog.get_dataset(dep.name, dep_project), dep.version)
|
|
1791
|
+
)
|
|
1792
|
+
|
|
1774
1793
|
self._add_dependencies(dataset, version) # type: ignore [arg-type]
|
|
1775
1794
|
finally:
|
|
1776
1795
|
self.cleanup()
|
|
1777
|
-
return self.__class__(
|
|
1796
|
+
return self.__class__(
|
|
1797
|
+
name=name,
|
|
1798
|
+
namespace_name=project.namespace.name,
|
|
1799
|
+
project_name=project.name,
|
|
1800
|
+
version=version,
|
|
1801
|
+
catalog=self.catalog,
|
|
1802
|
+
)
|
|
1778
1803
|
|
|
1779
1804
|
@property
|
|
1780
1805
|
def is_ordered(self) -> bool:
|
datachain/query/session.py
CHANGED
|
@@ -108,7 +108,7 @@ class Session:
|
|
|
108
108
|
prefix = self.get_temp_prefix()
|
|
109
109
|
try:
|
|
110
110
|
for dataset in list(self.catalog.metastore.list_datasets_by_prefix(prefix)):
|
|
111
|
-
self.catalog.remove_dataset(dataset.name, force=True)
|
|
111
|
+
self.catalog.remove_dataset(dataset.name, dataset.project, force=True)
|
|
112
112
|
# suppress error when metastore has been reset during testing
|
|
113
113
|
except TableMissingError:
|
|
114
114
|
pass
|