dstack 0.19.30rc1__py3-none-any.whl → 0.19.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/__init__.py +8 -0
- dstack/_internal/cli/commands/project.py +27 -20
- dstack/_internal/cli/commands/server.py +5 -0
- dstack/_internal/cli/services/configurators/fleet.py +20 -6
- dstack/_internal/cli/utils/gpu.py +2 -2
- dstack/_internal/core/backends/aws/compute.py +13 -5
- dstack/_internal/core/backends/aws/resources.py +11 -6
- dstack/_internal/core/backends/azure/compute.py +17 -6
- dstack/_internal/core/backends/base/compute.py +57 -9
- dstack/_internal/core/backends/base/offers.py +1 -0
- dstack/_internal/core/backends/cloudrift/compute.py +2 -0
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
- dstack/_internal/core/backends/features.py +5 -0
- dstack/_internal/core/backends/gcp/compute.py +87 -38
- dstack/_internal/core/backends/gcp/configurator.py +1 -1
- dstack/_internal/core/backends/gcp/models.py +14 -1
- dstack/_internal/core/backends/gcp/resources.py +35 -12
- dstack/_internal/core/backends/hotaisle/compute.py +22 -0
- dstack/_internal/core/backends/kubernetes/compute.py +531 -215
- dstack/_internal/core/backends/kubernetes/models.py +13 -16
- dstack/_internal/core/backends/kubernetes/utils.py +145 -8
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +17 -0
- dstack/_internal/core/backends/nebius/configurator.py +15 -0
- dstack/_internal/core/backends/nebius/models.py +57 -5
- dstack/_internal/core/backends/nebius/resources.py +45 -2
- dstack/_internal/core/backends/oci/compute.py +7 -1
- dstack/_internal/core/backends/oci/resources.py +8 -3
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +2 -0
- dstack/_internal/core/compatibility/runs.py +8 -0
- dstack/_internal/core/consts.py +2 -0
- dstack/_internal/core/models/profiles.py +11 -4
- dstack/_internal/core/services/repos.py +101 -11
- dstack/_internal/server/background/tasks/common.py +2 -0
- dstack/_internal/server/background/tasks/process_fleets.py +75 -17
- dstack/_internal/server/background/tasks/process_instances.py +3 -5
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
- dstack/_internal/server/background/tasks/process_runs.py +27 -23
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +107 -54
- dstack/_internal/server/services/offers.py +7 -1
- dstack/_internal/server/testing/common.py +2 -0
- dstack/_internal/server/utils/provisioning.py +3 -10
- dstack/_internal/utils/ssh.py +22 -2
- dstack/version.py +2 -2
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/METADATA +20 -18
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/RECORD +54 -54
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/WHEEL +0 -0
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -5,12 +5,14 @@ from pydantic import Field, root_validator
|
|
|
5
5
|
from dstack._internal.core.backends.base.models import fill_data
|
|
6
6
|
from dstack._internal.core.models.common import CoreModel
|
|
7
7
|
|
|
8
|
+
DEFAULT_NAMESPACE = "default"
|
|
8
9
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
|
|
11
|
+
class KubernetesProxyJumpConfig(CoreModel):
|
|
12
|
+
hostname: Annotated[
|
|
13
|
+
Optional[str], Field(description="The external IP address or hostname of any node")
|
|
12
14
|
] = None
|
|
13
|
-
|
|
15
|
+
port: Annotated[
|
|
14
16
|
Optional[int], Field(description="Any port accessible outside of the cluster")
|
|
15
17
|
] = None
|
|
16
18
|
|
|
@@ -22,16 +24,15 @@ class KubeconfigConfig(CoreModel):
|
|
|
22
24
|
|
|
23
25
|
class KubernetesBackendConfig(CoreModel):
|
|
24
26
|
type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes"
|
|
25
|
-
|
|
26
|
-
Optional[
|
|
27
|
+
proxy_jump: Annotated[
|
|
28
|
+
Optional[KubernetesProxyJumpConfig], Field(description="The SSH proxy jump configuration")
|
|
27
29
|
] = None
|
|
30
|
+
namespace: Annotated[
|
|
31
|
+
str, Field(description="The namespace for resources managed by `dstack`")
|
|
32
|
+
] = DEFAULT_NAMESPACE
|
|
28
33
|
|
|
29
34
|
|
|
30
|
-
class KubernetesBackendConfigWithCreds(
|
|
31
|
-
type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes"
|
|
32
|
-
networking: Annotated[
|
|
33
|
-
Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
|
|
34
|
-
] = None
|
|
35
|
+
class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
|
|
35
36
|
kubeconfig: Annotated[KubeconfigConfig, Field(description="The kubeconfig configuration")]
|
|
36
37
|
|
|
37
38
|
|
|
@@ -53,11 +54,7 @@ class KubeconfigFileConfig(CoreModel):
|
|
|
53
54
|
return fill_data(values)
|
|
54
55
|
|
|
55
56
|
|
|
56
|
-
class KubernetesBackendFileConfigWithCreds(
|
|
57
|
-
type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes"
|
|
58
|
-
networking: Annotated[
|
|
59
|
-
Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
|
|
60
|
-
] = None
|
|
57
|
+
class KubernetesBackendFileConfigWithCreds(KubernetesBackendConfig):
|
|
61
58
|
kubeconfig: Annotated[KubeconfigFileConfig, Field(description="The kubeconfig configuration")]
|
|
62
59
|
|
|
63
60
|
|
|
@@ -1,20 +1,157 @@
|
|
|
1
|
-
|
|
1
|
+
import ast
|
|
2
|
+
from typing import Any, Callable, List, Literal, Optional, TypeVar, Union, get_origin, overload
|
|
2
3
|
|
|
3
|
-
import kubernetes
|
|
4
4
|
import yaml
|
|
5
|
+
from kubernetes import client as kubernetes_client
|
|
6
|
+
from kubernetes import config as kubernetes_config
|
|
7
|
+
from typing_extensions import ParamSpec
|
|
5
8
|
|
|
9
|
+
T = TypeVar("T")
|
|
10
|
+
P = ParamSpec("P")
|
|
6
11
|
|
|
7
|
-
|
|
12
|
+
|
|
13
|
+
def get_api_from_config_data(kubeconfig_data: str) -> kubernetes_client.CoreV1Api:
    """
    Builds a `CoreV1Api` client from kubeconfig YAML text.

    Args:
        kubeconfig_data: The kubeconfig contents as a YAML string.

    Returns:
        The client returned by `get_api_from_config_dict()` for the parsed kubeconfig.
    """
    # The kubeconfig text originates outside this module (backend configuration), so it is
    # parsed with the safe loader, which only builds plain YAML types and never constructs
    # arbitrary Python objects. A kubeconfig needs nothing beyond plain mappings/lists/scalars.
    config_dict = yaml.safe_load(kubeconfig_data)
    return get_api_from_config_dict(config_dict)
|
|
10
16
|
|
|
11
17
|
|
|
12
|
-
def get_api_from_config_dict(kubeconfig:
|
|
13
|
-
api_client =
|
|
14
|
-
return
|
|
18
|
+
def get_api_from_config_dict(kubeconfig: dict) -> kubernetes_client.CoreV1Api:
    """
    Builds a `CoreV1Api` client from an already parsed kubeconfig.

    Args:
        kubeconfig: The kubeconfig as a dictionary.

    Returns:
        A `CoreV1Api` bound to the API client created from the kubeconfig.
    """
    return kubernetes_client.CoreV1Api(
        api_client=kubernetes_config.new_client_from_config_dict(config_dict=kubeconfig)
    )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@overload
def call_api_method(
    method: Callable[P, Any],
    type_: type[T],
    expected: None = None,
    *args: P.args,
    **kwargs: P.kwargs,
) -> T: ...


@overload
def call_api_method(
    method: Callable[P, Any],
    type_: type[T],
    expected: Union[int, tuple[int, ...], list[int]],
    *args: P.args,
    **kwargs: P.kwargs,
) -> Optional[T]: ...


def call_api_method(
    method: Callable[P, Any],
    type_: type[T],
    expected: Optional[Union[int, tuple[int, ...], list[int]]] = None,
    *args: P.args,
    **kwargs: P.kwargs,
) -> Optional[T]:
    """
    Calls an API method and type-checks its result, optionally tolerating some HTTP statuses.

    Args:
        method: The `CoreV1Api` bound method to call.
        type_: The type the result must have. It is checked at runtime and doubles as a hint
            for static type checkers (the kubernetes package is not type-annotated).
            NB: For composite types only the "origin" type is checked, e.g., list,
            not list[Node].
        expected: A single error status or a collection of statuses (e.g., 404) that are
            not treated as failures.
        args: Positional arguments forwarded to the method.
        kwargs: Keyword arguments forwarded to the method.

    Returns:
        The method's return value, or `None` if the call failed with an expected status.

    Raises:
        kubernetes_client.ApiException: If the call fails with a status not in `expected`.
        TypeError: If the result is not an instance of `type_`.
    """
    # Normalize a single status code to a tuple so that the membership test below works.
    tolerated_statuses = (expected,) if isinstance(expected, int) else expected
    try:
        result = method(*args, **kwargs)
    except kubernetes_client.ApiException as e:
        if tolerated_statuses is not None and e.status in tolerated_statuses:
            return None
        raise
    # isinstance() cannot handle parametrized generics, so fall back to their origin type.
    runtime_type = get_origin(type_) or type_
    if isinstance(result, runtime_type):
        return result
    raise TypeError(
        f"{method.__name__} returned {type(result).__name__}, expected {type_.__name__}"
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _parse_subscript_key(node: ast.expr, path: str) -> object:
    """Returns the literal key/index of a subscript node, rejecting anything non-literal."""
    if isinstance(node, ast.Constant):
        # .items[0], .labels["name"]
        return node.value
    if (
        isinstance(node, ast.UnaryOp)
        and isinstance(node.op, ast.USub)
        and isinstance(node.operand, ast.Constant)
        and isinstance(node.operand.value, int)
    ):
        # .items[-1]
        return -node.operand.value
    raise ValueError(f"Unsupported subscript in path {path!r}: {ast.dump(node)}")


@overload
def get_value(
    obj: object, path: str, type_: type[T], *, required: Literal[False] = False
) -> Optional[T]: ...


@overload
def get_value(obj: object, path: str, type_: type[T], *, required: Literal[True]) -> T: ...


def get_value(obj: object, path: str, type_: type[T], *, required: bool = False) -> Optional[T]:
    """
    Returns the value at a given path.
    Supports object attributes, sequence indices, and mapping keys.

    The path is parsed with `ast` and resolved step by step with `getattr()` and `[]`;
    it is never evaluated as code, and only attribute access and literal subscripts
    are accepted.

    Args:
        obj: The object to traverse.
        path: The path to the value, regular Python syntax. The leading dot is optional, all the
            following are correct: `.attr`, `attr`, `.[0]`, `[0]`, `.['key']`, `['key']`.
        type_: The expected type of the value, used for runtime type checking and as a type hint
            for a static type checker (as kubernetes package is not type-annotated).
            NB: For composite types, only "origin" type is checked, e.g., list, not list[Node]
        required: If `True`, the value must exist and must not be `None`. If `False` (safe
            navigation mode), the value may not exist and may be `None`.

    Returns:
        The requested value or `None` in case of failed traverse when required=False.

    Raises:
        SyntaxError: If the path is not valid Python syntax.
        ValueError: If the path contains anything other than attribute access and
            literal subscripts.
        AttributeError, KeyError, IndexError, TypeError: If the traverse fails or the value
            is `None` when required=True.
        TypeError: If the value is not an instance of `type_`.
    """
    _path = path.removeprefix(".")
    src = f"obj{_path}" if _path.startswith("[") else f"obj.{_path}"
    module = ast.parse(src)
    if len(module.body) != 1 or not isinstance(module.body[0], ast.Expr):
        raise ValueError(f"Unsupported path {path!r}")
    # Walk from the outermost accessor down to the root name, validating every node.
    # Explicit checks (not asserts) are used so that validation also happens under `-O`.
    steps: list = []
    expr = module.body[0].value
    while not isinstance(expr, ast.Name):
        if isinstance(expr, ast.Attribute):
            steps.append((True, expr.attr))
        elif isinstance(expr, ast.Subscript):
            steps.append((False, _parse_subscript_key(expr.slice, path)))
        else:
            raise ValueError(f"Unsupported expression in path {path!r}: {ast.dump(expr)}")
        expr = expr.value
    value: Any = obj
    try:
        # Steps were collected outermost-first, so apply them in reverse.
        for is_attribute, key in reversed(steps):
            value = getattr(value, key) if is_attribute else value[key]
    except (AttributeError, KeyError, IndexError, TypeError) as e:
        if required:
            raise type(e)(f"Failed to traverse {path}: {e}") from e
        return None
    if value is None:
        if required:
            raise TypeError(f"Required {path} is None")
        return None
    # isinstance() cannot handle parametrized generics, so fall back to their origin type.
    if not isinstance(value, get_origin(type_) or type_):
        raise TypeError(f"{path} value is {type(value).__name__}, expected {type_.__name__}")
    return value
|
|
15
152
|
|
|
16
153
|
|
|
17
|
-
def get_cluster_public_ip(api_client:
|
|
154
|
+
def get_cluster_public_ip(api_client: kubernetes_client.CoreV1Api) -> Optional[str]:
|
|
18
155
|
"""
|
|
19
156
|
Returns public IP of any cluster node.
|
|
20
157
|
"""
|
|
@@ -24,7 +161,7 @@ def get_cluster_public_ip(api_client: kubernetes.client.CoreV1Api) -> Optional[s
|
|
|
24
161
|
return public_ips[0]
|
|
25
162
|
|
|
26
163
|
|
|
27
|
-
def get_cluster_public_ips(api_client:
|
|
164
|
+
def get_cluster_public_ips(api_client: kubernetes_client.CoreV1Api) -> List[str]:
|
|
28
165
|
"""
|
|
29
166
|
Returns public IPs of all cluster nodes.
|
|
30
167
|
"""
|
|
@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
9
9
|
Compute,
|
|
10
10
|
ComputeWithAllOffersCached,
|
|
11
11
|
ComputeWithCreateInstanceSupport,
|
|
12
|
+
ComputeWithPrivilegedSupport,
|
|
12
13
|
generate_unique_instance_name,
|
|
13
14
|
get_shim_commands,
|
|
14
15
|
)
|
|
@@ -31,6 +32,7 @@ MAX_INSTANCE_NAME_LEN = 60
|
|
|
31
32
|
class LambdaCompute(
|
|
32
33
|
ComputeWithAllOffersCached,
|
|
33
34
|
ComputeWithCreateInstanceSupport,
|
|
35
|
+
ComputeWithPrivilegedSupport,
|
|
34
36
|
Compute,
|
|
35
37
|
):
|
|
36
38
|
def __init__(self, config: LambdaConfig):
|
|
@@ -3,6 +3,7 @@ from typing import List, Optional
|
|
|
3
3
|
from dstack._internal.core.backends.base.compute import (
|
|
4
4
|
Compute,
|
|
5
5
|
ComputeWithCreateInstanceSupport,
|
|
6
|
+
ComputeWithPrivilegedSupport,
|
|
6
7
|
ComputeWithVolumeSupport,
|
|
7
8
|
)
|
|
8
9
|
from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
|
|
@@ -25,6 +26,7 @@ logger = get_logger(__name__)
|
|
|
25
26
|
|
|
26
27
|
class LocalCompute(
|
|
27
28
|
ComputeWithCreateInstanceSupport,
|
|
29
|
+
ComputeWithPrivilegedSupport,
|
|
28
30
|
ComputeWithVolumeSupport,
|
|
29
31
|
Compute,
|
|
30
32
|
):
|
|
@@ -16,8 +16,10 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
16
16
|
ComputeWithCreateInstanceSupport,
|
|
17
17
|
ComputeWithMultinodeSupport,
|
|
18
18
|
ComputeWithPlacementGroupSupport,
|
|
19
|
+
ComputeWithPrivilegedSupport,
|
|
19
20
|
generate_unique_instance_name,
|
|
20
21
|
get_user_data,
|
|
22
|
+
merge_tags,
|
|
21
23
|
)
|
|
22
24
|
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
23
25
|
from dstack._internal.core.backends.nebius import resources
|
|
@@ -79,6 +81,7 @@ SUPPORTED_PLATFORMS = [
|
|
|
79
81
|
class NebiusCompute(
|
|
80
82
|
ComputeWithAllOffersCached,
|
|
81
83
|
ComputeWithCreateInstanceSupport,
|
|
84
|
+
ComputeWithPrivilegedSupport,
|
|
82
85
|
ComputeWithMultinodeSupport,
|
|
83
86
|
ComputeWithPlacementGroupSupport,
|
|
84
87
|
Compute,
|
|
@@ -148,6 +151,18 @@ class NebiusCompute(
|
|
|
148
151
|
if backend_data.cluster is not None:
|
|
149
152
|
cluster_id = backend_data.cluster.id
|
|
150
153
|
|
|
154
|
+
labels = {
|
|
155
|
+
"owner": "dstack",
|
|
156
|
+
"dstack_project": instance_config.project_name.lower(),
|
|
157
|
+
"dstack_name": instance_config.instance_name,
|
|
158
|
+
"dstack_user": instance_config.user.lower(),
|
|
159
|
+
}
|
|
160
|
+
labels = merge_tags(
|
|
161
|
+
base_tags=labels,
|
|
162
|
+
backend_tags=self.config.tags,
|
|
163
|
+
resource_tags=instance_config.tags,
|
|
164
|
+
)
|
|
165
|
+
labels = resources.filter_invalid_labels(labels)
|
|
151
166
|
gpus = instance_offer.instance.resources.gpus
|
|
152
167
|
create_disk_op = resources.create_disk(
|
|
153
168
|
sdk=self._sdk,
|
|
@@ -157,6 +172,7 @@ class NebiusCompute(
|
|
|
157
172
|
image_family="ubuntu24.04-cuda12"
|
|
158
173
|
if gpus and gpus[0].name == "B200"
|
|
159
174
|
else "ubuntu22.04-cuda12",
|
|
175
|
+
labels=labels,
|
|
160
176
|
)
|
|
161
177
|
create_instance_op = None
|
|
162
178
|
try:
|
|
@@ -182,6 +198,7 @@ class NebiusCompute(
|
|
|
182
198
|
disk_id=create_disk_op.resource_id,
|
|
183
199
|
subnet_id=self._get_subnet_id(instance_offer.region),
|
|
184
200
|
preemptible=instance_offer.instance.resources.spot,
|
|
201
|
+
labels=labels,
|
|
185
202
|
)
|
|
186
203
|
_wait_for_instance(self._sdk, create_instance_op)
|
|
187
204
|
except BaseException:
|
|
@@ -3,6 +3,7 @@ import json
|
|
|
3
3
|
from nebius.aio.service_error import RequestError
|
|
4
4
|
|
|
5
5
|
from dstack._internal.core.backends.base.configurator import (
|
|
6
|
+
TAGS_MAX_NUM,
|
|
6
7
|
BackendRecord,
|
|
7
8
|
Configurator,
|
|
8
9
|
raise_invalid_credentials_error,
|
|
@@ -18,6 +19,7 @@ from dstack._internal.core.backends.nebius.models import (
|
|
|
18
19
|
NebiusServiceAccountCreds,
|
|
19
20
|
NebiusStoredConfig,
|
|
20
21
|
)
|
|
22
|
+
from dstack._internal.core.errors import BackendError, ServerClientError
|
|
21
23
|
from dstack._internal.core.models.backends.base import BackendType
|
|
22
24
|
|
|
23
25
|
|
|
@@ -53,6 +55,19 @@ class NebiusConfigurator(
|
|
|
53
55
|
f" some of the valid options: {sorted(valid_fabrics)}"
|
|
54
56
|
),
|
|
55
57
|
)
|
|
58
|
+
self._check_config_tags(config)
|
|
59
|
+
|
|
60
|
+
def _check_config_tags(self, config: NebiusBackendConfigWithCreds):
    """
    Validates the backend-level tags of the config. Does nothing if no tags are set.

    Raises:
        ServerClientError: If there are more than `TAGS_MAX_NUM` tags, or if
            `resources.validate_labels()` rejects the tags.
    """
    if not config.tags:
        return
    if len(config.tags) > TAGS_MAX_NUM:
        raise ServerClientError(
            f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed."
        )
    try:
        resources.validate_labels(config.tags)
    except BackendError as e:
        # Re-raise as a client error with the same message, keeping the backend error
        # as the explicit cause so the original traceback is preserved.
        raise ServerClientError(e.args[0]) from e
|
|
56
71
|
|
|
57
72
|
def create_backend(
|
|
58
73
|
self, project_name: str, config: NebiusBackendConfigWithCreds
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Annotated, Dict, Literal, Optional, Union
|
|
2
4
|
|
|
3
5
|
from pydantic import Field, root_validator
|
|
4
6
|
|
|
@@ -27,16 +29,38 @@ class NebiusServiceAccountCreds(CoreModel):
|
|
|
27
29
|
)
|
|
28
30
|
),
|
|
29
31
|
]
|
|
32
|
+
filename: Annotated[
|
|
33
|
+
Optional[str], Field(description="The path to the service account credentials file")
|
|
34
|
+
] = None
|
|
30
35
|
|
|
31
36
|
|
|
32
37
|
class NebiusServiceAccountFileCreds(CoreModel):
|
|
33
38
|
type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
|
|
34
39
|
"service_account"
|
|
35
40
|
)
|
|
36
|
-
service_account_id: Annotated[
|
|
37
|
-
|
|
41
|
+
service_account_id: Annotated[
|
|
42
|
+
Optional[str],
|
|
43
|
+
Field(
|
|
44
|
+
description=(
|
|
45
|
+
"Service account ID. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
|
|
46
|
+
)
|
|
47
|
+
),
|
|
48
|
+
] = None
|
|
49
|
+
public_key_id: Annotated[
|
|
50
|
+
Optional[str],
|
|
51
|
+
Field(
|
|
52
|
+
description=(
|
|
53
|
+
"ID of the service account public key. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
|
|
54
|
+
)
|
|
55
|
+
),
|
|
56
|
+
] = None
|
|
38
57
|
private_key_file: Annotated[
|
|
39
|
-
Optional[str],
|
|
58
|
+
Optional[str],
|
|
59
|
+
Field(
|
|
60
|
+
description=(
|
|
61
|
+
"Path to the service account private key. Set automatically if `filename` or `private_key_content` is specified. When configuring via the UI, it must be specified explicitly"
|
|
62
|
+
)
|
|
63
|
+
),
|
|
40
64
|
] = None
|
|
41
65
|
private_key_content: Annotated[
|
|
42
66
|
Optional[str],
|
|
@@ -44,13 +68,35 @@ class NebiusServiceAccountFileCreds(CoreModel):
|
|
|
44
68
|
description=(
|
|
45
69
|
"Content of the service account private key. When configuring via"
|
|
46
70
|
" `server/config.yml`, it's automatically filled from `private_key_file`."
|
|
47
|
-
" When configuring via UI, it has to be specified explicitly
|
|
71
|
+
" When configuring via UI, it has to be specified explicitly"
|
|
48
72
|
)
|
|
49
73
|
),
|
|
50
74
|
] = None
|
|
75
|
+
filename: Annotated[
|
|
76
|
+
Optional[str], Field(description="The path to the service account credentials file")
|
|
77
|
+
] = None
|
|
51
78
|
|
|
52
79
|
@root_validator
def fill_data(cls, values):
    """
    Fills the credentials fields from files referenced in the config.

    If `filename` is set, the JSON credentials file is read and parsed with
    `ServiceAccountCredentials.from_json()`, and `service_account_id`, `public_key_id`,
    and `private_key_content` are taken from it. Otherwise `private_key_content` is filled
    from `private_key_file` by the `fill_data()` helper.

    Raises:
        ValueError: If the credentials file cannot be opened or parsed.
    """
    if filename := values.get("filename"):
        try:
            # JSON files are UTF-8; do not depend on the locale's default encoding.
            with open(Path(filename).expanduser(), encoding="utf-8") as f:
                data = json.load(f)
            from nebius.base.service_account.credentials_file import (
                ServiceAccountCredentials,
            )

            credentials = ServiceAccountCredentials.from_json(data)
            subject = credentials.subject_credentials
            values["service_account_id"] = subject.sub
            values["public_key_id"] = subject.kid
            values["private_key_content"] = subject.private_key
        except OSError as e:
            # Chain the cause: OSError also covers e.g. permission errors, and the original
            # exception tells which one it was.
            raise ValueError(f"No such file {filename}") from e
        except Exception as e:
            raise ValueError(f"Failed to parse credentials file {filename}: {e}") from e
        return values

    # NOTE(review): inside the method body `fill_data` resolves to a module-level name,
    # not to this validator — presumably the helper imported from base models; confirm.
    return fill_data(
        values, filename_field="private_key_file", data_field="private_key_content"
    )
|
|
@@ -95,6 +141,12 @@ class NebiusBackendConfig(CoreModel):
|
|
|
95
141
|
)
|
|
96
142
|
),
|
|
97
143
|
] = None
|
|
144
|
+
tags: Annotated[
|
|
145
|
+
Optional[Dict[str, str]],
|
|
146
|
+
Field(
|
|
147
|
+
description="The tags (labels) that will be assigned to resources created by `dstack`"
|
|
148
|
+
),
|
|
149
|
+
] = None
|
|
98
150
|
|
|
99
151
|
|
|
100
152
|
class NebiusBackendConfigWithCreds(NebiusBackendConfig):
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
import re
|
|
2
3
|
import time
|
|
3
4
|
from collections import defaultdict
|
|
4
5
|
from collections.abc import Container as ContainerT
|
|
5
6
|
from collections.abc import Generator, Iterable, Sequence
|
|
6
7
|
from contextlib import contextmanager
|
|
7
8
|
from tempfile import NamedTemporaryFile
|
|
8
|
-
from typing import Optional
|
|
9
|
+
from typing import Dict, Optional
|
|
9
10
|
|
|
10
11
|
from nebius.aio.authorization.options import options_to_metadata
|
|
11
12
|
from nebius.aio.operation import Operation as SDKOperation
|
|
@@ -249,13 +250,14 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
|
|
|
249
250
|
|
|
250
251
|
|
|
251
252
|
def create_disk(
|
|
252
|
-
sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str
|
|
253
|
+
sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str, labels: Dict[str, str]
|
|
253
254
|
) -> SDKOperation[Operation]:
|
|
254
255
|
client = DiskServiceClient(sdk)
|
|
255
256
|
request = CreateDiskRequest(
|
|
256
257
|
metadata=ResourceMetadata(
|
|
257
258
|
name=name,
|
|
258
259
|
parent_id=project_id,
|
|
260
|
+
labels=labels,
|
|
259
261
|
),
|
|
260
262
|
spec=DiskSpec(
|
|
261
263
|
size_mebibytes=size_mib,
|
|
@@ -288,12 +290,14 @@ def create_instance(
|
|
|
288
290
|
disk_id: str,
|
|
289
291
|
subnet_id: str,
|
|
290
292
|
preemptible: bool,
|
|
293
|
+
labels: Dict[str, str],
|
|
291
294
|
) -> SDKOperation[Operation]:
|
|
292
295
|
client = InstanceServiceClient(sdk)
|
|
293
296
|
request = CreateInstanceRequest(
|
|
294
297
|
metadata=ResourceMetadata(
|
|
295
298
|
name=name,
|
|
296
299
|
parent_id=project_id,
|
|
300
|
+
labels=labels,
|
|
297
301
|
),
|
|
298
302
|
spec=InstanceSpec(
|
|
299
303
|
cloud_init_user_data=user_data,
|
|
@@ -367,3 +371,42 @@ def delete_cluster(sdk: SDK, cluster_id: str) -> None:
|
|
|
367
371
|
metadata=REQUEST_MD,
|
|
368
372
|
)
|
|
369
373
|
)
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
    """
    Returns a new dict with only the labels accepted by `_is_valid_label()`.

    Each rejected label is reported with a warning and left out; `labels` is not modified.
    """
    valid_labels: Dict[str, str] = {}
    for key, value in labels.items():
        if _is_valid_label(key, value):
            valid_labels[key] = value
        else:
            logger.warning("Skipping invalid label '%s: %s'", key, value)
    return valid_labels
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def validate_labels(labels: Dict[str, str]):
    """
    Checks that every label is accepted by `_is_valid_label()`.

    Raises:
        BackendError: If at least one label is invalid.
    """
    if not all(_is_valid_label(key, value) for key, value in labels.items()):
        raise BackendError("Invalid resource labels")
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _is_valid_label(key: str, value: str) -> bool:
    """Returns `True` if the key is a valid resource name and the value is a valid label value."""
    # TODO: [Nebius] current validation logic reuses GCP's approach.
    # There is no public information on Nebius labels restrictions.
    if not is_valid_resource_name(key):
        return False
    return is_valid_label_value(value)
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
MAX_RESOURCE_NAME_LEN = 63
NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")
LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$")


def is_valid_resource_name(name: str) -> bool:
    """
    Returns `True` if `name` is 1 to `MAX_RESOURCE_NAME_LEN` characters long, starts with
    a lowercase letter, and otherwise contains only lowercase letters, digits, `_`, and `-`.
    """
    if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN:
        return False
    # fullmatch() rather than match(): `$` also matches just before a trailing newline,
    # so match() would accept names such as "abc\n".
    return NAME_PATTERN.fullmatch(name) is not None


def is_valid_label_value(value: str) -> bool:
    """
    Returns `True` if `value` is at most 63 characters long (possibly empty) and contains
    only lowercase letters, digits, `_`, and `-`.
    """
    # fullmatch() rather than match(): `$` also matches just before a trailing newline,
    # so match() would accept values such as "abc\n".
    return LABEL_VALUE_PATTERN.fullmatch(value) is not None
|
|
@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
9
9
|
ComputeWithAllOffersCached,
|
|
10
10
|
ComputeWithCreateInstanceSupport,
|
|
11
11
|
ComputeWithMultinodeSupport,
|
|
12
|
+
ComputeWithPrivilegedSupport,
|
|
12
13
|
generate_unique_instance_name,
|
|
13
14
|
get_user_data,
|
|
14
15
|
)
|
|
@@ -50,6 +51,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("50GB"), max=Memory.pars
|
|
|
50
51
|
class OCICompute(
|
|
51
52
|
ComputeWithAllOffersCached,
|
|
52
53
|
ComputeWithCreateInstanceSupport,
|
|
54
|
+
ComputeWithPrivilegedSupport,
|
|
53
55
|
ComputeWithMultinodeSupport,
|
|
54
56
|
Compute,
|
|
55
57
|
):
|
|
@@ -118,7 +120,11 @@ class OCICompute(
|
|
|
118
120
|
availability_domain = instance_offer.availability_zones[0]
|
|
119
121
|
|
|
120
122
|
listing, package = resources.get_marketplace_listing_and_package(
|
|
121
|
-
|
|
123
|
+
gpu_name=(
|
|
124
|
+
instance_offer.instance.resources.gpus[0].name
|
|
125
|
+
if len(instance_offer.instance.resources.gpus) > 0
|
|
126
|
+
else None
|
|
127
|
+
),
|
|
122
128
|
client=region.marketplace_client,
|
|
123
129
|
)
|
|
124
130
|
resources.accept_marketplace_listing_agreements(
|
|
@@ -23,7 +23,9 @@ import oci
|
|
|
23
23
|
from oci.object_storage.models import CreatePreauthenticatedRequestDetails
|
|
24
24
|
|
|
25
25
|
from dstack import version
|
|
26
|
+
from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules
|
|
26
27
|
from dstack._internal.core.backends.oci.region import OCIRegionClient
|
|
28
|
+
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
|
|
27
29
|
from dstack._internal.core.errors import BackendError
|
|
28
30
|
from dstack._internal.core.models.instances import InstanceOffer
|
|
29
31
|
from dstack._internal.utils.common import batched
|
|
@@ -352,11 +354,14 @@ def terminate_instance_if_exists(client: oci.core.ComputeClient, instance_id: st
|
|
|
352
354
|
|
|
353
355
|
|
|
354
356
|
def get_marketplace_listing_and_package(
|
|
355
|
-
|
|
357
|
+
gpu_name: Optional[str], client: oci.marketplace.MarketplaceClient
|
|
356
358
|
) -> Tuple[oci.marketplace.models.Listing, oci.marketplace.models.ImageListingPackage]:
|
|
357
359
|
listing_name = f"dstack-{version.base_image}"
|
|
358
|
-
if
|
|
359
|
-
|
|
360
|
+
if gpu_name is not None:
|
|
361
|
+
if not requires_nvidia_proprietary_kernel_modules(gpu_name):
|
|
362
|
+
listing_name = f"dstack-cuda-{version.base_image}"
|
|
363
|
+
else:
|
|
364
|
+
listing_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
|
|
360
365
|
|
|
361
366
|
listing_summaries = list_marketplace_listings(listing_name, client)
|
|
362
367
|
if len(listing_summaries) != 1:
|
|
@@ -8,6 +8,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
8
8
|
ComputeWithMultinodeSupport,
|
|
9
9
|
ComputeWithPlacementGroupSupport,
|
|
10
10
|
ComputeWithPrivateGatewaySupport,
|
|
11
|
+
ComputeWithPrivilegedSupport,
|
|
11
12
|
ComputeWithReservationSupport,
|
|
12
13
|
ComputeWithVolumeSupport,
|
|
13
14
|
)
|
|
@@ -31,6 +32,7 @@ class {{ backend_name }}Compute(
|
|
|
31
32
|
# TODO: Choose ComputeWith* classes to extend and implement
|
|
32
33
|
# ComputeWithAllOffersCached,
|
|
33
34
|
# ComputeWithCreateInstanceSupport,
|
|
35
|
+
# ComputeWithPrivilegedSupport,
|
|
34
36
|
# ComputeWithMultinodeSupport,
|
|
35
37
|
# ComputeWithReservationSupport,
|
|
36
38
|
# ComputeWithPlacementGroupSupport,
|
|
@@ -6,6 +6,7 @@ import requests
|
|
|
6
6
|
from dstack._internal.core.backends.base.backend import Compute
|
|
7
7
|
from dstack._internal.core.backends.base.compute import (
|
|
8
8
|
ComputeWithCreateInstanceSupport,
|
|
9
|
+
ComputeWithPrivilegedSupport,
|
|
9
10
|
generate_unique_instance_name,
|
|
10
11
|
get_shim_commands,
|
|
11
12
|
)
|
|
@@ -32,6 +33,7 @@ MAX_INSTANCE_NAME_LEN = 60
|
|
|
32
33
|
|
|
33
34
|
class TensorDockCompute(
|
|
34
35
|
ComputeWithCreateInstanceSupport,
|
|
36
|
+
ComputeWithPrivilegedSupport,
|
|
35
37
|
Compute,
|
|
36
38
|
):
|
|
37
39
|
def __init__(self, config: TensorDockConfig):
|
|
@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
9
9
|
ComputeWithAllOffersCached,
|
|
10
10
|
ComputeWithCreateInstanceSupport,
|
|
11
11
|
ComputeWithMultinodeSupport,
|
|
12
|
+
ComputeWithPrivilegedSupport,
|
|
12
13
|
generate_unique_instance_name,
|
|
13
14
|
get_user_data,
|
|
14
15
|
)
|
|
@@ -35,6 +36,7 @@ MAX_INSTANCE_NAME_LEN = 64
|
|
|
35
36
|
class VultrCompute(
|
|
36
37
|
ComputeWithAllOffersCached,
|
|
37
38
|
ComputeWithCreateInstanceSupport,
|
|
39
|
+
ComputeWithPrivilegedSupport,
|
|
38
40
|
ComputeWithMultinodeSupport,
|
|
39
41
|
Compute,
|
|
40
42
|
):
|
|
@@ -53,6 +53,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
|
|
|
53
53
|
}
|
|
54
54
|
if all(js.exit_status is None for js in job_submissions):
|
|
55
55
|
job_submissions_excludes["exit_status"] = True
|
|
56
|
+
if all(js.status_message == "" for js in job_submissions):
|
|
57
|
+
job_submissions_excludes["status_message"] = True
|
|
58
|
+
if all(js.error is None for js in job_submissions):
|
|
59
|
+
job_submissions_excludes["error"] = True
|
|
56
60
|
if all(js.deployment_num == 0 for js in job_submissions):
|
|
57
61
|
job_submissions_excludes["deployment_num"] = True
|
|
58
62
|
if all(not js.probes for js in job_submissions):
|
|
@@ -71,6 +75,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
|
|
|
71
75
|
}
|
|
72
76
|
if latest_job_submission.exit_status is None:
|
|
73
77
|
latest_job_submission_excludes["exit_status"] = True
|
|
78
|
+
if latest_job_submission.status_message == "":
|
|
79
|
+
latest_job_submission_excludes["status_message"] = True
|
|
80
|
+
if latest_job_submission.error is None:
|
|
81
|
+
latest_job_submission_excludes["error"] = True
|
|
74
82
|
if latest_job_submission.deployment_num == 0:
|
|
75
83
|
latest_job_submission_excludes["deployment_num"] = True
|
|
76
84
|
if not latest_job_submission.probes:
|
dstack/_internal/core/consts.py
CHANGED
|
@@ -4,3 +4,5 @@ DSTACK_SHIM_HTTP_PORT = 10998
|
|
|
4
4
|
DSTACK_RUNNER_HTTP_PORT = 10999
|
|
5
5
|
# ssh server (runs alongside the runner inside a container) listen port
|
|
6
6
|
DSTACK_RUNNER_SSH_PORT = 10022
|
|
7
|
+
# legacy AWS, Azure, GCP, and OCI image for older GPUs
|
|
8
|
+
DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES = "0.10"
|