dstack 0.19.30__py3-none-any.whl → 0.19.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +8 -0
- dstack/_internal/cli/commands/project.py +27 -20
- dstack/_internal/cli/commands/server.py +5 -0
- dstack/_internal/cli/main.py +1 -3
- dstack/_internal/core/backends/aws/compute.py +2 -0
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +32 -9
- dstack/_internal/core/backends/base/offers.py +1 -0
- dstack/_internal/core/backends/cloudrift/compute.py +2 -0
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
- dstack/_internal/core/backends/features.py +5 -0
- dstack/_internal/core/backends/gcp/compute.py +74 -34
- dstack/_internal/core/backends/gcp/configurator.py +1 -1
- dstack/_internal/core/backends/gcp/models.py +14 -1
- dstack/_internal/core/backends/gcp/resources.py +35 -12
- dstack/_internal/core/backends/hotaisle/compute.py +2 -0
- dstack/_internal/core/backends/kubernetes/compute.py +466 -213
- dstack/_internal/core/backends/kubernetes/models.py +13 -16
- dstack/_internal/core/backends/kubernetes/utils.py +145 -8
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +2 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +2 -0
- dstack/_internal/server/background/tasks/common.py +2 -0
- dstack/_internal/server/background/tasks/process_instances.py +2 -2
- dstack/_internal/server/services/offers.py +7 -1
- dstack/_internal/server/testing/common.py +2 -0
- dstack/_internal/server/utils/provisioning.py +3 -10
- dstack/version.py +1 -1
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/METADATA +11 -9
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/RECORD +39 -39
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/WHEEL +0 -0
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/kubernetes/models.py CHANGED

@@ -5,12 +5,14 @@ from pydantic import Field, root_validator
 from dstack._internal.core.backends.base.models import fill_data
 from dstack._internal.core.models.common import CoreModel
 
+DEFAULT_NAMESPACE = "default"
 
-
-
-
+
+class KubernetesProxyJumpConfig(CoreModel):
+    hostname: Annotated[
+        Optional[str], Field(description="The external IP address or hostname of any node")
     ] = None
-
+    port: Annotated[
         Optional[int], Field(description="Any port accessible outside of the cluster")
     ] = None
 
@@ -22,16 +24,15 @@ class KubeconfigConfig(CoreModel):
 
 class KubernetesBackendConfig(CoreModel):
     type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes"
-
-        Optional[
+    proxy_jump: Annotated[
+        Optional[KubernetesProxyJumpConfig], Field(description="The SSH proxy jump configuration")
     ] = None
+    namespace: Annotated[
+        str, Field(description="The namespace for resources managed by `dstack`")
+    ] = DEFAULT_NAMESPACE
 
 
-class KubernetesBackendConfigWithCreds(
-    type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes"
-    networking: Annotated[
-        Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
-    ] = None
+class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
     kubeconfig: Annotated[KubeconfigConfig, Field(description="The kubeconfig configuration")]
 
 
@@ -53,11 +54,7 @@ class KubeconfigFileConfig(CoreModel):
     return fill_data(values)
 
 
-class KubernetesBackendFileConfigWithCreds(
-    type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes"
-    networking: Annotated[
-        Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
-    ] = None
+class KubernetesBackendFileConfigWithCreds(KubernetesBackendConfig):
     kubeconfig: Annotated[KubeconfigFileConfig, Field(description="The kubeconfig configuration")]
 
 
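For orientation, here is a minimal sketch (not part of the diff) of how the new fields compose. It assumes the models behave as ordinary pydantic models, which is how `CoreModel` subclasses are used elsewhere in dstack; only the class and field names come from the hunks above, the values are made up.

```python
# Hypothetical usage sketch; requires dstack 0.19.31 installed.
from dstack._internal.core.backends.kubernetes.models import (
    KubernetesBackendConfig,
    KubernetesProxyJumpConfig,
)

config = KubernetesBackendConfig(
    proxy_jump=KubernetesProxyJumpConfig(hostname="203.0.113.10", port=32000),
    # namespace is optional and falls back to DEFAULT_NAMESPACE ("default")
)
print(config.namespace)            # "default"
print(config.proxy_jump.hostname)  # "203.0.113.10"
```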
dstack/_internal/core/backends/kubernetes/utils.py CHANGED

@@ -1,20 +1,157 @@
-
+import ast
+from typing import Any, Callable, List, Literal, Optional, TypeVar, Union, get_origin, overload
 
-import kubernetes
 import yaml
+from kubernetes import client as kubernetes_client
+from kubernetes import config as kubernetes_config
+from typing_extensions import ParamSpec
 
+T = TypeVar("T")
+P = ParamSpec("P")
 
-
+
+def get_api_from_config_data(kubeconfig_data: str) -> kubernetes_client.CoreV1Api:
     config_dict = yaml.load(kubeconfig_data, yaml.FullLoader)
     return get_api_from_config_dict(config_dict)
 
 
-def get_api_from_config_dict(kubeconfig:
-    api_client =
-    return
+def get_api_from_config_dict(kubeconfig: dict) -> kubernetes_client.CoreV1Api:
+    api_client = kubernetes_config.new_client_from_config_dict(config_dict=kubeconfig)
+    return kubernetes_client.CoreV1Api(api_client=api_client)
+
+
+@overload
+def call_api_method(
+    method: Callable[P, Any],
+    type_: type[T],
+    expected: None = None,
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> T: ...
+
+
+@overload
+def call_api_method(
+    method: Callable[P, Any],
+    type_: type[T],
+    expected: Union[int, tuple[int, ...], list[int]],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> Optional[T]: ...
+
+
+def call_api_method(
+    method: Callable[P, Any],
+    type_: type[T],
+    expected: Optional[Union[int, tuple[int, ...], list[int]]] = None,
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> Optional[T]:
+    """
+    Returns the result of the API method call, optionally ignoring specified HTTP status codes.
+
+    Args:
+        method: the `CoreV1Api` bound method.
+        type_: The expected type of the return value, used for runtime type checking and
+            as a type hint for a static type checker (as kubernetes package is not type-annotated).
+            NB: For composite types, only "origin" type is checked, e.g., list, not list[Node]
+        expected: Expected error statuses, e.g., 404.
+        args: positional arguments of the method.
+        kwargs: keyword arguments of the method.
+    Returns:
+        The return value or `None` in case of the expected error.
+    """
+    if isinstance(expected, int):
+        expected = (expected,)
+    result: T
+    try:
+        result = method(*args, **kwargs)
+    except kubernetes_client.ApiException as e:
+        if expected is None or e.status not in expected:
+            raise
+        return None
+    if not isinstance(result, get_origin(type_) or type_):
+        raise TypeError(
+            f"{method.__name__} returned {type(result).__name__}, expected {type_.__name__}"
+        )
+    return result
+
+
+@overload
+def get_value(
+    obj: object, path: str, type_: type[T], *, required: Literal[False] = False
+) -> Optional[T]: ...
+
+
+@overload
+def get_value(obj: object, path: str, type_: type[T], *, required: Literal[True]) -> T: ...
+
+
+def get_value(obj: object, path: str, type_: type[T], *, required: bool = False) -> Optional[T]:
+    """
+    Returns the value at a given path.
+    Supports object attributes, sequence indices, and mapping keys.
+
+    Args:
+        obj: The object to traverse.
+        path: The path to the value, regular Python syntax. The leading dot is optional, all the
+            following are correct: `.attr`, `attr`, `.[0]`, `[0]`, `.['key']`, `['key']`.
+        type_: The expected type of the value, used for runtime type checking and as a type hint
+            for a static type checker (as kubernetes package is not type-annotated).
+            NB: For composite types, only "origin" type is checked, e.g., list, not list[Node]
+        required: If `True`, the value must exist and must not be `None`. If `False` (safe
+            navigation mode), the may not exist and may be `None`.
+
+    Returns:
+        The requested value or `None` in case of failed traverse when required=False.
+    """
+    _path = path.removeprefix(".")
+    if _path.startswith("["):
+        src = f"obj{_path}"
+    else:
+        src = f"obj.{_path}"
+    module = ast.parse(src)
+    assert len(module.body) == 1, ast.dump(module, indent=4)
+    root_expr = module.body[0]
+    assert isinstance(root_expr, ast.Expr), ast.dump(module, indent=4)
+    varname: Optional[str] = None
+    expr = root_expr.value
+    while True:
+        if isinstance(expr, ast.Name):
+            varname = expr.id
+            break
+        if __debug__:
+            if isinstance(expr, ast.Subscript):
+                if isinstance(expr.slice, ast.UnaryOp):
+                    # .items[-1]
+                    assert isinstance(expr.slice.op, ast.USub), ast.dump(expr, indent=4)
+                    assert isinstance(expr.slice.operand, ast.Constant), ast.dump(expr, indent=4)
+                    assert isinstance(expr.slice.operand.value, int), ast.dump(expr, indent=4)
+                else:
+                    # .items[0], .labels["name"]
+                    assert isinstance(expr.slice, ast.Constant), ast.dump(expr, indent=4)
+            else:
+                assert isinstance(expr, ast.Attribute), ast.dump(expr, indent=4)
+        else:
+            assert isinstance(expr, (ast.Attribute, ast.Subscript))
+        expr = expr.value
+    assert varname is not None, ast.dump(module)
+    try:
+        value = eval(src, {"__builtins__": {}}, {"obj": obj})
+    except (AttributeError, KeyError, IndexError, TypeError) as e:
+        if required:
+            raise type(e)(f"Failed to traverse {path}: {e}") from e
+        return None
+    if value is None:
+        if required:
+            raise TypeError(f"Required {path} is None")
+        return value
+    if not isinstance(value, get_origin(type_) or type_):
+        raise TypeError(f"{path} value is {type(value).__name__}, expected {type_.__name__}")
+    return value
 
 
-def get_cluster_public_ip(api_client:
+def get_cluster_public_ip(api_client: kubernetes_client.CoreV1Api) -> Optional[str]:
     """
     Returns public IP of any cluster node.
     """
@@ -24,7 +161,7 @@ def get_cluster_public_ip(api_client: kubernetes.client.CoreV1Api) -> Optional[str]:
     return public_ips[0]
 
 
-def get_cluster_public_ips(api_client:
+def get_cluster_public_ips(api_client: kubernetes_client.CoreV1Api) -> List[str]:
     """
     Returns public IPs of all cluster nodes.
     """
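The helpers added above are generic wrappers around the untyped `kubernetes` client. A hypothetical usage sketch, assuming a reachable cluster and a kubeconfig at the default location; the call sites below are illustrative and not taken from this diff:

```python
import pathlib

from kubernetes import client as kubernetes_client

from dstack._internal.core.backends.kubernetes.utils import (
    call_api_method,
    get_api_from_config_data,
    get_value,
)

api = get_api_from_config_data((pathlib.Path.home() / ".kube" / "config").read_text())

# Returns None instead of raising when the namespace is missing (HTTP 404 is "expected").
ns = call_api_method(
    api.read_namespace, kubernetes_client.V1Namespace, 404, name="dstack-example"
)

# Safe navigation through untyped API objects: None if any step along the path is absent.
nodes = call_api_method(api.list_node, kubernetes_client.V1NodeList)
first_address = get_value(nodes, ".items[0].status.addresses[0].address", str)
print(ns, first_address)
```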
dstack/_internal/core/backends/lambdalabs/compute.py CHANGED

@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.compute import (
     Compute,
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_shim_commands,
 )
@@ -31,6 +32,7 @@ MAX_INSTANCE_NAME_LEN = 60
 class LambdaCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     Compute,
 ):
     def __init__(self, config: LambdaConfig):
dstack/_internal/core/backends/local/compute.py CHANGED

@@ -3,6 +3,7 @@ from typing import List, Optional
 from dstack._internal.core.backends.base.compute import (
     Compute,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithVolumeSupport,
 )
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -25,6 +26,7 @@ logger = get_logger(__name__)
 
 class LocalCompute(
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithVolumeSupport,
     Compute,
 ):
dstack/_internal/core/backends/nebius/compute.py CHANGED

@@ -16,6 +16,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
 )
@@ -79,6 +80,7 @@ SUPPORTED_PLATFORMS = [
 class NebiusCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     Compute,
dstack/_internal/core/backends/oci/compute.py CHANGED

@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
 )
@@ -50,6 +51,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("50GB"), max=Memory.pars
 class OCICompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
     Compute,
 ):
dstack/_internal/core/backends/template/compute.py.jinja CHANGED

@@ -8,6 +8,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithPrivateGatewaySupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithReservationSupport,
     ComputeWithVolumeSupport,
 )
@@ -31,6 +32,7 @@ class {{ backend_name }}Compute(
     # TODO: Choose ComputeWith* classes to extend and implement
     # ComputeWithAllOffersCached,
     # ComputeWithCreateInstanceSupport,
+    # ComputeWithPrivilegedSupport,
     # ComputeWithMultinodeSupport,
     # ComputeWithReservationSupport,
     # ComputeWithPlacementGroupSupport,
dstack/_internal/core/backends/tensordock/compute.py CHANGED

@@ -6,6 +6,7 @@ import requests
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_shim_commands,
 )
@@ -32,6 +33,7 @@ MAX_INSTANCE_NAME_LEN = 60
 
 class TensorDockCompute(
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     Compute,
 ):
     def __init__(self, config: TensorDockConfig):
dstack/_internal/core/backends/vultr/compute.py CHANGED

@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
 )
@@ -35,6 +36,7 @@ MAX_INSTANCE_NAME_LEN = 64
 class VultrCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
     Compute,
 ):
dstack/_internal/server/background/tasks/common.py CHANGED

@@ -19,4 +19,6 @@ def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str)
         return timedelta(minutes=20)
     if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
         return timedelta(minutes=55)
+    if backend_type == BackendType.GCP and instance_type_name == "a4-highgpu-8g":
+        return timedelta(minutes=16)
     return timedelta(minutes=10)
dstack/_internal/server/background/tasks/process_instances.py CHANGED

@@ -307,7 +307,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         )
         deploy_timeout = 20 * 60  # 20 minutes
         result = await asyncio.wait_for(future, timeout=deploy_timeout)
-        health, host_info,
+        health, host_info, arch = result
     except (asyncio.TimeoutError, TimeoutError) as e:
         raise ProvisioningError(f"Deploy timeout: {e}") from e
     except Exception as e:
@@ -327,7 +327,7 @@ async def _add_remote(instance: InstanceModel) -> None:
         instance.status = InstanceStatus.PENDING
         return
 
-    instance_type = host_info_to_instance_type(host_info,
+    instance_type = host_info_to_instance_type(host_info, arch)
     instance_network = None
     internal_ip = None
     try:
dstack/_internal/server/services/offers.py CHANGED

@@ -7,6 +7,7 @@ from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport
 from dstack._internal.core.backends.features import (
     BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
     BACKENDS_WITH_MULTINODE_SUPPORT,
+    BACKENDS_WITH_PRIVILEGED_SUPPORT,
     BACKENDS_WITH_RESERVATION_SUPPORT,
 )
 from dstack._internal.core.models.backends.base import BackendType
@@ -67,7 +68,12 @@ async def get_offers_by_requirements(
             backend_types = BACKENDS_WITH_MULTINODE_SUPPORT
         backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT]
 
-    if privileged
+    if privileged:
+        if backend_types is None:
+            backend_types = BACKENDS_WITH_PRIVILEGED_SUPPORT
+        backend_types = [b for b in backend_types if b in BACKENDS_WITH_PRIVILEGED_SUPPORT]
+
+    if instance_mounts:
         if backend_types is None:
             backend_types = BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
         backend_types = [b for b in backend_types if b in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT]
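Most of the compute.py changes in this release follow the same capability-mixin pattern: a backend opts in to privileged containers by adding `ComputeWithPrivilegedSupport` to its `Compute` class bases, and offer filtering keeps only backends that carry the marker. A self-contained sketch of that pattern follows; the class and variable names mirror the diff, but the registry, the example backends, and the helper function are illustrative, not dstack's actual implementation.

```python
class Compute: ...
class ComputeWithPrivilegedSupport: ...  # marker mixin, no behavior required


class ExampleCompute(ComputeWithPrivilegedSupport, Compute): ...  # opts in
class OtherExampleCompute(Compute): ...                           # does not


# Hypothetical registry mapping backend names to their Compute classes.
_COMPUTE_CLASSES = {"example": ExampleCompute, "other": OtherExampleCompute}

BACKENDS_WITH_PRIVILEGED_SUPPORT = [
    name
    for name, cls in _COMPUTE_CLASSES.items()
    if issubclass(cls, ComputeWithPrivilegedSupport)
]


def filter_backends(backend_types, privileged: bool):
    # Mirrors the get_offers_by_requirements change above: when privileged
    # containers are requested, keep only backends with privileged support.
    if privileged:
        if backend_types is None:
            backend_types = BACKENDS_WITH_PRIVILEGED_SUPPORT
        backend_types = [b for b in backend_types if b in BACKENDS_WITH_PRIVILEGED_SUPPORT]
    return backend_types


print(filter_backends(None, privileged=True))  # ['example']
```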
dstack/_internal/server/testing/common.py CHANGED

@@ -16,6 +16,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithPrivateGatewaySupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithReservationSupport,
     ComputeWithVolumeSupport,
 )
@@ -1131,6 +1132,7 @@ class AsyncContextManager:
 class ComputeMockSpec(
     Compute,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
     ComputeWithReservationSupport,
     ComputeWithPlacementGroupSupport,
dstack/_internal/server/utils/provisioning.py CHANGED

@@ -6,7 +6,7 @@ from textwrap import dedent
 from typing import Any, Dict, Generator, List, Optional
 
 import paramiko
-from gpuhunt import AcceleratorVendor,
+from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
 
 from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
@@ -248,14 +248,7 @@ def _get_shim_healthcheck(client: paramiko.SSHClient) -> Optional[str]:
     return out
 
 
-def host_info_to_instance_type(host_info: Dict[str, Any],
-    _cpu_arch: CPUArchitecture
-    if cpu_arch == "amd64":
-        _cpu_arch = CPUArchitecture.X86
-    elif cpu_arch == "arm64":
-        _cpu_arch = CPUArchitecture.ARM
-    else:
-        raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
+def host_info_to_instance_type(host_info: Dict[str, Any], arch: GoArchType) -> InstanceType:
     gpu_count = host_info.get("gpu_count", 0)
     if gpu_count > 0:
         gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))
@@ -280,7 +273,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType)
     instance_type = InstanceType(
         name="instance",
         resources=Resources(
-            cpu_arch=
+            cpu_arch=arch.to_cpu_architecture(),
             cpus=host_info["cpus"],
             memory_mib=host_info["memory"] / 1024 / 1024,
             spot=False,
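The removed branch mapped the raw `amd64`/`arm64` strings to `CPUArchitecture` inline; the new code delegates that to `GoArchType.to_cpu_architecture()`. A minimal, self-contained sketch of the conversion follows; the enum bodies are assumptions based on the removed mapping, not dstack's actual definitions.

```python
import enum


class CPUArchitecture(enum.Enum):
    X86 = "x86"
    ARM = "arm"


class GoArchType(enum.Enum):
    AMD64 = "amd64"
    ARM64 = "arm64"

    def to_cpu_architecture(self) -> CPUArchitecture:
        # amd64 -> X86, arm64 -> ARM, mirroring the branch removed above.
        return CPUArchitecture.X86 if self is GoArchType.AMD64 else CPUArchitecture.ARM


print(GoArchType.ARM64.to_cpu_architecture())  # CPUArchitecture.ARM
```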
dstack/version.py CHANGED

{dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dstack
-Version: 0.19.30
+Version: 0.19.31
 Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
 Project-URL: Homepage, https://dstack.ai
 Project-URL: Source, https://github.com/dstackai/dstack
@@ -331,9 +331,11 @@ Description-Content-Type: text/markdown
 
 </div>
 
-`dstack`
+`dstack` is a unified control plane for GPU provisioning and orchestration that works with any GPU cloud, Kubernetes, or on-prem clusters.
 
-
+It streamlines development, training, and inference, and is compatible with any hardware, open-source tools, and frameworks.
+
+#### Hardware
 
 `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.
 
@@ -358,15 +360,15 @@ Description-Content-Type: text/markdown
 
 #### Set up the server
 
-#####
+##### Configure backends
+
+To orchestrate compute across cloud providers or existing Kubernetes clusters, you need to configure backends.
 
-
-via the `~/.dstack/server/config.yml` file.
+Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](../concepts/projects.md#backends) in the UI.
 
-For more details
+For more details, see [Backends](../concepts/backends.md).
 
->
-> once the server is up.
+> When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](../concepts/fleets.md#ssh) once the server is up.
 
 ##### Start the server
 