dstack 0.19.30rc1__py3-none-any.whl → 0.19.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (54)
  1. dstack/_internal/cli/commands/__init__.py +8 -0
  2. dstack/_internal/cli/commands/project.py +27 -20
  3. dstack/_internal/cli/commands/server.py +5 -0
  4. dstack/_internal/cli/services/configurators/fleet.py +20 -6
  5. dstack/_internal/cli/utils/gpu.py +2 -2
  6. dstack/_internal/core/backends/aws/compute.py +13 -5
  7. dstack/_internal/core/backends/aws/resources.py +11 -6
  8. dstack/_internal/core/backends/azure/compute.py +17 -6
  9. dstack/_internal/core/backends/base/compute.py +57 -9
  10. dstack/_internal/core/backends/base/offers.py +1 -0
  11. dstack/_internal/core/backends/cloudrift/compute.py +2 -0
  12. dstack/_internal/core/backends/cudo/compute.py +2 -0
  13. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  14. dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
  15. dstack/_internal/core/backends/features.py +5 -0
  16. dstack/_internal/core/backends/gcp/compute.py +87 -38
  17. dstack/_internal/core/backends/gcp/configurator.py +1 -1
  18. dstack/_internal/core/backends/gcp/models.py +14 -1
  19. dstack/_internal/core/backends/gcp/resources.py +35 -12
  20. dstack/_internal/core/backends/hotaisle/compute.py +22 -0
  21. dstack/_internal/core/backends/kubernetes/compute.py +531 -215
  22. dstack/_internal/core/backends/kubernetes/models.py +13 -16
  23. dstack/_internal/core/backends/kubernetes/utils.py +145 -8
  24. dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
  25. dstack/_internal/core/backends/local/compute.py +2 -0
  26. dstack/_internal/core/backends/nebius/compute.py +17 -0
  27. dstack/_internal/core/backends/nebius/configurator.py +15 -0
  28. dstack/_internal/core/backends/nebius/models.py +57 -5
  29. dstack/_internal/core/backends/nebius/resources.py +45 -2
  30. dstack/_internal/core/backends/oci/compute.py +7 -1
  31. dstack/_internal/core/backends/oci/resources.py +8 -3
  32. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  33. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  34. dstack/_internal/core/backends/vultr/compute.py +2 -0
  35. dstack/_internal/core/compatibility/runs.py +8 -0
  36. dstack/_internal/core/consts.py +2 -0
  37. dstack/_internal/core/models/profiles.py +11 -4
  38. dstack/_internal/core/services/repos.py +101 -11
  39. dstack/_internal/server/background/tasks/common.py +2 -0
  40. dstack/_internal/server/background/tasks/process_fleets.py +75 -17
  41. dstack/_internal/server/background/tasks/process_instances.py +3 -5
  42. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
  43. dstack/_internal/server/background/tasks/process_runs.py +27 -23
  44. dstack/_internal/server/background/tasks/process_submitted_jobs.py +107 -54
  45. dstack/_internal/server/services/offers.py +7 -1
  46. dstack/_internal/server/testing/common.py +2 -0
  47. dstack/_internal/server/utils/provisioning.py +3 -10
  48. dstack/_internal/utils/ssh.py +22 -2
  49. dstack/version.py +2 -2
  50. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/METADATA +20 -18
  51. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/RECORD +54 -54
  52. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/WHEEL +0 -0
  53. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/entry_points.txt +0 -0
  54. {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/licenses/LICENSE.md +0 -0
@@ -5,12 +5,14 @@ from pydantic import Field, root_validator
5
5
  from dstack._internal.core.backends.base.models import fill_data
6
6
  from dstack._internal.core.models.common import CoreModel
7
7
 
8
+ DEFAULT_NAMESPACE = "default"
8
9
 
9
- class KubernetesNetworkingConfig(CoreModel):
10
- ssh_host: Annotated[
11
- Optional[str], Field(description="The external IP address of any node")
10
+
11
+ class KubernetesProxyJumpConfig(CoreModel):
12
+ hostname: Annotated[
13
+ Optional[str], Field(description="The external IP address or hostname of any node")
12
14
  ] = None
13
- ssh_port: Annotated[
15
+ port: Annotated[
14
16
  Optional[int], Field(description="Any port accessible outside of the cluster")
15
17
  ] = None
16
18
 
@@ -22,16 +24,15 @@ class KubeconfigConfig(CoreModel):
22
24
 
23
25
  class KubernetesBackendConfig(CoreModel):
24
26
  type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes"
25
- networking: Annotated[
26
- Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
27
+ proxy_jump: Annotated[
28
+ Optional[KubernetesProxyJumpConfig], Field(description="The SSH proxy jump configuration")
27
29
  ] = None
30
+ namespace: Annotated[
31
+ str, Field(description="The namespace for resources managed by `dstack`")
32
+ ] = DEFAULT_NAMESPACE
28
33
 
29
34
 
30
- class KubernetesBackendConfigWithCreds(CoreModel):
31
- type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes"
32
- networking: Annotated[
33
- Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
34
- ] = None
35
+ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
35
36
  kubeconfig: Annotated[KubeconfigConfig, Field(description="The kubeconfig configuration")]
36
37
 
37
38
 
@@ -53,11 +54,7 @@ class KubeconfigFileConfig(CoreModel):
53
54
  return fill_data(values)
54
55
 
55
56
 
56
- class KubernetesBackendFileConfigWithCreds(CoreModel):
57
- type: Annotated[Literal["kubernetes"], Field(description="The type of backend")] = "kubernetes"
58
- networking: Annotated[
59
- Optional[KubernetesNetworkingConfig], Field(description="The networking configuration")
60
- ] = None
57
+ class KubernetesBackendFileConfigWithCreds(KubernetesBackendConfig):
61
58
  kubeconfig: Annotated[KubeconfigFileConfig, Field(description="The kubeconfig configuration")]
62
59
 
63
60
 
@@ -1,20 +1,157 @@
1
- from typing import Dict, List, Optional
1
+ import ast
2
+ from typing import Any, Callable, List, Literal, Optional, TypeVar, Union, get_origin, overload
2
3
 
3
- import kubernetes
4
4
  import yaml
5
+ from kubernetes import client as kubernetes_client
6
+ from kubernetes import config as kubernetes_config
7
+ from typing_extensions import ParamSpec
5
8
 
9
+ T = TypeVar("T")
10
+ P = ParamSpec("P")
6
11
 
7
- def get_api_from_config_data(kubeconfig_data: str) -> kubernetes.client.CoreV1Api:
12
+
13
+ def get_api_from_config_data(kubeconfig_data: str) -> kubernetes_client.CoreV1Api:
8
14
  config_dict = yaml.load(kubeconfig_data, yaml.FullLoader)
9
15
  return get_api_from_config_dict(config_dict)
10
16
 
11
17
 
12
- def get_api_from_config_dict(kubeconfig: Dict) -> kubernetes.client.CoreV1Api:
13
- api_client = kubernetes.config.new_client_from_config_dict(config_dict=kubeconfig)
14
- return kubernetes.client.CoreV1Api(api_client=api_client)
18
+ def get_api_from_config_dict(kubeconfig: dict) -> kubernetes_client.CoreV1Api:
19
+ api_client = kubernetes_config.new_client_from_config_dict(config_dict=kubeconfig)
20
+ return kubernetes_client.CoreV1Api(api_client=api_client)
21
+
22
+
23
+ @overload
24
+ def call_api_method(
25
+ method: Callable[P, Any],
26
+ type_: type[T],
27
+ expected: None = None,
28
+ *args: P.args,
29
+ **kwargs: P.kwargs,
30
+ ) -> T: ...
31
+
32
+
33
+ @overload
34
+ def call_api_method(
35
+ method: Callable[P, Any],
36
+ type_: type[T],
37
+ expected: Union[int, tuple[int, ...], list[int]],
38
+ *args: P.args,
39
+ **kwargs: P.kwargs,
40
+ ) -> Optional[T]: ...
41
+
42
+
43
+ def call_api_method(
44
+ method: Callable[P, Any],
45
+ type_: type[T],
46
+ expected: Optional[Union[int, tuple[int, ...], list[int]]] = None,
47
+ *args: P.args,
48
+ **kwargs: P.kwargs,
49
+ ) -> Optional[T]:
50
+ """
51
+ Returns the result of the API method call, optionally ignoring specified HTTP status codes.
52
+
53
+ Args:
54
+ method: the `CoreV1Api` bound method.
55
+ type_: The expected type of the return value, used for runtime type checking and
56
+ as a type hint for a static type checker (as kubernetes package is not type-annotated).
57
+ NB: For composite types, only "origin" type is checked, e.g., list, not list[Node]
58
+ expected: Expected error statuses, e.g., 404.
59
+ args: positional arguments of the method.
60
+ kwargs: keyword arguments of the method.
61
+ Returns:
62
+ The return value or `None` in case of the expected error.
63
+ """
64
+ if isinstance(expected, int):
65
+ expected = (expected,)
66
+ result: T
67
+ try:
68
+ result = method(*args, **kwargs)
69
+ except kubernetes_client.ApiException as e:
70
+ if expected is None or e.status not in expected:
71
+ raise
72
+ return None
73
+ if not isinstance(result, get_origin(type_) or type_):
74
+ raise TypeError(
75
+ f"{method.__name__} returned {type(result).__name__}, expected {type_.__name__}"
76
+ )
77
+ return result
78
+
79
+
80
+ @overload
81
+ def get_value(
82
+ obj: object, path: str, type_: type[T], *, required: Literal[False] = False
83
+ ) -> Optional[T]: ...
84
+
85
+
86
+ @overload
87
+ def get_value(obj: object, path: str, type_: type[T], *, required: Literal[True]) -> T: ...
88
+
89
+
90
+ def get_value(obj: object, path: str, type_: type[T], *, required: bool = False) -> Optional[T]:
91
+ """
92
+ Returns the value at a given path.
93
+ Supports object attributes, sequence indices, and mapping keys.
94
+
95
+ Args:
96
+ obj: The object to traverse.
97
+ path: The path to the value, regular Python syntax. The leading dot is optional, all the
98
+ following are correct: `.attr`, `attr`, `.[0]`, `[0]`, `.['key']`, `['key']`.
99
+ type_: The expected type of the value, used for runtime type checking and as a type hint
100
+ for a static type checker (as kubernetes package is not type-annotated).
101
+ NB: For composite types, only "origin" type is checked, e.g., list, not list[Node]
102
+ required: If `True`, the value must exist and must not be `None`. If `False` (safe
103
+ navigation mode), the value may not exist and may be `None`.
104
+
105
+ Returns:
106
+ The requested value or `None` in case of failed traverse when required=False.
107
+ """
108
+ _path = path.removeprefix(".")
109
+ if _path.startswith("["):
110
+ src = f"obj{_path}"
111
+ else:
112
+ src = f"obj.{_path}"
113
+ module = ast.parse(src)
114
+ assert len(module.body) == 1, ast.dump(module, indent=4)
115
+ root_expr = module.body[0]
116
+ assert isinstance(root_expr, ast.Expr), ast.dump(module, indent=4)
117
+ varname: Optional[str] = None
118
+ expr = root_expr.value
119
+ while True:
120
+ if isinstance(expr, ast.Name):
121
+ varname = expr.id
122
+ break
123
+ if __debug__:
124
+ if isinstance(expr, ast.Subscript):
125
+ if isinstance(expr.slice, ast.UnaryOp):
126
+ # .items[-1]
127
+ assert isinstance(expr.slice.op, ast.USub), ast.dump(expr, indent=4)
128
+ assert isinstance(expr.slice.operand, ast.Constant), ast.dump(expr, indent=4)
129
+ assert isinstance(expr.slice.operand.value, int), ast.dump(expr, indent=4)
130
+ else:
131
+ # .items[0], .labels["name"]
132
+ assert isinstance(expr.slice, ast.Constant), ast.dump(expr, indent=4)
133
+ else:
134
+ assert isinstance(expr, ast.Attribute), ast.dump(expr, indent=4)
135
+ else:
136
+ assert isinstance(expr, (ast.Attribute, ast.Subscript))
137
+ expr = expr.value
138
+ assert varname is not None, ast.dump(module)
139
+ try:
140
+ value = eval(src, {"__builtins__": {}}, {"obj": obj})
141
+ except (AttributeError, KeyError, IndexError, TypeError) as e:
142
+ if required:
143
+ raise type(e)(f"Failed to traverse {path}: {e}") from e
144
+ return None
145
+ if value is None:
146
+ if required:
147
+ raise TypeError(f"Required {path} is None")
148
+ return value
149
+ if not isinstance(value, get_origin(type_) or type_):
150
+ raise TypeError(f"{path} value is {type(value).__name__}, expected {type_.__name__}")
151
+ return value
15
152
 
16
153
 
17
- def get_cluster_public_ip(api_client: kubernetes.client.CoreV1Api) -> Optional[str]:
154
+ def get_cluster_public_ip(api_client: kubernetes_client.CoreV1Api) -> Optional[str]:
18
155
  """
19
156
  Returns public IP of any cluster node.
20
157
  """
@@ -24,7 +161,7 @@ def get_cluster_public_ip(api_client: kubernetes.client.CoreV1Api) -> Optional[s
24
161
  return public_ips[0]
25
162
 
26
163
 
27
- def get_cluster_public_ips(api_client: kubernetes.client.CoreV1Api) -> List[str]:
164
+ def get_cluster_public_ips(api_client: kubernetes_client.CoreV1Api) -> List[str]:
28
165
  """
29
166
  Returns public IPs of all cluster nodes.
30
167
  """
@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.compute import (
9
9
  Compute,
10
10
  ComputeWithAllOffersCached,
11
11
  ComputeWithCreateInstanceSupport,
12
+ ComputeWithPrivilegedSupport,
12
13
  generate_unique_instance_name,
13
14
  get_shim_commands,
14
15
  )
@@ -31,6 +32,7 @@ MAX_INSTANCE_NAME_LEN = 60
31
32
  class LambdaCompute(
32
33
  ComputeWithAllOffersCached,
33
34
  ComputeWithCreateInstanceSupport,
35
+ ComputeWithPrivilegedSupport,
34
36
  Compute,
35
37
  ):
36
38
  def __init__(self, config: LambdaConfig):
@@ -3,6 +3,7 @@ from typing import List, Optional
3
3
  from dstack._internal.core.backends.base.compute import (
4
4
  Compute,
5
5
  ComputeWithCreateInstanceSupport,
6
+ ComputeWithPrivilegedSupport,
6
7
  ComputeWithVolumeSupport,
7
8
  )
8
9
  from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -25,6 +26,7 @@ logger = get_logger(__name__)
25
26
 
26
27
  class LocalCompute(
27
28
  ComputeWithCreateInstanceSupport,
29
+ ComputeWithPrivilegedSupport,
28
30
  ComputeWithVolumeSupport,
29
31
  Compute,
30
32
  ):
@@ -16,8 +16,10 @@ from dstack._internal.core.backends.base.compute import (
16
16
  ComputeWithCreateInstanceSupport,
17
17
  ComputeWithMultinodeSupport,
18
18
  ComputeWithPlacementGroupSupport,
19
+ ComputeWithPrivilegedSupport,
19
20
  generate_unique_instance_name,
20
21
  get_user_data,
22
+ merge_tags,
21
23
  )
22
24
  from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
23
25
  from dstack._internal.core.backends.nebius import resources
@@ -79,6 +81,7 @@ SUPPORTED_PLATFORMS = [
79
81
  class NebiusCompute(
80
82
  ComputeWithAllOffersCached,
81
83
  ComputeWithCreateInstanceSupport,
84
+ ComputeWithPrivilegedSupport,
82
85
  ComputeWithMultinodeSupport,
83
86
  ComputeWithPlacementGroupSupport,
84
87
  Compute,
@@ -148,6 +151,18 @@ class NebiusCompute(
148
151
  if backend_data.cluster is not None:
149
152
  cluster_id = backend_data.cluster.id
150
153
 
154
+ labels = {
155
+ "owner": "dstack",
156
+ "dstack_project": instance_config.project_name.lower(),
157
+ "dstack_name": instance_config.instance_name,
158
+ "dstack_user": instance_config.user.lower(),
159
+ }
160
+ labels = merge_tags(
161
+ base_tags=labels,
162
+ backend_tags=self.config.tags,
163
+ resource_tags=instance_config.tags,
164
+ )
165
+ labels = resources.filter_invalid_labels(labels)
151
166
  gpus = instance_offer.instance.resources.gpus
152
167
  create_disk_op = resources.create_disk(
153
168
  sdk=self._sdk,
@@ -157,6 +172,7 @@ class NebiusCompute(
157
172
  image_family="ubuntu24.04-cuda12"
158
173
  if gpus and gpus[0].name == "B200"
159
174
  else "ubuntu22.04-cuda12",
175
+ labels=labels,
160
176
  )
161
177
  create_instance_op = None
162
178
  try:
@@ -182,6 +198,7 @@ class NebiusCompute(
182
198
  disk_id=create_disk_op.resource_id,
183
199
  subnet_id=self._get_subnet_id(instance_offer.region),
184
200
  preemptible=instance_offer.instance.resources.spot,
201
+ labels=labels,
185
202
  )
186
203
  _wait_for_instance(self._sdk, create_instance_op)
187
204
  except BaseException:
@@ -3,6 +3,7 @@ import json
3
3
  from nebius.aio.service_error import RequestError
4
4
 
5
5
  from dstack._internal.core.backends.base.configurator import (
6
+ TAGS_MAX_NUM,
6
7
  BackendRecord,
7
8
  Configurator,
8
9
  raise_invalid_credentials_error,
@@ -18,6 +19,7 @@ from dstack._internal.core.backends.nebius.models import (
18
19
  NebiusServiceAccountCreds,
19
20
  NebiusStoredConfig,
20
21
  )
22
+ from dstack._internal.core.errors import BackendError, ServerClientError
21
23
  from dstack._internal.core.models.backends.base import BackendType
22
24
 
23
25
 
@@ -53,6 +55,19 @@ class NebiusConfigurator(
53
55
  f" some of the valid options: {sorted(valid_fabrics)}"
54
56
  ),
55
57
  )
58
+ self._check_config_tags(config)
59
+
60
+ def _check_config_tags(self, config: NebiusBackendConfigWithCreds):
61
+ if not config.tags:
62
+ return
63
+ if len(config.tags) > TAGS_MAX_NUM:
64
+ raise ServerClientError(
65
+ f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed."
66
+ )
67
+ try:
68
+ resources.validate_labels(config.tags)
69
+ except BackendError as e:
70
+ raise ServerClientError(e.args[0])
56
71
 
57
72
  def create_backend(
58
73
  self, project_name: str, config: NebiusBackendConfigWithCreds
@@ -1,4 +1,6 @@
1
- from typing import Annotated, Literal, Optional, Union
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Annotated, Dict, Literal, Optional, Union
2
4
 
3
5
  from pydantic import Field, root_validator
4
6
 
@@ -27,16 +29,38 @@ class NebiusServiceAccountCreds(CoreModel):
27
29
  )
28
30
  ),
29
31
  ]
32
+ filename: Annotated[
33
+ Optional[str], Field(description="The path to the service account credentials file")
34
+ ] = None
30
35
 
31
36
 
32
37
  class NebiusServiceAccountFileCreds(CoreModel):
33
38
  type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
34
39
  "service_account"
35
40
  )
36
- service_account_id: Annotated[str, Field(description="Service account ID")]
37
- public_key_id: Annotated[str, Field(description="ID of the service account public key")]
41
+ service_account_id: Annotated[
42
+ Optional[str],
43
+ Field(
44
+ description=(
45
+ "Service account ID. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
46
+ )
47
+ ),
48
+ ] = None
49
+ public_key_id: Annotated[
50
+ Optional[str],
51
+ Field(
52
+ description=(
53
+ "ID of the service account public key. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
54
+ )
55
+ ),
56
+ ] = None
38
57
  private_key_file: Annotated[
39
- Optional[str], Field(description=("Path to the service account private key"))
58
+ Optional[str],
59
+ Field(
60
+ description=(
61
+ "Path to the service account private key. Set automatically if `filename` or `private_key_content` is specified. When configuring via the UI, it must be specified explicitly"
62
+ )
63
+ ),
40
64
  ] = None
41
65
  private_key_content: Annotated[
42
66
  Optional[str],
@@ -44,13 +68,35 @@ class NebiusServiceAccountFileCreds(CoreModel):
44
68
  description=(
45
69
  "Content of the service account private key. When configuring via"
46
70
  " `server/config.yml`, it's automatically filled from `private_key_file`."
47
- " When configuring via UI, it has to be specified explicitly."
71
+ " When configuring via UI, it has to be specified explicitly"
48
72
  )
49
73
  ),
50
74
  ] = None
75
+ filename: Annotated[
76
+ Optional[str], Field(description="The path to the service account credentials file")
77
+ ] = None
51
78
 
52
79
  @root_validator
53
80
  def fill_data(cls, values):
81
+ if filename := values.get("filename"):
82
+ try:
83
+ with open(Path(filename).expanduser()) as f:
84
+ data = json.load(f)
85
+ from nebius.base.service_account.credentials_file import (
86
+ ServiceAccountCredentials,
87
+ )
88
+
89
+ credentials = ServiceAccountCredentials.from_json(data)
90
+ subject = credentials.subject_credentials
91
+ values["service_account_id"] = subject.sub
92
+ values["public_key_id"] = subject.kid
93
+ values["private_key_content"] = subject.private_key
94
+ except OSError:
95
+ raise ValueError(f"No such file {filename}")
96
+ except Exception as e:
97
+ raise ValueError(f"Failed to parse credentials file {filename}: {e}")
98
+ return values
99
+
54
100
  return fill_data(
55
101
  values, filename_field="private_key_file", data_field="private_key_content"
56
102
  )
@@ -95,6 +141,12 @@ class NebiusBackendConfig(CoreModel):
95
141
  )
96
142
  ),
97
143
  ] = None
144
+ tags: Annotated[
145
+ Optional[Dict[str, str]],
146
+ Field(
147
+ description="The tags (labels) that will be assigned to resources created by `dstack`"
148
+ ),
149
+ ] = None
98
150
 
99
151
 
100
152
  class NebiusBackendConfigWithCreds(NebiusBackendConfig):
@@ -1,11 +1,12 @@
1
1
  import logging
2
+ import re
2
3
  import time
3
4
  from collections import defaultdict
4
5
  from collections.abc import Container as ContainerT
5
6
  from collections.abc import Generator, Iterable, Sequence
6
7
  from contextlib import contextmanager
7
8
  from tempfile import NamedTemporaryFile
8
- from typing import Optional
9
+ from typing import Dict, Optional
9
10
 
10
11
  from nebius.aio.authorization.options import options_to_metadata
11
12
  from nebius.aio.operation import Operation as SDKOperation
@@ -249,13 +250,14 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:
249
250
 
250
251
 
251
252
  def create_disk(
252
- sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str
253
+ sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str, labels: Dict[str, str]
253
254
  ) -> SDKOperation[Operation]:
254
255
  client = DiskServiceClient(sdk)
255
256
  request = CreateDiskRequest(
256
257
  metadata=ResourceMetadata(
257
258
  name=name,
258
259
  parent_id=project_id,
260
+ labels=labels,
259
261
  ),
260
262
  spec=DiskSpec(
261
263
  size_mebibytes=size_mib,
@@ -288,12 +290,14 @@ def create_instance(
288
290
  disk_id: str,
289
291
  subnet_id: str,
290
292
  preemptible: bool,
293
+ labels: Dict[str, str],
291
294
  ) -> SDKOperation[Operation]:
292
295
  client = InstanceServiceClient(sdk)
293
296
  request = CreateInstanceRequest(
294
297
  metadata=ResourceMetadata(
295
298
  name=name,
296
299
  parent_id=project_id,
300
+ labels=labels,
297
301
  ),
298
302
  spec=InstanceSpec(
299
303
  cloud_init_user_data=user_data,
@@ -367,3 +371,42 @@ def delete_cluster(sdk: SDK, cluster_id: str) -> None:
367
371
  metadata=REQUEST_MD,
368
372
  )
369
373
  )
374
+
375
+
376
+ def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
377
+ filtered_labels = {}
378
+ for k, v in labels.items():
379
+ if not _is_valid_label(k, v):
380
+ logger.warning("Skipping invalid label '%s: %s'", k, v)
381
+ continue
382
+ filtered_labels[k] = v
383
+ return filtered_labels
384
+
385
+
386
+ def validate_labels(labels: Dict[str, str]):
387
+ for k, v in labels.items():
388
+ if not _is_valid_label(k, v):
389
+ raise BackendError("Invalid resource labels")
390
+
391
+
392
+ def _is_valid_label(key: str, value: str) -> bool:
393
+ # TODO: [Nebius] current validation logic reuses GCP's approach.
394
+ # There is no public information on Nebius labels restrictions.
395
+ return is_valid_resource_name(key) and is_valid_label_value(value)
396
+
397
+
398
+ MAX_RESOURCE_NAME_LEN = 63
399
+ NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")
400
+ LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$")
401
+
402
+
403
+ def is_valid_resource_name(name: str) -> bool:
404
+ if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN:
405
+ return False
406
+ match = re.match(NAME_PATTERN, name)
407
+ return match is not None
408
+
409
+
410
+ def is_valid_label_value(value: str) -> bool:
411
+ match = re.match(LABEL_VALUE_PATTERN, value)
412
+ return match is not None
@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.compute import (
9
9
  ComputeWithAllOffersCached,
10
10
  ComputeWithCreateInstanceSupport,
11
11
  ComputeWithMultinodeSupport,
12
+ ComputeWithPrivilegedSupport,
12
13
  generate_unique_instance_name,
13
14
  get_user_data,
14
15
  )
@@ -50,6 +51,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("50GB"), max=Memory.pars
50
51
  class OCICompute(
51
52
  ComputeWithAllOffersCached,
52
53
  ComputeWithCreateInstanceSupport,
54
+ ComputeWithPrivilegedSupport,
53
55
  ComputeWithMultinodeSupport,
54
56
  Compute,
55
57
  ):
@@ -118,7 +120,11 @@ class OCICompute(
118
120
  availability_domain = instance_offer.availability_zones[0]
119
121
 
120
122
  listing, package = resources.get_marketplace_listing_and_package(
121
- cuda=len(instance_offer.instance.resources.gpus) > 0,
123
+ gpu_name=(
124
+ instance_offer.instance.resources.gpus[0].name
125
+ if len(instance_offer.instance.resources.gpus) > 0
126
+ else None
127
+ ),
122
128
  client=region.marketplace_client,
123
129
  )
124
130
  resources.accept_marketplace_listing_agreements(
@@ -23,7 +23,9 @@ import oci
23
23
  from oci.object_storage.models import CreatePreauthenticatedRequestDetails
24
24
 
25
25
  from dstack import version
26
+ from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules
26
27
  from dstack._internal.core.backends.oci.region import OCIRegionClient
28
+ from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
27
29
  from dstack._internal.core.errors import BackendError
28
30
  from dstack._internal.core.models.instances import InstanceOffer
29
31
  from dstack._internal.utils.common import batched
@@ -352,11 +354,14 @@ def terminate_instance_if_exists(client: oci.core.ComputeClient, instance_id: st
352
354
 
353
355
 
354
356
  def get_marketplace_listing_and_package(
355
- cuda: bool, client: oci.marketplace.MarketplaceClient
357
+ gpu_name: Optional[str], client: oci.marketplace.MarketplaceClient
356
358
  ) -> Tuple[oci.marketplace.models.Listing, oci.marketplace.models.ImageListingPackage]:
357
359
  listing_name = f"dstack-{version.base_image}"
358
- if cuda:
359
- listing_name = f"dstack-cuda-{version.base_image}"
360
+ if gpu_name is not None:
361
+ if not requires_nvidia_proprietary_kernel_modules(gpu_name):
362
+ listing_name = f"dstack-cuda-{version.base_image}"
363
+ else:
364
+ listing_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
360
365
 
361
366
  listing_summaries = list_marketplace_listings(listing_name, client)
362
367
  if len(listing_summaries) != 1:
@@ -8,6 +8,7 @@ from dstack._internal.core.backends.base.compute import (
8
8
  ComputeWithMultinodeSupport,
9
9
  ComputeWithPlacementGroupSupport,
10
10
  ComputeWithPrivateGatewaySupport,
11
+ ComputeWithPrivilegedSupport,
11
12
  ComputeWithReservationSupport,
12
13
  ComputeWithVolumeSupport,
13
14
  )
@@ -31,6 +32,7 @@ class {{ backend_name }}Compute(
31
32
  # TODO: Choose ComputeWith* classes to extend and implement
32
33
  # ComputeWithAllOffersCached,
33
34
  # ComputeWithCreateInstanceSupport,
35
+ # ComputeWithPrivilegedSupport,
34
36
  # ComputeWithMultinodeSupport,
35
37
  # ComputeWithReservationSupport,
36
38
  # ComputeWithPlacementGroupSupport,
@@ -6,6 +6,7 @@ import requests
6
6
  from dstack._internal.core.backends.base.backend import Compute
7
7
  from dstack._internal.core.backends.base.compute import (
8
8
  ComputeWithCreateInstanceSupport,
9
+ ComputeWithPrivilegedSupport,
9
10
  generate_unique_instance_name,
10
11
  get_shim_commands,
11
12
  )
@@ -32,6 +33,7 @@ MAX_INSTANCE_NAME_LEN = 60
32
33
 
33
34
  class TensorDockCompute(
34
35
  ComputeWithCreateInstanceSupport,
36
+ ComputeWithPrivilegedSupport,
35
37
  Compute,
36
38
  ):
37
39
  def __init__(self, config: TensorDockConfig):
@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.compute import (
9
9
  ComputeWithAllOffersCached,
10
10
  ComputeWithCreateInstanceSupport,
11
11
  ComputeWithMultinodeSupport,
12
+ ComputeWithPrivilegedSupport,
12
13
  generate_unique_instance_name,
13
14
  get_user_data,
14
15
  )
@@ -35,6 +36,7 @@ MAX_INSTANCE_NAME_LEN = 64
35
36
  class VultrCompute(
36
37
  ComputeWithAllOffersCached,
37
38
  ComputeWithCreateInstanceSupport,
39
+ ComputeWithPrivilegedSupport,
38
40
  ComputeWithMultinodeSupport,
39
41
  Compute,
40
42
  ):
@@ -53,6 +53,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
53
53
  }
54
54
  if all(js.exit_status is None for js in job_submissions):
55
55
  job_submissions_excludes["exit_status"] = True
56
+ if all(js.status_message == "" for js in job_submissions):
57
+ job_submissions_excludes["status_message"] = True
58
+ if all(js.error is None for js in job_submissions):
59
+ job_submissions_excludes["error"] = True
56
60
  if all(js.deployment_num == 0 for js in job_submissions):
57
61
  job_submissions_excludes["deployment_num"] = True
58
62
  if all(not js.probes for js in job_submissions):
@@ -71,6 +75,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
71
75
  }
72
76
  if latest_job_submission.exit_status is None:
73
77
  latest_job_submission_excludes["exit_status"] = True
78
+ if latest_job_submission.status_message == "":
79
+ latest_job_submission_excludes["status_message"] = True
80
+ if latest_job_submission.error is None:
81
+ latest_job_submission_excludes["error"] = True
74
82
  if latest_job_submission.deployment_num == 0:
75
83
  latest_job_submission_excludes["deployment_num"] = True
76
84
  if not latest_job_submission.probes:
@@ -4,3 +4,5 @@ DSTACK_SHIM_HTTP_PORT = 10998
4
4
  DSTACK_RUNNER_HTTP_PORT = 10999
5
5
  # ssh server (runs alongside the runner inside a container) listen port
6
6
  DSTACK_RUNNER_SSH_PORT = 10022
7
+ # legacy AWS, Azure, GCP, and OCI image for older GPUs
8
+ DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES = "0.10"