dstack 0.19.31__py3-none-any.whl → 0.19.33__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +1 -5
- dstack/_internal/core/backends/aws/compute.py +8 -5
- dstack/_internal/core/backends/azure/compute.py +9 -6
- dstack/_internal/core/backends/base/compute.py +40 -17
- dstack/_internal/core/backends/base/offers.py +5 -1
- dstack/_internal/core/backends/datacrunch/compute.py +9 -6
- dstack/_internal/core/backends/gcp/compute.py +137 -7
- dstack/_internal/core/backends/gcp/models.py +7 -0
- dstack/_internal/core/backends/gcp/resources.py +87 -5
- dstack/_internal/core/backends/hotaisle/compute.py +30 -0
- dstack/_internal/core/backends/kubernetes/compute.py +218 -77
- dstack/_internal/core/backends/kubernetes/models.py +4 -2
- dstack/_internal/core/backends/nebius/compute.py +24 -6
- dstack/_internal/core/backends/nebius/configurator.py +15 -0
- dstack/_internal/core/backends/nebius/models.py +57 -5
- dstack/_internal/core/backends/nebius/resources.py +45 -2
- dstack/_internal/core/backends/oci/compute.py +9 -6
- dstack/_internal/core/backends/runpod/compute.py +10 -6
- dstack/_internal/core/backends/vastai/compute.py +3 -1
- dstack/_internal/core/backends/vastai/configurator.py +0 -1
- dstack/_internal/core/compatibility/runs.py +8 -0
- dstack/_internal/core/models/fleets.py +1 -1
- dstack/_internal/core/models/profiles.py +12 -5
- dstack/_internal/core/models/runs.py +3 -2
- dstack/_internal/core/models/users.py +10 -0
- dstack/_internal/core/services/configs/__init__.py +1 -0
- dstack/_internal/server/background/tasks/process_fleets.py +75 -17
- dstack/_internal/server/background/tasks/process_instances.py +6 -4
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
- dstack/_internal/server/background/tasks/process_runs.py +27 -23
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +63 -20
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/models.py +3 -0
- dstack/_internal/server/routers/runs.py +5 -1
- dstack/_internal/server/routers/users.py +14 -2
- dstack/_internal/server/services/runs.py +9 -4
- dstack/_internal/server/services/users.py +35 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-97c7e184573ca23f9fe4.js} +12218 -7625
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-97c7e184573ca23f9fe4.js.map} +1 -1
- dstack/api/_public/__init__.py +9 -12
- dstack/api/_public/repos.py +0 -21
- dstack/api/_public/runs.py +64 -9
- dstack/api/server/_users.py +17 -2
- dstack/version.py +2 -2
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/METADATA +12 -14
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/RECORD +52 -51
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/WHEEL +0 -0
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/hotaisle/compute.py

@@ -42,6 +42,36 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
     },
+    "2x MI300X 26x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "2x MI300X 26x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
 
 
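The entries above extend Hot Aisle's static per-instance-type CPU spec table with the 2x/4x/8x MI300X shapes. As a rough, standalone sketch of how such a table can be consumed (the truncated spec dict and the describe_instance_type helper below are illustrative, not dstack's actual code):

# Illustrative sketch only: a spec table keyed by instance type name, mirroring the hunk above.
INSTANCE_TYPE_SPECS = {
    "8x MI300X 104x Xeon Platinum 8462Y+": {
        "cpu_model": "Xeon Platinum 8462Y+",
        "cpu_frequency": 2_800_000_000,  # Hz
        "cpu_manufacturer": "Intel",
    },
}


def describe_instance_type(instance_type: str) -> str:
    # Hypothetical helper: fall back gracefully when the instance type is unknown.
    spec = INSTANCE_TYPE_SPECS.get(instance_type)
    if spec is None:
        return f"{instance_type}: no CPU details available"
    ghz = spec["cpu_frequency"] / 1e9
    return f"{instance_type}: {spec['cpu_manufacturer']} {spec['cpu_model']} @ {ghz:.1f} GHz"


print(describe_instance_type("8x MI300X 104x Xeon Platinum 8462Y+"))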
dstack/_internal/core/backends/kubernetes/compute.py

@@ -2,9 +2,10 @@ import subprocess
 import tempfile
 import threading
 import time
+from enum import Enum
 from typing import List, Optional, Tuple
 
-from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
+from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
@@ -58,11 +59,42 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 JUMP_POD_SSH_PORT = 22
+DUMMY_REGION = "-"
+
+NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
+NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+
+AMD_GPU_RESOURCE = "amd.com/gpu"
+AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE
+# The oldest but still supported label format, the safest option, see the commit message:
+# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48
+# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs
+# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue:
+# https://github.com/ROCm/k8s-device-plugin/issues/112
+AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id."
+
+# Taints we know and tolerate when creating our objects, e.g., the jump pod.
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT)
 
 NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
 NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
 
-
+AMD_GPU_DEVICE_ID_TO_GPU_INFO = {
+    device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids
+}
+AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS}
+
+
+class Operator(str, Enum):
+    EXISTS = "Exists"
+    IN = "In"
+
+
+class TaintEffect(str, Enum):
+    NO_EXECUTE = "NoExecute"
+    NO_SCHEDULE = "NoSchedule"
+    PREFER_NO_SCHEDULE = "PreferNoSchedule"
 
 
 class KubernetesCompute(
@@ -92,21 +124,15 @@ class KubernetesCompute(
         nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
         for node in nodes:
             try:
-                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
                 name = get_value(node, ".metadata.name", str, required=True)
-                cpus = _parse_cpu(
-                    get_value(node, ".status.allocatable['cpu']", str, required=True)
-                )
                 cpu_arch = normalize_arch(
                     get_value(node, ".status.node_info.architecture", str)
                 ).to_cpu_architecture()
-
-
-                )
-
-
-                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
-                )
+                allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+                cpus = _parse_cpu(allocatable["cpu"])
+                memory_mib = _parse_memory(allocatable["memory"])
+                disk_size_mib = _parse_memory(allocatable["ephemeral-storage"])
+                gpus = _get_node_gpus(node)
             except (AttributeError, KeyError, ValueError) as e:
                 logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
                 continue
@@ -141,6 +167,7 @@ class KubernetesCompute(
         volumes: List[Volume],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name_for_job(run, job)
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
@@ -181,6 +208,7 @@ class KubernetesCompute(
         resources_requests: dict[str, str] = {}
         resources_limits: dict[str, str] = {}
         node_affinity: Optional[client.V1NodeAffinity] = None
+        tolerations: list[client.V1Toleration] = []
         volumes_: list[client.V1Volume] = []
         volume_mounts: list[client.V1VolumeMount] = []
 
@@ -196,52 +224,18 @@ class KubernetesCompute(
                     "GPU is requested but the offer has no GPUs:"
                     f" {gpu_spec=} {instance_offer=}",
                 )
-
-
-            # We cannot generate an expected GPU label value from the Gpu model instance
-            # as the actual values may have additional components (socket, memory type, etc.)
-            # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
-            # Moreover, a single Gpu may match multiple label values.
-            # As a workaround, we iterate and process all node labels once again (we already
-            # processed them in `get_offers_by_requirements()`).
-            node_list = call_api_method(
-                self.api.list_node,
-                client.V1NodeList,
+            gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu(
+                self.api, offer_gpus[0]
             )
-
-
-
-
-
-
-
-
-                if gpus[0] == offer_gpu:
-                    matching_gpu_label_values.add(gpu_label_value)
-            if not matching_gpu_label_values:
-                raise ComputeError(
-                    f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
+            logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min)
+            resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min)
+            # It should be NoSchedule, but we also add NoExecute toleration just in case.
+            for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
+                tolerations.append(
+                    client.V1Toleration(
+                        key=node_taint, operator=Operator.EXISTS, effect=effect
+                    )
                 )
-            logger.debug(
-                "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
-            )
-            # TODO: support other GPU vendors
-            resources_requests["nvidia.com/gpu"] = str(gpu_min)
-            resources_limits["nvidia.com/gpu"] = str(gpu_min)
-            node_affinity = client.V1NodeAffinity(
-                required_during_scheduling_ignored_during_execution=[
-                    client.V1NodeSelectorTerm(
-                        match_expressions=[
-                            client.V1NodeSelectorRequirement(
-                                key="nvidia.com/gpu.product",
-                                operator="In",
-                                values=list(matching_gpu_label_values),
-                            ),
-                        ],
-                    ),
-                ],
-            )
-
         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
             if (
@@ -303,7 +297,10 @@ class KubernetesCompute(
                         volume_mounts=volume_mounts,
                     )
                 ],
-                affinity=
+                affinity=client.V1Affinity(
+                    node_affinity=node_affinity,
+                ),
+                tolerations=tolerations,
                 volumes=volumes_,
             ),
         )
@@ -521,34 +518,144 @@ def _render_memory(memory: Memory) -> str:
     return f"{float(memory)}Gi"
 
 
-def 
+def _get_node_gpus(node: client.V1Node) -> list[Gpu]:
+    node_name = get_value(node, ".metadata.name", str, required=True)
+    allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+    labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+    for gpu_resource, gpu_getter in (
+        (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels),
+        (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels),
+    ):
+        _gpu_count = allocatable.get(gpu_resource)
+        if not _gpu_count:
+            continue
+        gpu_count = int(_gpu_count)
+        if gpu_count < 1:
+            continue
+        gpu = gpu_getter(labels)
+        if gpu is None:
+            logger.warning(
+                "Node %s: GPU resource found, but failed to detect its model: %s=%d",
+                node_name,
+                gpu_resource,
+                gpu_count,
+            )
+            return []
+        return [gpu] * gpu_count
+    logger.debug("Node %s: no GPU resource found", node_name)
+    return []
+
+
+def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
     # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
     # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
-
-
-
-
-
-    gpu_count = int(gpu_count)
-    gpu_name = None
-    for known_gpu_name in NVIDIA_GPU_NAMES:
-        if known_gpu_name.lower() in gpu_product.lower().split("-"):
-            gpu_name = known_gpu_name
+    gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
+    if gpu_product is None:
+        return None
+    for gpu_name in NVIDIA_GPU_NAMES:
+        if gpu_name.lower() in gpu_product.lower().split("-"):
             break
-
-    return
+    else:
+        return None
     gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
     gpu_memory = gpu_info.memory * 1024
     # A100 may come in two variants
     if "40GB" in gpu_product:
         gpu_memory = 40 * 1024
-
-
-
-
-
+    return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
+
+
+def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
+    # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs
+    gpus: set[tuple[str, int]] = set()
+    for label in labels:
+        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
+            continue
+        _, _, _device_id = label.rpartition(".")
+        device_id = int(_device_id, 16)
+        gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)
+        if gpu_info is None:
+            logger.warning("Unknown AMD GPU device id: %X", device_id)
+            continue
+        gpus.add((gpu_info.name, gpu_info.memory))
+    if not gpus:
+        return None
+    if len(gpus) == 1:
+        gpu_name, gpu_memory_gib = next(iter(gpus))
+        return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024)
+    logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus)
+    return None
+
+
+def _get_pod_spec_parameters_for_gpu(
+    api: client.CoreV1Api, gpu: Gpu
+) -> tuple[str, client.V1NodeAffinity, str]:
+    gpu_vendor = gpu.vendor
+    assert gpu_vendor is not None
+    if gpu_vendor == AcceleratorVendor.NVIDIA:
+        node_affinity = _get_nvidia_gpu_node_affinity(api, gpu)
+        return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT
+    if gpu_vendor == AcceleratorVendor.AMD:
+        node_affinity = _get_amd_gpu_node_affinity(gpu)
+        return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT
+    raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}")
+
+
+def _get_nvidia_gpu_node_affinity(api: client.CoreV1Api, gpu: Gpu) -> client.V1NodeAffinity:
+    matching_gpu_label_values: set[str] = set()
+    # We cannot generate an expected GPU label value from the Gpu model instance
+    # as the actual values may have additional components (socket, memory type, etc.)
+    # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+    # Moreover, a single Gpu may match multiple label values.
+    # As a workaround, we iterate and process all node labels once again (we already
+    # processed them in `get_offers_by_requirements()`).
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    for node in nodes:
+        labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+        if _get_nvidia_gpu_from_node_labels(labels) == gpu:
+            matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL])
+    if not matching_gpu_label_values:
+        raise ComputeError(f"NVIDIA GPU is requested but no matching GPU labels found: {gpu=}")
+    logger.debug("Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu.name)
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=NVIDIA_GPU_PRODUCT_LABEL,
+                            operator=Operator.IN,
+                            values=list(matching_gpu_label_values),
+                        ),
+                    ],
+                ),
+            ],
+        ),
+    )
+
+
+def _get_amd_gpu_node_affinity(gpu: Gpu) -> client.V1NodeAffinity:
+    device_ids = AMD_GPU_NAME_TO_DEVICE_IDS.get(gpu.name)
+    if device_ids is None:
+        raise ComputeError(f"AMD GPU is requested but no matching device ids found: {gpu=}")
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}",
+                            operator=Operator.EXISTS,
+                        ),
+                    ],
+                )
+                for device_id in device_ids
+            ],
+        ),
+    )
 
 
 def _continue_setup_jump_pod(
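The helpers added above detect node GPUs from device-plugin labels: NVIDIA models via the nvidia.com/gpu.product label, AMD models via the beta.amd.com/gpu.device-id.<hex> labels described in the comments. A standalone sketch of the AMD device-id parsing, with a tiny stand-in table instead of gpuhunt's KNOWN_AMD_GPUS (the 0x74b5 id comes from the comment in the hunk; everything else is illustrative):

from typing import Optional

AMD_GPU_DEVICE_ID_LABEL_PREFIX = "beta.amd.com/gpu.device-id."
# Stand-in table for this sketch; the real mapping is built from gpuhunt's KNOWN_AMD_GPUS.
DEVICE_ID_TO_NAME = {0x74B5: "MI300X"}


def detect_amd_gpu(labels: dict[str, str]) -> Optional[str]:
    # Collect every GPU model advertised by the ROCm device plugin's device-id labels.
    names = set()
    for label in labels:
        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
            continue
        device_id = int(label.rpartition(".")[2], 16)  # e.g. "74b5" -> 0x74b5
        name = DEVICE_ID_TO_NAME.get(device_id)
        if name is not None:
            names.add(name)
    # Mirror the behavior above: only report a model when exactly one is detected.
    return names.pop() if len(names) == 1 else None


print(detect_amd_gpu({"beta.amd.com/gpu.device-id.74b5": "4"}))  # MI300X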
@@ -647,6 +754,39 @@ def _create_jump_pod_service(
         namespace=namespace,
         name=pod_name,
     )
+
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    # False if we found at least one node without any "hard" taint, that is, if we don't need to
+    # specify the toleration.
+    toleration_required = True
+    # (key, effect) pairs.
+    tolerated_taints: set[tuple[str, str]] = set()
+    for node in nodes:
+        # True if the node has at least one NoExecute or NoSchedule taint.
+        has_hard_taint = False
+        taints = get_value(node, ".spec.taints", list[client.V1Taint]) or []
+        for taint in taints:
+            effect = get_value(taint, ".effect", str, required=True)
+            # A "soft" taint, ignore.
+            if effect == TaintEffect.PREFER_NO_SCHEDULE:
+                continue
+            has_hard_taint = True
+            key = get_value(taint, ".key", str, required=True)
+            if key in TOLERATED_NODE_TAINTS:
+                tolerated_taints.add((key, effect))
+        if not has_hard_taint:
+            toleration_required = False
+            break
+    tolerations: list[client.V1Toleration] = []
+    if toleration_required:
+        for key, effect in tolerated_taints:
+            tolerations.append(
+                client.V1Toleration(key=key, operator=Operator.EXISTS, effect=effect)
+            )
+        if not tolerations:
+            logger.warning("No appropriate node found, the jump pod may never be scheduled")
+
     commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
     pod = client.V1Pod(
         metadata=client.V1ObjectMeta(
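The jump pod logic above adds tolerations only when every node carries a "hard" (NoSchedule or NoExecute) taint, and then only for the known GPU taints. A standalone sketch of that decision, with plain (key, effect) tuples standing in for kubernetes.client.V1Taint objects:

TOLERATED_NODE_TAINTS = ("nvidia.com/gpu", "amd.com/gpu")


def pick_tolerations(nodes: list[list[tuple[str, str]]]) -> list[tuple[str, str]]:
    tolerated: set[tuple[str, str]] = set()
    for taints in nodes:
        hard = [(key, effect) for key, effect in taints if effect != "PreferNoSchedule"]
        if not hard:
            return []  # at least one schedulable node exists, so no toleration is needed
        tolerated.update((key, effect) for key, effect in hard if key in TOLERATED_NODE_TAINTS)
    return sorted(tolerated)


# A single GPU node with a hard taint and no untainted nodes -> tolerate the GPU taint.
print(pick_tolerations([[("nvidia.com/gpu", "NoSchedule")]]))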
@@ -667,7 +807,8 @@ def _create_jump_pod_service(
                         )
                     ],
                 )
-            ]
+            ],
+            tolerations=tolerations,
         ),
     )
     call_api_method(
dstack/_internal/core/backends/kubernetes/models.py

@@ -37,7 +37,7 @@ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
 
 
 class KubeconfigFileConfig(CoreModel):
-    filename: Annotated[str, Field(description="The path to the kubeconfig file")]
+    filename: Annotated[str, Field(description="The path to the kubeconfig file")] = ""
     data: Annotated[
         Optional[str],
         Field(
@@ -50,7 +50,9 @@ class KubeconfigFileConfig(CoreModel):
     ] = None
 
     @root_validator
-    def fill_data(cls, values):
+    def fill_data(cls, values: dict) -> dict:
+        if values.get("filename") == "" and values.get("data") is None:
+            raise ValueError("filename or data must be specified")
         return fill_data(values)
 
 
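With this change, filename defaults to an empty string and the validator rejects configurations where neither filename nor data is set. A minimal sketch of the same rule, assuming pydantic v1 semantics (the bare @root_validator used in the diff); KubeconfigSketch is an illustrative model, not dstack's CoreModel:

from typing import Optional

from pydantic import BaseModel, root_validator


class KubeconfigSketch(BaseModel):
    filename: str = ""
    data: Optional[str] = None

    @root_validator
    def check_source(cls, values: dict) -> dict:
        # Reject configs that provide neither a kubeconfig path nor inline contents.
        if values.get("filename") == "" and values.get("data") is None:
            raise ValueError("filename or data must be specified")
        return values


KubeconfigSketch(filename="~/.kube/config")  # ok
# KubeconfigSketch()  # raises: filename or data must be specified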
dstack/_internal/core/backends/nebius/compute.py

@@ -2,8 +2,9 @@ import json
 import random
 import shlex
 import time
+from collections.abc import Iterable
 from functools import cached_property
-from typing import 
+from typing import List, Optional
 
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -19,8 +20,13 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
+    merge_tags,
+)
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -124,10 +130,8 @@ class NebiusCompute(
             for offer in offers
         ]
 
-    def 
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def create_instance(
         self,
@@ -150,6 +154,18 @@ class NebiusCompute(
         if backend_data.cluster is not None:
             cluster_id = backend_data.cluster.id
 
+        labels = {
+            "owner": "dstack",
+            "dstack_project": instance_config.project_name.lower(),
+            "dstack_name": instance_config.instance_name,
+            "dstack_user": instance_config.user.lower(),
+        }
+        labels = merge_tags(
+            base_tags=labels,
+            backend_tags=self.config.tags,
+            resource_tags=instance_config.tags,
+        )
+        labels = resources.filter_invalid_labels(labels)
         gpus = instance_offer.instance.resources.gpus
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
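The Nebius backend now attaches dstack-owned labels to the disk and instance it creates, merged with backend-level and per-resource tags and filtered for validity. A standalone sketch of one plausible merge order; the actual precedence is defined by dstack's merge_tags, which this diff does not show:

from typing import Optional


def merge_tags_sketch(
    base_tags: dict[str, str],
    backend_tags: Optional[dict[str, str]],
    resource_tags: Optional[dict[str, str]],
) -> dict[str, str]:
    # Assumed precedence: resource tags override backend tags, which override base labels.
    merged = dict(base_tags)
    merged.update(backend_tags or {})
    merged.update(resource_tags or {})
    return merged


labels = merge_tags_sketch(
    base_tags={"owner": "dstack", "dstack_project": "main", "dstack_name": "my-instance"},
    backend_tags={"team": "ml"},
    resource_tags={"env": "dev"},
)
print(labels)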
@@ -159,6 +175,7 @@ class NebiusCompute(
             image_family="ubuntu24.04-cuda12"
             if gpus and gpus[0].name == "B200"
             else "ubuntu22.04-cuda12",
+            labels=labels,
         )
         create_instance_op = None
         try:
@@ -184,6 +201,7 @@ class NebiusCompute(
                 disk_id=create_disk_op.resource_id,
                 subnet_id=self._get_subnet_id(instance_offer.region),
                 preemptible=instance_offer.instance.resources.spot,
+                labels=labels,
             )
             _wait_for_instance(self._sdk, create_instance_op)
         except BaseException:
dstack/_internal/core/backends/nebius/configurator.py

@@ -3,6 +3,7 @@ import json
 from nebius.aio.service_error import RequestError
 
 from dstack._internal.core.backends.base.configurator import (
+    TAGS_MAX_NUM,
     BackendRecord,
     Configurator,
     raise_invalid_credentials_error,
@@ -18,6 +19,7 @@ from dstack._internal.core.backends.nebius.models import (
     NebiusServiceAccountCreds,
     NebiusStoredConfig,
 )
+from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.backends.base import BackendType
 
 
@@ -53,6 +55,19 @@ class NebiusConfigurator(
                     f" some of the valid options: {sorted(valid_fabrics)}"
                 ),
             )
+        self._check_config_tags(config)
+
+    def _check_config_tags(self, config: NebiusBackendConfigWithCreds):
+        if not config.tags:
+            return
+        if len(config.tags) > TAGS_MAX_NUM:
+            raise ServerClientError(
+                f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed."
+            )
+        try:
+            resources.validate_labels(config.tags)
+        except BackendError as e:
+            raise ServerClientError(e.args[0])
 
     def create_backend(
         self, project_name: str, config: NebiusBackendConfigWithCreds
dstack/_internal/core/backends/nebius/models.py

@@ -1,4 +1,6 @@
-
+import json
+from pathlib import Path
+from typing import Annotated, Dict, Literal, Optional, Union
 
 from pydantic import Field, root_validator
 
@@ -27,16 +29,38 @@ class NebiusServiceAccountCreds(CoreModel):
             )
         ),
     ]
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None
 
 
 class NebiusServiceAccountFileCreds(CoreModel):
     type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
         "service_account"
    )
-    service_account_id: Annotated[
-
+    service_account_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "Service account ID. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
+    public_key_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "ID of the service account public key. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
     private_key_file: Annotated[
-        Optional[str],
+        Optional[str],
+        Field(
+            description=(
+                "Path to the service account private key. Set automatically if `filename` or `private_key_content` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
     ] = None
     private_key_content: Annotated[
         Optional[str],
@@ -44,13 +68,35 @@ class NebiusServiceAccountFileCreds(CoreModel):
             description=(
                 "Content of the service account private key. When configuring via"
                 " `server/config.yml`, it's automatically filled from `private_key_file`."
-                " When configuring via UI, it has to be specified explicitly
+                " When configuring via UI, it has to be specified explicitly"
             )
         ),
     ] = None
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None
 
     @root_validator
     def fill_data(cls, values):
+        if filename := values.get("filename"):
+            try:
+                with open(Path(filename).expanduser()) as f:
+                    data = json.load(f)
+                from nebius.base.service_account.credentials_file import (
+                    ServiceAccountCredentials,
+                )
+
+                credentials = ServiceAccountCredentials.from_json(data)
+                subject = credentials.subject_credentials
+                values["service_account_id"] = subject.sub
+                values["public_key_id"] = subject.kid
+                values["private_key_content"] = subject.private_key
+            except OSError:
+                raise ValueError(f"No such file {filename}")
+            except Exception as e:
+                raise ValueError(f"Failed to parse credentials file {filename}: {e}")
+            return values
+
         return fill_data(
             values, filename_field="private_key_file", data_field="private_key_content"
         )
@@ -95,6 +141,12 @@ class NebiusBackendConfig(CoreModel):
             )
         ),
     ] = None
+    tags: Annotated[
+        Optional[Dict[str, str]],
+        Field(
+            description="The tags (labels) that will be assigned to resources created by `dstack`"
+        ),
+    ] = None
 
 
 class NebiusBackendConfigWithCreds(NebiusBackendConfig):