dstack 0.19.32__py3-none-any.whl → 0.19.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack has been flagged as possibly problematic by the registry diff service.
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +1 -5
- dstack/_internal/core/backends/aws/compute.py +8 -5
- dstack/_internal/core/backends/azure/compute.py +9 -6
- dstack/_internal/core/backends/base/compute.py +40 -17
- dstack/_internal/core/backends/base/offers.py +5 -1
- dstack/_internal/core/backends/datacrunch/compute.py +9 -6
- dstack/_internal/core/backends/gcp/compute.py +137 -7
- dstack/_internal/core/backends/gcp/models.py +7 -0
- dstack/_internal/core/backends/gcp/resources.py +87 -5
- dstack/_internal/core/backends/hotaisle/compute.py +11 -1
- dstack/_internal/core/backends/kubernetes/compute.py +161 -83
- dstack/_internal/core/backends/kubernetes/models.py +4 -2
- dstack/_internal/core/backends/nebius/compute.py +9 -6
- dstack/_internal/core/backends/oci/compute.py +9 -6
- dstack/_internal/core/backends/runpod/compute.py +10 -6
- dstack/_internal/core/backends/vastai/compute.py +3 -1
- dstack/_internal/core/backends/vastai/configurator.py +0 -1
- dstack/_internal/core/models/fleets.py +1 -1
- dstack/_internal/core/models/profiles.py +1 -1
- dstack/_internal/core/models/runs.py +3 -2
- dstack/_internal/core/models/users.py +10 -0
- dstack/_internal/core/services/configs/__init__.py +1 -0
- dstack/_internal/server/background/tasks/process_instances.py +5 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/models.py +3 -0
- dstack/_internal/server/routers/runs.py +5 -1
- dstack/_internal/server/routers/users.py +14 -2
- dstack/_internal/server/services/runs.py +9 -4
- dstack/_internal/server/services/users.py +35 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-97c7e184573ca23f9fe4.js} +12218 -7625
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-97c7e184573ca23f9fe4.js.map} +1 -1
- dstack/api/_public/__init__.py +9 -12
- dstack/api/_public/repos.py +0 -21
- dstack/api/_public/runs.py +64 -9
- dstack/api/server/_users.py +17 -2
- dstack/version.py +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/METADATA +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/RECORD +45 -44
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/WHEEL +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/hotaisle/compute.py

@@ -52,7 +52,7 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
-    "4x MI300X 52x Xeon Platinum
+    "4x MI300X 52x Xeon Platinum 8470": {
         "cpu_model": "Xeon Platinum 8470",
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
@@ -62,6 +62,16 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
+    "8x MI300X 104x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
dstack/_internal/core/backends/kubernetes/compute.py

@@ -5,7 +5,7 @@ import time
 from enum import Enum
 from typing import List, Optional, Tuple

-from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
+from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client

 from dstack._internal.core.backends.base.compute import (
@@ -59,19 +59,31 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)

 JUMP_POD_SSH_PORT = 22
-
-NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
-NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+DUMMY_REGION = "-"

 NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
-NVIDIA_GPU_COUNT_LABEL = f"{NVIDIA_GPU_RESOURCE}.count"
-NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
 NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+
+AMD_GPU_RESOURCE = "amd.com/gpu"
+AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE
+# The oldest but still supported label format, the safest option, see the commit message:
+# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48
+# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs
+# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue:
+# https://github.com/ROCm/k8s-device-plugin/issues/112
+AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id."

 # Taints we know and tolerate when creating our objects, e.g., the jump pod.
-TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT,)
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT)

-
+NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
+NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+
+AMD_GPU_DEVICE_ID_TO_GPU_INFO = {
+    device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids
+}
+AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS}


 class Operator(str, Enum):
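The device-id labels described in the comments above can be resolved to a GPU model by stripping the label prefix and looking the hex id up in a device-id table. A minimal, self-contained sketch of that lookup (the one-entry table below is illustrative; in the diff the real table, AMD_GPU_DEVICE_ID_TO_GPU_INFO, is built from gpuhunt's KNOWN_AMD_GPUS):

# Illustrative sketch only: a hand-written device-id table standing in for the
# AMD_GPU_DEVICE_ID_TO_GPU_INFO mapping that the diff builds from gpuhunt.
from typing import Optional

AMD_GPU_DEVICE_ID_LABEL_PREFIX = "beta.amd.com/gpu.device-id."
DEVICE_ID_TO_NAME = {0x74B5: "MI300X"}  # 0x74b5 is the MI300X VF id mentioned in the comment above


def amd_gpu_from_labels(labels: dict[str, str]) -> Optional[str]:
    """Return the GPU model name for the first recognized device-id label, if any."""
    for key in labels:
        if not key.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
            continue
        device_id = int(key.rpartition(".")[-1], 16)  # hex suffix of the label key
        name = DEVICE_ID_TO_NAME.get(device_id)
        if name is not None:
            return name
    return None


# A node labeled beta.amd.com/gpu.device-id.74b5=4 resolves to "MI300X";
# per the comment above, the label value ("4") is the GPU count in the plugin's scheme.
print(amd_gpu_from_labels({"beta.amd.com/gpu.device-id.74b5": "4"}))

In the diff itself, the GPU count is taken from the node's allocatable amd.com/gpu resource rather than from the label value, as shown in _get_node_gpus further below.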
@@ -112,21 +124,15 @@ class KubernetesCompute(
         nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
         for node in nodes:
             try:
-                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
                 name = get_value(node, ".metadata.name", str, required=True)
-                cpus = _parse_cpu(
-                    get_value(node, ".status.allocatable['cpu']", str, required=True)
-                )
                 cpu_arch = normalize_arch(
                     get_value(node, ".status.node_info.architecture", str)
                 ).to_cpu_architecture()
-
-
-                )
-
-
-                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
-                )
+                allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+                cpus = _parse_cpu(allocatable["cpu"])
+                memory_mib = _parse_memory(allocatable["memory"])
+                disk_size_mib = _parse_memory(allocatable["ephemeral-storage"])
+                gpus = _get_node_gpus(node)
             except (AttributeError, KeyError, ValueError) as e:
                 logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
                 continue
@@ -161,6 +167,7 @@ class KubernetesCompute(
         volumes: List[Volume],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name_for_job(run, job)
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
@@ -217,59 +224,18 @@ class KubernetesCompute(
                     "GPU is requested but the offer has no GPUs:"
                     f" {gpu_spec=} {instance_offer=}",
                 )
-
-
-            # We cannot generate an expected GPU label value from the Gpu model instance
-            # as the actual values may have additional components (socket, memory type, etc.)
-            # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
-            # Moreover, a single Gpu may match multiple label values.
-            # As a workaround, we iterate and process all node labels once again (we already
-            # processed them in `get_offers_by_requirements()`).
-            node_list = call_api_method(
-                self.api.list_node,
-                client.V1NodeList,
-            )
-            nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
-            for node in nodes:
-                labels = get_value(node, ".metadata.labels", dict[str, str])
-                if not labels:
-                    continue
-                gpus, gpu_label_value = _get_gpus_from_node_labels(labels)
-                if not gpus or gpu_label_value is None:
-                    continue
-                if gpus[0] == offer_gpu:
-                    matching_gpu_label_values.add(gpu_label_value)
-            if not matching_gpu_label_values:
-                raise ComputeError(
-                    f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
-                )
-            logger.debug(
-                "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
-            )
-            # TODO: support other GPU vendors
-            resources_requests[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            resources_limits[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            node_affinity = client.V1NodeAffinity(
-                required_during_scheduling_ignored_during_execution=[
-                    client.V1NodeSelectorTerm(
-                        match_expressions=[
-                            client.V1NodeSelectorRequirement(
-                                key=NVIDIA_GPU_PRODUCT_LABEL,
-                                operator=Operator.IN,
-                                values=list(matching_gpu_label_values),
-                            ),
-                        ],
-                    ),
-                ],
+            gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu(
+                self.api, offer_gpus[0]
             )
+            logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min)
+            resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min)
             # It should be NoSchedule, but we also add NoExecute toleration just in case.
             for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
                 tolerations.append(
                     client.V1Toleration(
-                        key=
+                        key=node_taint, operator=Operator.EXISTS, effect=effect
                     )
                 )
-
         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
         if (
@@ -331,7 +297,9 @@ class KubernetesCompute(
                         volume_mounts=volume_mounts,
                     )
                 ],
-                affinity=
+                affinity=client.V1Affinity(
+                    node_affinity=node_affinity,
+                ),
                 tolerations=tolerations,
                 volumes=volumes_,
             ),
@@ -550,34 +518,144 @@ def _render_memory(memory: Memory) -> str:
     return f"{float(memory)}Gi"


-def
+def _get_node_gpus(node: client.V1Node) -> list[Gpu]:
+    node_name = get_value(node, ".metadata.name", str, required=True)
+    allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+    labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+    for gpu_resource, gpu_getter in (
+        (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels),
+        (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels),
+    ):
+        _gpu_count = allocatable.get(gpu_resource)
+        if not _gpu_count:
+            continue
+        gpu_count = int(_gpu_count)
+        if gpu_count < 1:
+            continue
+        gpu = gpu_getter(labels)
+        if gpu is None:
+            logger.warning(
+                "Node %s: GPU resource found, but failed to detect its model: %s=%d",
+                node_name,
+                gpu_resource,
+                gpu_count,
+            )
+            return []
+        return [gpu] * gpu_count
+    logger.debug("Node %s: no GPU resource found", node_name)
+    return []
+
+
+def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
     # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
     # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
-    # TODO: support other GPU vendors
-    gpu_count = labels.get(NVIDIA_GPU_COUNT_LABEL)
     gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
-    if
-    return
-
-
-    for known_gpu_name in NVIDIA_GPU_NAMES:
-        if known_gpu_name.lower() in gpu_product.lower().split("-"):
-            gpu_name = known_gpu_name
+    if gpu_product is None:
+        return None
+    for gpu_name in NVIDIA_GPU_NAMES:
+        if gpu_name.lower() in gpu_product.lower().split("-"):
             break
-
-    return
+    else:
+        return None
     gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
     gpu_memory = gpu_info.memory * 1024
     # A100 may come in two variants
     if "40GB" in gpu_product:
         gpu_memory = 40 * 1024
-
-
-
-
-
+    return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
+
+
+def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
+    # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs
+    gpus: set[tuple[str, int]] = set()
+    for label in labels:
+        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
+            continue
+        _, _, _device_id = label.rpartition(".")
+        device_id = int(_device_id, 16)
+        gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)
+        if gpu_info is None:
+            logger.warning("Unknown AMD GPU device id: %X", device_id)
+            continue
+        gpus.add((gpu_info.name, gpu_info.memory))
+    if not gpus:
+        return None
+    if len(gpus) == 1:
+        gpu_name, gpu_memory_gib = next(iter(gpus))
+        return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024)
+    logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus)
+    return None
+
+
+def _get_pod_spec_parameters_for_gpu(
+    api: client.CoreV1Api, gpu: Gpu
+) -> tuple[str, client.V1NodeAffinity, str]:
+    gpu_vendor = gpu.vendor
+    assert gpu_vendor is not None
+    if gpu_vendor == AcceleratorVendor.NVIDIA:
+        node_affinity = _get_nvidia_gpu_node_affinity(api, gpu)
+        return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT
+    if gpu_vendor == AcceleratorVendor.AMD:
+        node_affinity = _get_amd_gpu_node_affinity(gpu)
+        return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT
+    raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}")
+
+
+def _get_nvidia_gpu_node_affinity(api: client.CoreV1Api, gpu: Gpu) -> client.V1NodeAffinity:
+    matching_gpu_label_values: set[str] = set()
+    # We cannot generate an expected GPU label value from the Gpu model instance
+    # as the actual values may have additional components (socket, memory type, etc.)
+    # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+    # Moreover, a single Gpu may match multiple label values.
+    # As a workaround, we iterate and process all node labels once again (we already
+    # processed them in `get_offers_by_requirements()`).
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    for node in nodes:
+        labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+        if _get_nvidia_gpu_from_node_labels(labels) == gpu:
+            matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL])
+    if not matching_gpu_label_values:
+        raise ComputeError(f"NVIDIA GPU is requested but no matching GPU labels found: {gpu=}")
+    logger.debug("Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu.name)
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=NVIDIA_GPU_PRODUCT_LABEL,
+                            operator=Operator.IN,
+                            values=list(matching_gpu_label_values),
+                        ),
+                    ],
+                ),
+            ],
+        ),
+    )
+
+
+def _get_amd_gpu_node_affinity(gpu: Gpu) -> client.V1NodeAffinity:
+    device_ids = AMD_GPU_NAME_TO_DEVICE_IDS.get(gpu.name)
+    if device_ids is None:
+        raise ComputeError(f"AMD GPU is requested but no matching device ids found: {gpu=}")
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}",
+                            operator=Operator.EXISTS,
+                        ),
+                    ],
+                )
+                for device_id in device_ids
+            ],
+        ),
+    )


 def _continue_setup_jump_pod(
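For context on the two affinity helpers above: Kubernetes ORs the nodeSelectorTerms of a required node affinity and ANDs the matchExpressions within a single term, so emitting one term per device id with an Exists operator (as _get_amd_gpu_node_affinity does) selects any node that carries at least one of those device-id labels. A rough sketch of the serialized structure for a hypothetical GPU that maps to two device ids (the ids below are illustrative, not values taken from gpuhunt):

# Approximate JSON shape of the V1NodeAffinity that _get_amd_gpu_node_affinity would
# produce for a hypothetical GPU with device ids 0x1234 and 0x74b5 (illustrative values).
node_affinity = {
    "requiredDuringSchedulingIgnoredDuringExecution": {
        "nodeSelectorTerms": [  # terms are ORed by the scheduler
            {
                "matchExpressions": [  # expressions within one term are ANDed
                    {"key": "beta.amd.com/gpu.device-id.1234", "operator": "Exists"},
                ],
            },
            {
                "matchExpressions": [
                    {"key": "beta.amd.com/gpu.device-id.74b5", "operator": "Exists"},
                ],
            },
        ],
    },
}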
dstack/_internal/core/backends/kubernetes/models.py

@@ -37,7 +37,7 @@ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):


 class KubeconfigFileConfig(CoreModel):
-    filename: Annotated[str, Field(description="The path to the kubeconfig file")]
+    filename: Annotated[str, Field(description="The path to the kubeconfig file")] = ""
     data: Annotated[
         Optional[str],
         Field(
@@ -50,7 +50,9 @@ class KubeconfigFileConfig(CoreModel):
     ] = None

     @root_validator
-    def fill_data(cls, values):
+    def fill_data(cls, values: dict) -> dict:
+        if values.get("filename") == "" and values.get("data") is None:
+            raise ValueError("filename or data must be specified")
         return fill_data(values)

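With filename now defaulting to an empty string, the root validator is what enforces that at least one of filename or data is provided. A minimal standalone sketch of the same check using pydantic v1's root_validator (the class and field names below are illustrative, not the actual dstack models, and dstack's fill_data() helper is not reproduced):

# Standalone illustration using pydantic v1; not the dstack KubeconfigFileConfig itself.
from typing import Optional

from pydantic import BaseModel, ValidationError, root_validator


class FileOrInlineConfig(BaseModel):
    filename: str = ""
    data: Optional[str] = None

    @root_validator
    def check_one_of(cls, values: dict) -> dict:
        # Same rule as the diff: reject configs where neither field is set.
        if values.get("filename") == "" and values.get("data") is None:
            raise ValueError("filename or data must be specified")
        return values


FileOrInlineConfig(filename="~/.kube/config")  # ok
FileOrInlineConfig(data="apiVersion: v1\n...")  # ok
try:
    FileOrInlineConfig()  # neither field set
except ValidationError as e:
    print(e)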
dstack/_internal/core/backends/nebius/compute.py

@@ -2,8 +2,9 @@ import json
 import random
 import shlex
 import time
+from collections.abc import Iterable
 from functools import cached_property
-from typing import
+from typing import List, Optional

 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -21,7 +22,11 @@ from dstack._internal.core.backends.base.compute import (
     get_user_data,
     merge_tags,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -125,10 +130,8 @@ class NebiusCompute(
             for offer in offers
         ]

-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]

     def create_instance(
         self,
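get_offers_modifiers now returns an iterable of modifiers instead of a single callable; judging by the removed return annotation, each OfferModifier maps an offer to a modified offer or to None to drop it. A rough sketch of how a caller might fold such a list over offers (the actual application lives in dstack/_internal/core/backends/base/offers.py, which is not shown in this diff, so apply_modifiers below is a hypothetical helper):

# Sketch only: apply_modifiers is illustrative, not the function dstack uses.
from typing import Callable, Iterable, List, Optional, TypeVar

Offer = TypeVar("Offer")
# Mirrors the removed annotation: an offer in, a (possibly modified) offer or None out.
OfferModifier = Callable[[Offer], Optional[Offer]]


def apply_modifiers(offers: Iterable[Offer], modifiers: Iterable[OfferModifier]) -> List[Offer]:
    result: List[Offer] = []
    for offer in offers:
        modified: Optional[Offer] = offer
        for modify in modifiers:
            modified = modify(modified)
            if modified is None:
                break  # a modifier filtered this offer out
        if modified is not None:
            result.append(modified)
    return result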
dstack/_internal/core/backends/oci/compute.py

@@ -1,6 +1,7 @@
+from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property
-from typing import
+from typing import List, Optional

 import oci

@@ -13,7 +14,11 @@ from dstack._internal.core.backends.base.compute import (
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.models import OCIConfig
 from dstack._internal.core.backends.oci.region import make_region_clients_map
@@ -96,10 +101,8 @@ class OCICompute(

         return offers_with_availability

-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]

     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
dstack/_internal/core/backends/runpod/compute.py

@@ -1,7 +1,8 @@
 import json
 import uuid
+from collections.abc import Iterable
 from datetime import timedelta
-from typing import
+from typing import List, Optional

 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
@@ -12,7 +13,11 @@ from dstack._internal.core.backends.base.compute import (
     get_docker_commands,
     get_job_instance_name,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -72,10 +77,8 @@ class RunpodCompute(
         ]
         return offers

-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]

     def run_job(
         self,
@@ -86,6 +89,7 @@ class RunpodCompute(
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        assert run.run_spec.ssh_key_pub is not None
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
             instance_name=get_job_instance_name(run, job),
dstack/_internal/core/backends/vastai/compute.py

@@ -47,7 +47,7 @@ class VastAICompute(
             "reliability2": {"gte": 0.9},
            "inet_down": {"gt": 128},
            "verified": {"eq": True},
-            "cuda_max_good": {"gte": 12.
+            "cuda_max_good": {"gte": 12.8},
            "compute_cap": {"gte": 600},
         }
     )
@@ -58,6 +58,7 @@ class VastAICompute(
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,
+            locations=self.config.regions or None,
             requirements=requirements,
             # TODO(egor-s): spots currently not supported
             extra_filter=lambda offer: not offer.instance.resources.spot,
@@ -85,6 +86,7 @@ class VastAICompute(
         instance_name = generate_unique_instance_name_for_job(
             run, job, max_length=MAX_INSTANCE_NAME_LEN
         )
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
dstack/_internal/core/models/fleets.py

@@ -244,7 +244,7 @@ class InstanceGroupParams(CoreModel):
         Field(
             description=(
                 "The existing reservation to use for instance provisioning."
-                " Supports AWS Capacity Reservations
+                " Supports AWS Capacity Reservations, AWS Capacity Blocks, and GCP reservations"
             )
         ),
     ] = None

dstack/_internal/core/models/profiles.py

@@ -283,7 +283,7 @@ class ProfileParams(CoreModel):
         Field(
             description=(
                 "The existing reservation to use for instance provisioning."
-                " Supports AWS Capacity Reservations
+                " Supports AWS Capacity Reservations, AWS Capacity Blocks, and GCP reservations"
             )
         ),
     ] = None
dstack/_internal/core/models/runs.py

@@ -462,11 +462,12 @@ class RunSpec(generate_dual_core_model(RunSpecConfig)):
     configuration: Annotated[AnyRunConfiguration, Field(discriminator="type")]
     profile: Annotated[Optional[Profile], Field(description="The profile parameters")] = None
     ssh_key_pub: Annotated[
-        str,
+        Optional[str],
         Field(
             description="The contents of the SSH public key that will be used to connect to the run."
+            " Can be empty only before the run is submitted."
         ),
-    ]
+    ] = None
     # merged_profile stores profile parameters merged from profile and configuration.
     # Read profile parameters from merged_profile instead of profile directly.
     # TODO: make merged_profile a computed field after migrating to pydanticV2
dstack/_internal/core/models/users.py

@@ -30,6 +30,7 @@ class User(CoreModel):
     email: Optional[str]
     active: bool
     permissions: UserPermissions
+    ssh_public_key: Optional[str] = None


 class UserTokenCreds(CoreModel):
@@ -38,3 +39,12 @@ class UserTokenCreds(CoreModel):

 class UserWithCreds(User):
     creds: UserTokenCreds
+    ssh_private_key: Optional[str] = None
+
+
+class UserHookConfig(CoreModel):
+    """
+    This class can be inherited to extend the user creation configuration passed to the hooks.
+    """
+
+    pass
dstack/_internal/server/background/tasks/process_instances.py

@@ -558,10 +558,14 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
     if (
         _is_fleet_master_instance(instance)
         and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
+        and isinstance(compute, ComputeWithPlacementGroupSupport)
+        and (
+            compute.are_placement_groups_compatible_with_reservations(instance_offer.backend)
+            or instance_configuration.reservation is None
+        )
         and instance.fleet
         and _is_cloud_cluster(instance.fleet)
     ):
-        assert isinstance(compute, ComputeWithPlacementGroupSupport)
         placement_group_model = _find_suitable_placement_group(
             placement_groups=placement_group_models,
             instance_offer=instance_offer,
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -243,6 +243,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             job_submission.age,
         )
     ssh_user = job_provisioning_data.username
+    assert run.run_spec.ssh_key_pub is not None
     user_ssh_key = run.run_spec.ssh_key_pub.strip()
     public_keys = [project.ssh_public_key.strip(), user_ssh_key]
     if job_provisioning_data.backend == BackendType.LOCAL:
dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py

@@ -0,0 +1,34 @@
+"""user.ssh_key
+
+Revision ID: ff1d94f65b08
+Revises: 2498ab323443
+Create Date: 2025-10-09 20:31:31.166786
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "ff1d94f65b08"
+down_revision = "2498ab323443"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("users", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("ssh_private_key", sa.Text(), nullable=True))
+        batch_op.add_column(sa.Column("ssh_public_key", sa.Text(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("users", schema=None) as batch_op:
+        batch_op.drop_column("ssh_public_key")
+        batch_op.drop_column("ssh_private_key")
+
+    # ### end Alembic commands ###
dstack/_internal/server/models.py

@@ -190,6 +190,9 @@ class UserModel(BaseModel):
     # deactivated users cannot access API
     active: Mapped[bool] = mapped_column(Boolean, default=True)

+    ssh_private_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
+    ssh_public_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
+
     email: Mapped[Optional[str]] = mapped_column(String(200), nullable=True)

     projects_quota: Mapped[int] = mapped_column(
|