dstack 0.19.32__py3-none-any.whl → 0.19.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic.
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +1 -5
- dstack/_internal/core/backends/aws/compute.py +8 -5
- dstack/_internal/core/backends/azure/compute.py +9 -6
- dstack/_internal/core/backends/base/compute.py +40 -17
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/datacrunch/compute.py +9 -6
- dstack/_internal/core/backends/gcp/compute.py +151 -6
- dstack/_internal/core/backends/gcp/models.py +10 -0
- dstack/_internal/core/backends/gcp/resources.py +87 -5
- dstack/_internal/core/backends/hotaisle/compute.py +11 -1
- dstack/_internal/core/backends/kubernetes/compute.py +161 -83
- dstack/_internal/core/backends/kubernetes/models.py +4 -2
- dstack/_internal/core/backends/nebius/compute.py +9 -6
- dstack/_internal/core/backends/oci/compute.py +9 -6
- dstack/_internal/core/backends/runpod/compute.py +14 -7
- dstack/_internal/core/backends/vastai/compute.py +3 -1
- dstack/_internal/core/backends/vastai/configurator.py +0 -1
- dstack/_internal/core/compatibility/runs.py +25 -4
- dstack/_internal/core/models/fleets.py +1 -1
- dstack/_internal/core/models/instances.py +2 -1
- dstack/_internal/core/models/profiles.py +1 -1
- dstack/_internal/core/models/runs.py +4 -2
- dstack/_internal/core/models/users.py +10 -0
- dstack/_internal/core/services/configs/__init__.py +1 -0
- dstack/_internal/core/services/ssh/key_manager.py +56 -0
- dstack/_internal/server/background/tasks/process_instances.py +5 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/models.py +6 -0
- dstack/_internal/server/routers/metrics.py +6 -2
- dstack/_internal/server/routers/runs.py +5 -1
- dstack/_internal/server/routers/users.py +21 -2
- dstack/_internal/server/services/jobs/__init__.py +18 -9
- dstack/_internal/server/services/offers.py +1 -0
- dstack/_internal/server/services/runs.py +13 -4
- dstack/_internal/server/services/users.py +35 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-e79754c136f1d8e4e7e6.js} +12632 -8039
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-e79754c136f1d8e4e7e6.js.map} +1 -1
- dstack/_internal/server/testing/common.py +4 -0
- dstack/api/_public/__init__.py +8 -11
- dstack/api/_public/repos.py +0 -21
- dstack/api/_public/runs.py +61 -9
- dstack/api/server/__init__.py +4 -0
- dstack/api/server/_users.py +17 -2
- dstack/version.py +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/METADATA +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/RECORD +53 -51
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/WHEEL +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/gcp/resources.py

@@ -26,9 +26,35 @@ supported_accelerators = [
     {"accelerator_name": "nvidia-tesla-t4", "gpu_name": "T4", "memory_mb": 1024 * 16},
     {"accelerator_name": "nvidia-tesla-v100", "gpu_name": "V100", "memory_mb": 1024 * 16},
     {"accelerator_name": "nvidia-tesla-p100", "gpu_name": "P100", "memory_mb": 1024 * 16},
+    {"accelerator_name": "nvidia-rtx-pro-6000", "gpu_name": "RTXPRO6000", "memory_mb": 1024 * 96},
 ]
 
 
+def find_accelerator_name(gpu_name: str, memory_mib: int) -> Optional[str]:
+    for acc in supported_accelerators:
+        if gpu_name == acc["gpu_name"] and memory_mib == acc["memory_mb"]:
+            return acc["accelerator_name"]
+    return None
+
+
+def sanitize_filter_value(value: str) -> str:
+    """
+    Escape characters that could break the Compute Engine API filter string.
+    """
+    return value.replace("\\", "\\\\").replace('"', '\\"')
+
+
+def get_resource_project(resource_url: str) -> str:
+    """
+    Extract the project ID from a URL like
+    https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name
+    """
+    matches = re.findall(r"/projects/(?P<project_id>[a-z0-9-]+)/", resource_url)
+    if not matches:
+        raise BackendError(f"Invalid resource URL {resource_url}")
+    return matches[0]
+
+
 def get_availability_zones(
     regions_client: compute_v1.RegionsClient,
     project_id: str,
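For orientation, a minimal sketch of how these new helpers behave, assuming they are exported from dstack/_internal/core/backends/gcp/resources.py as the file list above suggests (the values are illustrative, not taken from the package):

from dstack._internal.core.backends.gcp.resources import (
    find_accelerator_name,
    get_resource_project,
    sanitize_filter_value,
)

# Map a GPU model and memory size to a GCP accelerator type, or None if unsupported.
find_accelerator_name("T4", 16 * 1024)       # "nvidia-tesla-t4"
find_accelerator_name("MI300X", 192 * 1024)  # None, not a GCP accelerator

# Escape a value before embedding it in a Compute Engine API filter expression.
sanitize_filter_value('my"reservation')      # 'my\\"reservation'

# Pull the project ID out of a resource self-link.
get_resource_project(
    "https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name"
)                                            # "proj-id"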
@@ -123,6 +149,7 @@ def create_instance_struct(
     roce_subnetworks: Optional[List[Tuple[str, str]]] = None,
     allocate_public_ip: bool = True,
     placement_policy: Optional[str] = None,
+    reservation: Optional[compute_v1.Reservation] = None,
 ) -> compute_v1.Instance:
     instance = compute_v1.Instance()
     instance.name = instance_name
@@ -147,6 +174,25 @@ def create_instance_struct(
         initialize_params.disk_type = f"zones/{zone}/diskTypes/hyperdisk-balanced"
     disk.initialize_params = initialize_params
     instance.disks = [disk]
+    if (
+        reservation is not None
+        and reservation.specific_reservation is not None
+        and reservation.specific_reservation.instance_properties is not None
+        and reservation.specific_reservation.instance_properties.local_ssds is not None
+    ):
+        for local_ssd in reservation.specific_reservation.instance_properties.local_ssds:
+            instance.disks.append(
+                compute_v1.AttachedDisk(
+                    auto_delete=True,
+                    boot=False,
+                    type_="SCRATCH",
+                    initialize_params=compute_v1.AttachedDiskInitializeParams(
+                        disk_type=f"zones/{zone}/diskTypes/local-ssd",
+                        disk_size_gb=local_ssd.disk_size_gb,
+                    ),
+                    interface=local_ssd.interface,
+                )
+            )
 
     if accelerators:
         instance.guest_accelerators = accelerators
@@ -162,6 +208,8 @@ def create_instance_struct(
 
     if placement_policy is not None:
         instance.resource_policies = [placement_policy]
+    elif reservation is not None and "placement" in reservation.resource_policies:
+        instance.resource_policies = [reservation.resource_policies["placement"]]
 
     if spot:
         instance.scheduling = compute_v1.Scheduling()
@@ -187,6 +235,17 @@ def create_instance_struct(
         )
     ]
 
+    if reservation is not None:
+        reservation_project = get_resource_project(reservation.self_link)
+        instance.reservation_affinity = compute_v1.ReservationAffinity()
+        instance.reservation_affinity.consume_reservation_type = (
+            compute_v1.ReservationAffinity.ConsumeReservationType.SPECIFIC_RESERVATION.name
+        )
+        instance.reservation_affinity.key = "compute.googleapis.com/reservation-name"
+        instance.reservation_affinity.values = [
+            f"projects/{reservation_project}/reservations/{reservation.name}"
+        ]
+
     return instance
 
 
@@ -350,11 +409,8 @@ def get_accelerators(
         return []
     accelerator_config = compute_v1.AcceleratorConfig()
     accelerator_config.accelerator_count = len(gpus)
-    for acc in supported_accelerators:
-        if gpus[0].name == acc["gpu_name"] and gpus[0].memory_mib == acc["memory_mb"]:
-            accelerator_name = acc["accelerator_name"]
-            break
-    else:
+    accelerator_name = find_accelerator_name(gpus[0].name, gpus[0].memory_mib)
+    if accelerator_name is None:
         raise ValueError(f"Unsupported GPU: {gpus[0].name} {gpus[0].memory_mib} MiB")
     accelerator_config.accelerator_type = (
         f"projects/{project_id}/zones/{zone}/acceleratorTypes/{accelerator_name}"
@@ -362,6 +418,31 @@ def get_accelerators(
     return [accelerator_config]
 
 
+def find_reservation(
+    reservations_client: compute_v1.ReservationsClient,
+    project_id: str,
+    name: str,
+) -> dict[str, compute_v1.Reservation]:
+    request = compute_v1.AggregatedListReservationsRequest(
+        project=project_id,
+        filter=(
+            f'(name = "{sanitize_filter_value(name)}")'
+            ' AND (status = "READY")'
+            " AND (specificReservationRequired = true)"
+        ),
+    )
+    try:
+        aggregated_reservations = reservations_client.aggregated_list(request=request)
+    except (google.api_core.exceptions.NotFound, google.api_core.exceptions.Forbidden) as e:
+        logger.warning("Could not find reservation: %s", e)
+        return {}
+    zone_to_reservation = {}
+    for zone, zone_reservations in aggregated_reservations:
+        if zone_reservations.reservations:
+            zone_to_reservation[zone.split("/")[-1]] = zone_reservations.reservations[0]
+    return zone_to_reservation
+
+
 def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
     filtered_labels = {}
     for k, v in labels.items():
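A rough sketch of how the reservation pieces fit together; the actual call site is in gcp/compute.py (also changed in this release but not shown here), so the wiring below is an assumption:

from google.cloud import compute_v1

from dstack._internal.core.backends.gcp.resources import find_reservation  # path per the file list

# Returns {zone: Reservation} for a READY reservation with specificReservationRequired=true.
zone_to_reservation = find_reservation(
    reservations_client=compute_v1.ReservationsClient(),
    project_id="my-project",        # assumed value
    name="my-reservation",          # assumed value
)
reservation = zone_to_reservation.get("us-central1-a")
# The reservation (if any) can then be passed to create_instance_struct(reservation=...),
# which, per the hunks above, attaches matching SCRATCH local SSDs, reuses the reservation's
# "placement" resource policy, and sets reservation_affinity to SPECIFIC_RESERVATION.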
@@ -499,5 +580,6 @@ def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
             "h3-",
             "v6e",
             "a4-",
+            "g4-",
         ]
     )
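A hedged illustration of the effect of adding "g4-" to the prefix list; the full function body is not shown in this hunk, so the expected results assume the list above is what it matches against, and the machine type names are illustrative:

from dstack._internal.core.backends.gcp.resources import instance_type_supports_persistent_disk

instance_type_supports_persistent_disk("g4-standard-48")  # expected False after this change
instance_type_supports_persistent_disk("a4-highgpu-8g")   # already False before it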
dstack/_internal/core/backends/hotaisle/compute.py

@@ -52,7 +52,7 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
-    "4x MI300X 52x Xeon Platinum
+    "4x MI300X 52x Xeon Platinum 8470": {
         "cpu_model": "Xeon Platinum 8470",
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
@@ -62,6 +62,16 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
+    "8x MI300X 104x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
 
 
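These hunks are data-only; for reference, a lookup against the expanded table (the consuming code in hotaisle/compute.py is not part of this diff):

from dstack._internal.core.backends.hotaisle.compute import INSTANCE_TYPE_SPECS

specs = INSTANCE_TYPE_SPECS["8x MI300X 104x Xeon Platinum 8462Y+"]
assert specs["cpu_model"] == "Xeon Platinum 8462Y+"
assert specs["cpu_frequency"] == 2_800_000_000  # Hz
assert specs["cpu_manufacturer"] == "Intel"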
dstack/_internal/core/backends/kubernetes/compute.py

@@ -5,7 +5,7 @@ import time
 from enum import Enum
 from typing import List, Optional, Tuple
 
-from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
+from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
@@ -59,19 +59,31 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 JUMP_POD_SSH_PORT = 22
-
-NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
-NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+DUMMY_REGION = "-"
 
 NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
-NVIDIA_GPU_COUNT_LABEL = f"{NVIDIA_GPU_RESOURCE}.count"
-NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
 NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+
+AMD_GPU_RESOURCE = "amd.com/gpu"
+AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE
+# The oldest but still supported label format, the safest option, see the commit message:
+# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48
+# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs
+# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue:
+# https://github.com/ROCm/k8s-device-plugin/issues/112
+AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id."
 
 # Taints we know and tolerate when creating our objects, e.g., the jump pod.
-TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT,)
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT)
 
-
+NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
+NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+
+AMD_GPU_DEVICE_ID_TO_GPU_INFO = {
+    device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids
+}
+AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS}
 
 
 class Operator(str, Enum):
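A small sketch of how the new AMD constants are meant to be read, using the example label from the comment above; whether a given device id resolves depends on gpuhunt's KNOWN_AMD_GPUS tables:

from dstack._internal.core.backends.kubernetes.compute import (  # path per the file list
    AMD_GPU_DEVICE_ID_LABEL_PREFIX,
    AMD_GPU_DEVICE_ID_TO_GPU_INFO,
)

label = "beta.amd.com/gpu.device-id.74b5"  # hypothetical node label from the ROCm device plugin
assert label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX)
device_id = int(label.rpartition(".")[-1], 16)           # 0x74b5
gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)  # gpuhunt info for MI300X VF, or None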
@@ -112,21 +124,15 @@ class KubernetesCompute(
         nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
         for node in nodes:
             try:
-                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
                 name = get_value(node, ".metadata.name", str, required=True)
-                cpus = _parse_cpu(
-                    get_value(node, ".status.allocatable['cpu']", str, required=True)
-                )
                 cpu_arch = normalize_arch(
                     get_value(node, ".status.node_info.architecture", str)
                 ).to_cpu_architecture()
-                memory_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['memory']", str, required=True)
-                )
-
-                disk_size_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
-                )
+                allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+                cpus = _parse_cpu(allocatable["cpu"])
+                memory_mib = _parse_memory(allocatable["memory"])
+                disk_size_mib = _parse_memory(allocatable["ephemeral-storage"])
+                gpus = _get_node_gpus(node)
             except (AttributeError, KeyError, ValueError) as e:
                 logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
                 continue
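For context, .status.allocatable on a node is a plain string-to-string map, so the consolidated read above works against entries like these (values illustrative; _parse_cpu/_parse_memory are this module's own parsers and are not shown in the diff):

allocatable = {
    "cpu": "31850m",                   # millicores
    "memory": "196608000Ki",
    "ephemeral-storage": "426453735699",
    "amd.com/gpu": "4",                # GPU counts surface as extended resources
}
# cpus = _parse_cpu(allocatable["cpu"])
# memory_mib = _parse_memory(allocatable["memory"])
# gpus = _get_node_gpus(node)  # also consults node labels; see the helpers added below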
@@ -161,6 +167,7 @@ class KubernetesCompute(
         volumes: List[Volume],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name_for_job(run, job)
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
@@ -217,59 +224,18 @@ class KubernetesCompute(
                     "GPU is requested but the offer has no GPUs:"
                     f" {gpu_spec=} {instance_offer=}",
                 )
-
-
-            # We cannot generate an expected GPU label value from the Gpu model instance
-            # as the actual values may have additional components (socket, memory type, etc.)
-            # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
-            # Moreover, a single Gpu may match multiple label values.
-            # As a workaround, we iterate and process all node labels once again (we already
-            # processed them in `get_offers_by_requirements()`).
-            node_list = call_api_method(
-                self.api.list_node,
-                client.V1NodeList,
-            )
-            nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
-            for node in nodes:
-                labels = get_value(node, ".metadata.labels", dict[str, str])
-                if not labels:
-                    continue
-                gpus, gpu_label_value = _get_gpus_from_node_labels(labels)
-                if not gpus or gpu_label_value is None:
-                    continue
-                if gpus[0] == offer_gpu:
-                    matching_gpu_label_values.add(gpu_label_value)
-            if not matching_gpu_label_values:
-                raise ComputeError(
-                    f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
-                )
-            logger.debug(
-                "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
-            )
-            # TODO: support other GPU vendors
-            resources_requests[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            resources_limits[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            node_affinity = client.V1NodeAffinity(
-                required_during_scheduling_ignored_during_execution=[
-                    client.V1NodeSelectorTerm(
-                        match_expressions=[
-                            client.V1NodeSelectorRequirement(
-                                key=NVIDIA_GPU_PRODUCT_LABEL,
-                                operator=Operator.IN,
-                                values=list(matching_gpu_label_values),
-                            ),
-                        ],
-                    ),
-                ],
+            gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu(
+                self.api, offer_gpus[0]
             )
+            logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min)
+            resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min)
             # It should be NoSchedule, but we also add NoExecute toleration just in case.
             for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
                 tolerations.append(
                     client.V1Toleration(
-                        key=
+                        key=node_taint, operator=Operator.EXISTS, effect=effect
                     )
                 )
-
         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
         if (
@@ -331,7 +297,9 @@ class KubernetesCompute(
                         volume_mounts=volume_mounts,
                     )
                 ],
-                affinity=
+                affinity=client.V1Affinity(
+                    node_affinity=node_affinity,
+                ),
                 tolerations=tolerations,
                 volumes=volumes_,
             ),
@@ -550,34 +518,144 @@ def _render_memory(memory: Memory) -> str:
     return f"{float(memory)}Gi"
 
 
-def
+def _get_node_gpus(node: client.V1Node) -> list[Gpu]:
+    node_name = get_value(node, ".metadata.name", str, required=True)
+    allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+    labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+    for gpu_resource, gpu_getter in (
+        (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels),
+        (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels),
+    ):
+        _gpu_count = allocatable.get(gpu_resource)
+        if not _gpu_count:
+            continue
+        gpu_count = int(_gpu_count)
+        if gpu_count < 1:
+            continue
+        gpu = gpu_getter(labels)
+        if gpu is None:
+            logger.warning(
+                "Node %s: GPU resource found, but failed to detect its model: %s=%d",
+                node_name,
+                gpu_resource,
+                gpu_count,
+            )
+            return []
+        return [gpu] * gpu_count
+    logger.debug("Node %s: no GPU resource found", node_name)
+    return []
+
+
+def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
     # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
     # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
-    # TODO: support other GPU vendors
-    gpu_count = labels.get(NVIDIA_GPU_COUNT_LABEL)
     gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
-    if
-    return
-
-
-    for known_gpu_name in NVIDIA_GPU_NAMES:
-        if known_gpu_name.lower() in gpu_product.lower().split("-"):
-            gpu_name = known_gpu_name
+    if gpu_product is None:
+        return None
+    for gpu_name in NVIDIA_GPU_NAMES:
+        if gpu_name.lower() in gpu_product.lower().split("-"):
             break
-
-    return
+    else:
+        return None
     gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
     gpu_memory = gpu_info.memory * 1024
     # A100 may come in two variants
     if "40GB" in gpu_product:
         gpu_memory = 40 * 1024
-
-
-
-
-
+    return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
+
+
+def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
+    # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs
+    gpus: set[tuple[str, int]] = set()
+    for label in labels:
+        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
+            continue
+        _, _, _device_id = label.rpartition(".")
+        device_id = int(_device_id, 16)
+        gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)
+        if gpu_info is None:
+            logger.warning("Unknown AMD GPU device id: %X", device_id)
+            continue
+        gpus.add((gpu_info.name, gpu_info.memory))
+    if not gpus:
+        return None
+    if len(gpus) == 1:
+        gpu_name, gpu_memory_gib = next(iter(gpus))
+        return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024)
+    logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus)
+    return None
+
+
+def _get_pod_spec_parameters_for_gpu(
+    api: client.CoreV1Api, gpu: Gpu
+) -> tuple[str, client.V1NodeAffinity, str]:
+    gpu_vendor = gpu.vendor
+    assert gpu_vendor is not None
+    if gpu_vendor == AcceleratorVendor.NVIDIA:
+        node_affinity = _get_nvidia_gpu_node_affinity(api, gpu)
+        return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT
+    if gpu_vendor == AcceleratorVendor.AMD:
+        node_affinity = _get_amd_gpu_node_affinity(gpu)
+        return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT
+    raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}")
+
+
+def _get_nvidia_gpu_node_affinity(api: client.CoreV1Api, gpu: Gpu) -> client.V1NodeAffinity:
+    matching_gpu_label_values: set[str] = set()
+    # We cannot generate an expected GPU label value from the Gpu model instance
+    # as the actual values may have additional components (socket, memory type, etc.)
+    # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+    # Moreover, a single Gpu may match multiple label values.
+    # As a workaround, we iterate and process all node labels once again (we already
+    # processed them in `get_offers_by_requirements()`).
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    for node in nodes:
+        labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+        if _get_nvidia_gpu_from_node_labels(labels) == gpu:
+            matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL])
+    if not matching_gpu_label_values:
+        raise ComputeError(f"NVIDIA GPU is requested but no matching GPU labels found: {gpu=}")
+    logger.debug("Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu.name)
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=NVIDIA_GPU_PRODUCT_LABEL,
+                            operator=Operator.IN,
+                            values=list(matching_gpu_label_values),
+                        ),
+                    ],
+                ),
+            ],
+        ),
+    )
+
+
+def _get_amd_gpu_node_affinity(gpu: Gpu) -> client.V1NodeAffinity:
+    device_ids = AMD_GPU_NAME_TO_DEVICE_IDS.get(gpu.name)
+    if device_ids is None:
+        raise ComputeError(f"AMD GPU is requested but no matching device ids found: {gpu=}")
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}",
+                            operator=Operator.EXISTS,
+                        ),
+                    ],
+                )
+                for device_id in device_ids
+            ],
+        ),
+    )
 
 
 def _continue_setup_jump_pod(
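Putting the new helpers together, a sketch of the per-vendor wiring that run_job now relies on (the GPU name and memory below are assumptions for illustration, using the helpers defined just above):

from gpuhunt import AcceleratorVendor

from dstack._internal.core.models.instances import Gpu  # model path assumed

gpu = Gpu(vendor=AcceleratorVendor.AMD, name="MI300X", memory_mib=192 * 1024)
affinity = _get_amd_gpu_node_affinity(gpu)
# -> V1NodeAffinity requiring a node that carries one of the beta.amd.com/gpu.device-id.<id>
#    labels for the device ids gpuhunt knows for "MI300X"; raises ComputeError if none are known.
# For NVIDIA offers, _get_pod_spec_parameters_for_gpu() instead returns the "nvidia.com/gpu"
# resource/taint and an IN-match affinity on nvidia.com/gpu.product label values.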
dstack/_internal/core/backends/kubernetes/models.py

@@ -37,7 +37,7 @@ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
 
 
 class KubeconfigFileConfig(CoreModel):
-    filename: Annotated[str, Field(description="The path to the kubeconfig file")]
+    filename: Annotated[str, Field(description="The path to the kubeconfig file")] = ""
     data: Annotated[
         Optional[str],
         Field(
@@ -50,7 +50,9 @@ class KubeconfigFileConfig(CoreModel):
     ] = None
 
     @root_validator
-    def fill_data(cls, values):
+    def fill_data(cls, values: dict) -> dict:
+        if values.get("filename") == "" and values.get("data") is None:
+            raise ValueError("filename or data must be specified")
         return fill_data(values)
 
 
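A sketch of the resulting validation behavior (assuming pydantic-style CoreModel semantics and that the shared fill_data helper leaves explicitly provided data untouched):

from dstack._internal.core.backends.kubernetes.models import KubeconfigFileConfig  # path per the file list

KubeconfigFileConfig(data="<kubeconfig contents>")  # ok: filename now defaults to ""
KubeconfigFileConfig()                              # validation error: "filename or data must be specified"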
dstack/_internal/core/backends/nebius/compute.py

@@ -2,8 +2,9 @@ import json
 import random
 import shlex
 import time
+from collections.abc import Iterable
 from functools import cached_property
-from typing import
+from typing import List, Optional
 
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -21,7 +22,11 @@ from dstack._internal.core.backends.base.compute import (
     get_user_data,
     merge_tags,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -125,10 +130,8 @@ class NebiusCompute(
             for offer in offers
         ]
 
-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def create_instance(
         self,
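The removed return annotation hints at what OfferModifier is; a hedged reading (the real alias lives in base/offers.py, which is not part of this diff):

from typing import Callable, Optional

from dstack._internal.core.models.instances import InstanceOfferWithAvailability  # path assumed

OfferModifier = Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
# get_offers_modifiers() now returns a list of such callables instead of a single modifier,
# e.g. [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)].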
dstack/_internal/core/backends/oci/compute.py

@@ -1,6 +1,7 @@
+from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property
-from typing import
+from typing import List, Optional
 
 import oci
 
@@ -13,7 +14,11 @@ from dstack._internal.core.backends.base.compute import (
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.models import OCIConfig
 from dstack._internal.core.backends.oci.region import make_region_clients_map
@@ -96,10 +101,8 @@ class OCICompute(
 
         return offers_with_availability
 
-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
dstack/_internal/core/backends/runpod/compute.py

@@ -1,7 +1,8 @@
 import json
 import uuid
+from collections.abc import Iterable
 from datetime import timedelta
-from typing import
+from typing import List, Optional
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
@@ -12,7 +13,11 @@ from dstack._internal.core.backends.base.compute import (
     get_docker_commands,
     get_job_instance_name,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -72,10 +77,8 @@ class RunpodCompute(
         ]
         return offers
 
-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def run_job(
         self,
@@ -86,6 +89,7 @@ class RunpodCompute(
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        assert run.run_spec.ssh_key_pub is not None
        instance_config = InstanceConfiguration(
             project_name=run.project_name,
             instance_name=get_job_instance_name(run, job),
@@ -228,9 +232,12 @@ class RunpodCompute(
     def create_volume(self, volume: Volume) -> VolumeProvisioningData:
         volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
         size_gb = volume.configuration.size_gb
+        # Runpod regions must be uppercase.
+        # Lowercase regions are accepted in the API but they break Runpod in several ways.
+        region = volume.configuration.region.upper()
         volume_id = self.api_client.create_network_volume(
             name=volume_name,
-            region=
+            region=region,
             size=size_gb,
         )
         return VolumeProvisioningData(
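The practical effect of the create_volume change, sketched with an illustrative region value:

region = "eu-se-1".upper()  # -> "EU-SE-1"; lowercase regions were previously sent as-is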
dstack/_internal/core/backends/vastai/compute.py

@@ -47,7 +47,7 @@ class VastAICompute(
                 "reliability2": {"gte": 0.9},
                 "inet_down": {"gt": 128},
                 "verified": {"eq": True},
-                "cuda_max_good": {"gte": 12.
+                "cuda_max_good": {"gte": 12.8},
                 "compute_cap": {"gte": 600},
             }
         )
@@ -58,6 +58,7 @@ class VastAICompute(
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,
+            locations=self.config.regions or None,
             requirements=requirements,
             # TODO(egor-s): spots currently not supported
             extra_filter=lambda offer: not offer.instance.resources.spot,
@@ -85,6 +86,7 @@ class VastAICompute(
         instance_name = generate_unique_instance_name_for_job(
             run, job, max_length=MAX_INSTANCE_NAME_LEN
         )
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )