dstack 0.19.32__py3-none-any.whl → 0.19.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (54)
  1. dstack/_internal/cli/commands/offer.py +1 -1
  2. dstack/_internal/cli/services/configurators/run.py +1 -5
  3. dstack/_internal/core/backends/aws/compute.py +8 -5
  4. dstack/_internal/core/backends/azure/compute.py +9 -6
  5. dstack/_internal/core/backends/base/compute.py +40 -17
  6. dstack/_internal/core/backends/base/offers.py +7 -1
  7. dstack/_internal/core/backends/datacrunch/compute.py +9 -6
  8. dstack/_internal/core/backends/gcp/compute.py +151 -6
  9. dstack/_internal/core/backends/gcp/models.py +10 -0
  10. dstack/_internal/core/backends/gcp/resources.py +87 -5
  11. dstack/_internal/core/backends/hotaisle/compute.py +11 -1
  12. dstack/_internal/core/backends/kubernetes/compute.py +161 -83
  13. dstack/_internal/core/backends/kubernetes/models.py +4 -2
  14. dstack/_internal/core/backends/nebius/compute.py +9 -6
  15. dstack/_internal/core/backends/oci/compute.py +9 -6
  16. dstack/_internal/core/backends/runpod/compute.py +14 -7
  17. dstack/_internal/core/backends/vastai/compute.py +3 -1
  18. dstack/_internal/core/backends/vastai/configurator.py +0 -1
  19. dstack/_internal/core/compatibility/runs.py +25 -4
  20. dstack/_internal/core/models/fleets.py +1 -1
  21. dstack/_internal/core/models/instances.py +2 -1
  22. dstack/_internal/core/models/profiles.py +1 -1
  23. dstack/_internal/core/models/runs.py +4 -2
  24. dstack/_internal/core/models/users.py +10 -0
  25. dstack/_internal/core/services/configs/__init__.py +1 -0
  26. dstack/_internal/core/services/ssh/key_manager.py +56 -0
  27. dstack/_internal/server/background/tasks/process_instances.py +5 -1
  28. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
  29. dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
  30. dstack/_internal/server/models.py +6 -0
  31. dstack/_internal/server/routers/metrics.py +6 -2
  32. dstack/_internal/server/routers/runs.py +5 -1
  33. dstack/_internal/server/routers/users.py +21 -2
  34. dstack/_internal/server/services/jobs/__init__.py +18 -9
  35. dstack/_internal/server/services/offers.py +1 -0
  36. dstack/_internal/server/services/runs.py +13 -4
  37. dstack/_internal/server/services/users.py +35 -2
  38. dstack/_internal/server/statics/index.html +1 -1
  39. dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
  40. dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-e79754c136f1d8e4e7e6.js} +12632 -8039
  41. dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-e79754c136f1d8e4e7e6.js.map} +1 -1
  42. dstack/_internal/server/testing/common.py +4 -0
  43. dstack/api/_public/__init__.py +8 -11
  44. dstack/api/_public/repos.py +0 -21
  45. dstack/api/_public/runs.py +61 -9
  46. dstack/api/server/__init__.py +4 -0
  47. dstack/api/server/_users.py +17 -2
  48. dstack/version.py +2 -2
  49. {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/METADATA +2 -2
  50. {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/RECORD +53 -51
  51. dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
  52. {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/WHEEL +0 -0
  53. {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/entry_points.txt +0 -0
  54. {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/backends/gcp/resources.py
@@ -26,9 +26,35 @@ supported_accelerators = [
     {"accelerator_name": "nvidia-tesla-t4", "gpu_name": "T4", "memory_mb": 1024 * 16},
     {"accelerator_name": "nvidia-tesla-v100", "gpu_name": "V100", "memory_mb": 1024 * 16},
     {"accelerator_name": "nvidia-tesla-p100", "gpu_name": "P100", "memory_mb": 1024 * 16},
+    {"accelerator_name": "nvidia-rtx-pro-6000", "gpu_name": "RTXPRO6000", "memory_mb": 1024 * 96},
 ]
 
 
+def find_accelerator_name(gpu_name: str, memory_mib: int) -> Optional[str]:
+    for acc in supported_accelerators:
+        if gpu_name == acc["gpu_name"] and memory_mib == acc["memory_mb"]:
+            return acc["accelerator_name"]
+    return None
+
+
+def sanitize_filter_value(value: str) -> str:
+    """
+    Escape characters that could break the Compute Engine API filter string.
+    """
+    return value.replace("\\", "\\\\").replace('"', '\\"')
+
+
+def get_resource_project(resource_url: str) -> str:
+    """
+    Extract the project ID from a URL like
+    https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name
+    """
+    matches = re.findall(r"/projects/(?P<project_id>[a-z0-9-]+)/", resource_url)
+    if not matches:
+        raise BackendError(f"Invalid resource URL {resource_url}")
+    return matches[0]
+
+
 def get_availability_zones(
     regions_client: compute_v1.RegionsClient,
     project_id: str,
@@ -123,6 +149,7 @@ def create_instance_struct(
     roce_subnetworks: Optional[List[Tuple[str, str]]] = None,
     allocate_public_ip: bool = True,
     placement_policy: Optional[str] = None,
+    reservation: Optional[compute_v1.Reservation] = None,
 ) -> compute_v1.Instance:
     instance = compute_v1.Instance()
     instance.name = instance_name
@@ -147,6 +174,25 @@ def create_instance_struct(
         initialize_params.disk_type = f"zones/{zone}/diskTypes/hyperdisk-balanced"
     disk.initialize_params = initialize_params
     instance.disks = [disk]
+    if (
+        reservation is not None
+        and reservation.specific_reservation is not None
+        and reservation.specific_reservation.instance_properties is not None
+        and reservation.specific_reservation.instance_properties.local_ssds is not None
+    ):
+        for local_ssd in reservation.specific_reservation.instance_properties.local_ssds:
+            instance.disks.append(
+                compute_v1.AttachedDisk(
+                    auto_delete=True,
+                    boot=False,
+                    type_="SCRATCH",
+                    initialize_params=compute_v1.AttachedDiskInitializeParams(
+                        disk_type=f"zones/{zone}/diskTypes/local-ssd",
+                        disk_size_gb=local_ssd.disk_size_gb,
+                    ),
+                    interface=local_ssd.interface,
+                )
+            )
 
     if accelerators:
         instance.guest_accelerators = accelerators
@@ -162,6 +208,8 @@ def create_instance_struct(
 
     if placement_policy is not None:
         instance.resource_policies = [placement_policy]
+    elif reservation is not None and "placement" in reservation.resource_policies:
+        instance.resource_policies = [reservation.resource_policies["placement"]]
 
     if spot:
         instance.scheduling = compute_v1.Scheduling()
@@ -187,6 +235,17 @@ def create_instance_struct(
         )
     ]
 
+    if reservation is not None:
+        reservation_project = get_resource_project(reservation.self_link)
+        instance.reservation_affinity = compute_v1.ReservationAffinity()
+        instance.reservation_affinity.consume_reservation_type = (
+            compute_v1.ReservationAffinity.ConsumeReservationType.SPECIFIC_RESERVATION.name
+        )
+        instance.reservation_affinity.key = "compute.googleapis.com/reservation-name"
+        instance.reservation_affinity.values = [
+            f"projects/{reservation_project}/reservations/{reservation.name}"
+        ]
+
     return instance
 
 
@@ -350,11 +409,8 @@ def get_accelerators(
         return []
     accelerator_config = compute_v1.AcceleratorConfig()
     accelerator_config.accelerator_count = len(gpus)
-    for acc in supported_accelerators:
-        if gpus[0].name == acc["gpu_name"] and gpus[0].memory_mib == acc["memory_mb"]:
-            accelerator_name = acc["accelerator_name"]
-            break
-    else:
+    accelerator_name = find_accelerator_name(gpus[0].name, gpus[0].memory_mib)
+    if accelerator_name is None:
         raise ValueError(f"Unsupported GPU: {gpus[0].name} {gpus[0].memory_mib} MiB")
     accelerator_config.accelerator_type = (
         f"projects/{project_id}/zones/{zone}/acceleratorTypes/{accelerator_name}"
@@ -362,6 +418,31 @@ def get_accelerators(
     return [accelerator_config]
 
 
+def find_reservation(
+    reservations_client: compute_v1.ReservationsClient,
+    project_id: str,
+    name: str,
+) -> dict[str, compute_v1.Reservation]:
+    request = compute_v1.AggregatedListReservationsRequest(
+        project=project_id,
+        filter=(
+            f'(name = "{sanitize_filter_value(name)}")'
+            ' AND (status = "READY")'
+            " AND (specificReservationRequired = true)"
+        ),
+    )
+    try:
+        aggregated_reservations = reservations_client.aggregated_list(request=request)
+    except (google.api_core.exceptions.NotFound, google.api_core.exceptions.Forbidden) as e:
+        logger.warning("Could not find reservation: %s", e)
+        return {}
+    zone_to_reservation = {}
+    for zone, zone_reservations in aggregated_reservations:
+        if zone_reservations.reservations:
+            zone_to_reservation[zone.split("/")[-1]] = zone_reservations.reservations[0]
+    return zone_to_reservation
+
+
 def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
     filtered_labels = {}
     for k, v in labels.items():
@@ -499,5 +580,6 @@ def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
             "h3-",
             "v6e",
             "a4-",
+            "g4-",
         ]
     )
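
The helpers introduced above are small pure functions, so their behavior can be read off the hunks directly. A minimal usage sketch (the module path is taken from the file list above; return values follow from the code shown):

from dstack._internal.core.backends.gcp import resources as gcp_resources

# Maps a dstack GPU name/memory pair to a Compute Engine accelerator type.
gcp_resources.find_accelerator_name("T4", 16 * 1024)   # -> "nvidia-tesla-t4"
gcp_resources.find_accelerator_name("T4", 32 * 1024)   # -> None (not in supported_accelerators)

# Escapes backslashes and quotes before embedding a value in an API filter string.
gcp_resources.sanitize_filter_value('my"reservation')  # -> 'my\\"reservation'

# Extracts the project ID from a resource self-link.
gcp_resources.get_resource_project(
    "https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name"
)  # -> "proj-id"

When a reservation returned by find_reservation() is passed to create_instance_struct(), the reservation block above amounts to a specific-reservation affinity roughly like the following sketch (project and reservation names are hypothetical):

from google.cloud import compute_v1

affinity = compute_v1.ReservationAffinity(
    consume_reservation_type="SPECIFIC_RESERVATION",
    key="compute.googleapis.com/reservation-name",
    values=["projects/my-project/reservations/my-reservation"],
)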

dstack/_internal/core/backends/hotaisle/compute.py
@@ -52,7 +52,7 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
-    "4x MI300X 52x Xeon Platinum 8462Y": {
+    "4x MI300X 52x Xeon Platinum 8470": {
         "cpu_model": "Xeon Platinum 8470",
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
@@ -62,6 +62,16 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
+    "8x MI300X 104x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
 
 

dstack/_internal/core/backends/kubernetes/compute.py
@@ -5,7 +5,7 @@ import time
 from enum import Enum
 from typing import List, Optional, Tuple
 
-from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
+from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
@@ -59,19 +59,31 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 JUMP_POD_SSH_PORT = 22
-
-NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
-NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+DUMMY_REGION = "-"
 
 NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
-NVIDIA_GPU_COUNT_LABEL = f"{NVIDIA_GPU_RESOURCE}.count"
-NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
 NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+
+AMD_GPU_RESOURCE = "amd.com/gpu"
+AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE
+# The oldest but still supported label format, the safest option, see the commit message:
+# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48
+# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs
+# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue:
+# https://github.com/ROCm/k8s-device-plugin/issues/112
+AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id."
 
 # Taints we know and tolerate when creating our objects, e.g., the jump pod.
-TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT,)
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT)
 
-DUMMY_REGION = "-"
+NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
+NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+
+AMD_GPU_DEVICE_ID_TO_GPU_INFO = {
+    device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids
+}
+AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS}
 
 
 class Operator(str, Enum):
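
As a concrete illustration of the label format described in the comments above (the device id 0x74b5 comes from the example in the hunk; whether a given id is known depends on gpuhunt's KNOWN_AMD_GPUS, and AMD_GPU_DEVICE_ID_TO_GPU_INFO is the mapping defined just above):

# A node exposing four MI300X VF GPUs would carry a label like:
#   beta.amd.com/gpu.device-id.74b5: "4"
label = "beta.amd.com/gpu.device-id.74b5"
device_id = int(label.rpartition(".")[-1], 16)            # 0x74b5
gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)   # gpuhunt AMDGPUInfo if the id is known, else None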
@@ -112,21 +124,15 @@ class KubernetesCompute(
         nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
         for node in nodes:
             try:
-                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
                 name = get_value(node, ".metadata.name", str, required=True)
-                cpus = _parse_cpu(
-                    get_value(node, ".status.allocatable['cpu']", str, required=True)
-                )
                 cpu_arch = normalize_arch(
                     get_value(node, ".status.node_info.architecture", str)
                 ).to_cpu_architecture()
-                memory_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['memory']", str, required=True)
-                )
-                gpus, _ = _get_gpus_from_node_labels(labels)
-                disk_size_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
-                )
+                allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+                cpus = _parse_cpu(allocatable["cpu"])
+                memory_mib = _parse_memory(allocatable["memory"])
+                disk_size_mib = _parse_memory(allocatable["ephemeral-storage"])
+                gpus = _get_node_gpus(node)
             except (AttributeError, KeyError, ValueError) as e:
                 logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
                 continue
@@ -161,6 +167,7 @@ class KubernetesCompute(
         volumes: List[Volume],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name_for_job(run, job)
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
            [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
@@ -217,59 +224,18 @@ class KubernetesCompute(
                     "GPU is requested but the offer has no GPUs:"
                     f" {gpu_spec=} {instance_offer=}",
                 )
-            offer_gpu = offer_gpus[0]
-            matching_gpu_label_values: set[str] = set()
-            # We cannot generate an expected GPU label value from the Gpu model instance
-            # as the actual values may have additional components (socket, memory type, etc.)
-            # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
-            # Moreover, a single Gpu may match multiple label values.
-            # As a workaround, we iterate and process all node labels once again (we already
-            # processed them in `get_offers_by_requirements()`).
-            node_list = call_api_method(
-                self.api.list_node,
-                client.V1NodeList,
-            )
-            nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
-            for node in nodes:
-                labels = get_value(node, ".metadata.labels", dict[str, str])
-                if not labels:
-                    continue
-                gpus, gpu_label_value = _get_gpus_from_node_labels(labels)
-                if not gpus or gpu_label_value is None:
-                    continue
-                if gpus[0] == offer_gpu:
-                    matching_gpu_label_values.add(gpu_label_value)
-            if not matching_gpu_label_values:
-                raise ComputeError(
-                    f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
-                )
-            logger.debug(
-                "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
-            )
-            # TODO: support other GPU vendors
-            resources_requests[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            resources_limits[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            node_affinity = client.V1NodeAffinity(
-                required_during_scheduling_ignored_during_execution=[
-                    client.V1NodeSelectorTerm(
-                        match_expressions=[
-                            client.V1NodeSelectorRequirement(
-                                key=NVIDIA_GPU_PRODUCT_LABEL,
-                                operator=Operator.IN,
-                                values=list(matching_gpu_label_values),
-                            ),
-                        ],
-                    ),
-                ],
+            gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu(
+                self.api, offer_gpus[0]
             )
+            logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min)
+            resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min)
             # It should be NoSchedule, but we also add NoExecute toleration just in case.
             for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
                 tolerations.append(
                     client.V1Toleration(
-                        key=NVIDIA_GPU_NODE_TAINT, operator=Operator.EXISTS, effect=effect
+                        key=node_taint, operator=Operator.EXISTS, effect=effect
                     )
                 )
-
         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
         if (
@@ -331,7 +297,9 @@ class KubernetesCompute(
                     volume_mounts=volume_mounts,
                 )
             ],
-            affinity=node_affinity,
+            affinity=client.V1Affinity(
+                node_affinity=node_affinity,
+            ),
             tolerations=tolerations,
             volumes=volumes_,
         ),
@@ -550,34 +518,144 @@ def _render_memory(memory: Memory) -> str:
     return f"{float(memory)}Gi"
 
 
-def _get_gpus_from_node_labels(labels: dict[str, str]) -> tuple[list[Gpu], Optional[str]]:
+def _get_node_gpus(node: client.V1Node) -> list[Gpu]:
+    node_name = get_value(node, ".metadata.name", str, required=True)
+    allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+    labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+    for gpu_resource, gpu_getter in (
+        (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels),
+        (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels),
+    ):
+        _gpu_count = allocatable.get(gpu_resource)
+        if not _gpu_count:
+            continue
+        gpu_count = int(_gpu_count)
+        if gpu_count < 1:
+            continue
+        gpu = gpu_getter(labels)
+        if gpu is None:
+            logger.warning(
+                "Node %s: GPU resource found, but failed to detect its model: %s=%d",
+                node_name,
+                gpu_resource,
+                gpu_count,
+            )
+            return []
+        return [gpu] * gpu_count
+    logger.debug("Node %s: no GPU resource found", node_name)
+    return []
+
+
+def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
     # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
     # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
-    # TODO: support other GPU vendors
-    gpu_count = labels.get(NVIDIA_GPU_COUNT_LABEL)
     gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
-    if gpu_count is None or gpu_product is None:
-        return [], None
-    gpu_count = int(gpu_count)
-    gpu_name = None
-    for known_gpu_name in NVIDIA_GPU_NAMES:
-        if known_gpu_name.lower() in gpu_product.lower().split("-"):
-            gpu_name = known_gpu_name
+    if gpu_product is None:
+        return None
+    for gpu_name in NVIDIA_GPU_NAMES:
+        if gpu_name.lower() in gpu_product.lower().split("-"):
             break
-    if gpu_name is None:
-        return [], None
+    else:
+        return None
     gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
     gpu_memory = gpu_info.memory * 1024
     # A100 may come in two variants
     if "40GB" in gpu_product:
         gpu_memory = 40 * 1024
-    gpus = [
-        Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
-        for _ in range(gpu_count)
-    ]
-    return gpus, gpu_product
+    return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
+
+
+def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
+    # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs
+    gpus: set[tuple[str, int]] = set()
+    for label in labels:
+        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
+            continue
+        _, _, _device_id = label.rpartition(".")
+        device_id = int(_device_id, 16)
+        gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)
+        if gpu_info is None:
+            logger.warning("Unknown AMD GPU device id: %X", device_id)
+            continue
+        gpus.add((gpu_info.name, gpu_info.memory))
+    if not gpus:
+        return None
+    if len(gpus) == 1:
+        gpu_name, gpu_memory_gib = next(iter(gpus))
+        return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024)
+    logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus)
+    return None
+
+
+def _get_pod_spec_parameters_for_gpu(
+    api: client.CoreV1Api, gpu: Gpu
+) -> tuple[str, client.V1NodeAffinity, str]:
+    gpu_vendor = gpu.vendor
+    assert gpu_vendor is not None
+    if gpu_vendor == AcceleratorVendor.NVIDIA:
+        node_affinity = _get_nvidia_gpu_node_affinity(api, gpu)
+        return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT
+    if gpu_vendor == AcceleratorVendor.AMD:
+        node_affinity = _get_amd_gpu_node_affinity(gpu)
+        return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT
+    raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}")
+
+
+def _get_nvidia_gpu_node_affinity(api: client.CoreV1Api, gpu: Gpu) -> client.V1NodeAffinity:
+    matching_gpu_label_values: set[str] = set()
+    # We cannot generate an expected GPU label value from the Gpu model instance
+    # as the actual values may have additional components (socket, memory type, etc.)
+    # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+    # Moreover, a single Gpu may match multiple label values.
+    # As a workaround, we iterate and process all node labels once again (we already
+    # processed them in `get_offers_by_requirements()`).
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    for node in nodes:
+        labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+        if _get_nvidia_gpu_from_node_labels(labels) == gpu:
+            matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL])
+    if not matching_gpu_label_values:
+        raise ComputeError(f"NVIDIA GPU is requested but no matching GPU labels found: {gpu=}")
+    logger.debug("Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu.name)
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=NVIDIA_GPU_PRODUCT_LABEL,
+                            operator=Operator.IN,
+                            values=list(matching_gpu_label_values),
+                        ),
+                    ],
+                ),
+            ],
+        ),
+    )
+
+
+def _get_amd_gpu_node_affinity(gpu: Gpu) -> client.V1NodeAffinity:
+    device_ids = AMD_GPU_NAME_TO_DEVICE_IDS.get(gpu.name)
+    if device_ids is None:
+        raise ComputeError(f"AMD GPU is requested but no matching device ids found: {gpu=}")
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}",
+                            operator=Operator.EXISTS,
+                        ),
+                    ],
+                )
+                for device_id in device_ids
+            ],
+        ),
+    )
 
 
 def _continue_setup_jump_pod(
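
To make the selection logic concrete: _get_amd_gpu_node_affinity() above emits one node-selector term per known device id, and Kubernetes ORs nodeSelectorTerms, so a node exposing any of the listed ids is eligible. The result is roughly equivalent to the following sketch (the device ids here are placeholders for illustration):

from kubernetes import client

affinity = client.V1NodeAffinity(
    required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
        node_selector_terms=[
            client.V1NodeSelectorTerm(
                match_expressions=[
                    client.V1NodeSelectorRequirement(
                        key=f"beta.amd.com/gpu.device-id.{device_id:x}",
                        operator="Exists",
                    )
                ]
            )
            for device_id in (0x74A0, 0x74B5)  # placeholder device ids
        ]
    )
)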

dstack/_internal/core/backends/kubernetes/models.py
@@ -37,7 +37,7 @@ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
 
 
 class KubeconfigFileConfig(CoreModel):
-    filename: Annotated[str, Field(description="The path to the kubeconfig file")]
+    filename: Annotated[str, Field(description="The path to the kubeconfig file")] = ""
     data: Annotated[
         Optional[str],
         Field(
@@ -50,7 +50,9 @@ class KubeconfigFileConfig(CoreModel):
     ] = None
 
     @root_validator
-    def fill_data(cls, values):
+    def fill_data(cls, values: dict) -> dict:
+        if values.get("filename") == "" and values.get("data") is None:
+            raise ValueError("filename or data must be specified")
         return fill_data(values)
 
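With filename now defaulting to an empty string and the extra check in fill_data, a kubeconfig must be supplied either as a path or inline. A behavior sketch (the import path is assumed from the file list above, and pydantic surfaces the ValueError as a ValidationError):

from dstack._internal.core.backends.kubernetes.models import KubeconfigFileConfig

KubeconfigFileConfig(data="apiVersion: v1\nkind: Config\n...")  # ok: inline kubeconfig contents
KubeconfigFileConfig(filename="~/.kube/config")                 # ok, provided the file can be read
KubeconfigFileConfig()                                          # rejected: "filename or data must be specified"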
 

dstack/_internal/core/backends/nebius/compute.py
@@ -2,8 +2,9 @@ import json
 import random
 import shlex
 import time
+from collections.abc import Iterable
 from functools import cached_property
-from typing import Callable, List, Optional
+from typing import List, Optional
 
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -21,7 +22,11 @@ from dstack._internal.core.backends.base.compute import (
     get_user_data,
     merge_tags,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -125,10 +130,8 @@ class NebiusCompute(
             for offer in offers
         ]
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def create_instance(
         self,
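
The Nebius, OCI, and Runpod backends in this diff all switch from a single get_offers_modifier() hook to get_offers_modifiers() returning an iterable. A plausible reading of how the offers layer consumes such a list (not the actual base/offers.py implementation): each modifier may rewrite an offer or return None to drop it.

from typing import Callable, Iterable, Optional, TypeVar

Offer = TypeVar("Offer")  # stands in for InstanceOfferWithAvailability

def apply_offer_modifiers(
    offers: Iterable[Offer],
    modifiers: Iterable[Callable[[Offer], Optional[Offer]]],
) -> list[Offer]:
    result = []
    for offer in offers:
        for modify in modifiers:
            offer = modify(offer)
            if offer is None:
                break  # a modifier dropped this offer
        if offer is not None:
            result.append(offer)
    return result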

dstack/_internal/core/backends/oci/compute.py
@@ -1,6 +1,7 @@
+from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property
-from typing import Callable, List, Optional
+from typing import List, Optional
 
 import oci
 
@@ -13,7 +14,11 @@ from dstack._internal.core.backends.base.compute import (
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.models import OCIConfig
 from dstack._internal.core.backends.oci.region import make_region_clients_map
@@ -96,10 +101,8 @@ class OCICompute(
 
         return offers_with_availability
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None

dstack/_internal/core/backends/runpod/compute.py
@@ -1,7 +1,8 @@
 import json
 import uuid
+from collections.abc import Iterable
 from datetime import timedelta
-from typing import Callable, List, Optional
+from typing import List, Optional
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
@@ -12,7 +13,11 @@ from dstack._internal.core.backends.base.compute import (
     get_docker_commands,
     get_job_instance_name,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -72,10 +77,8 @@ class RunpodCompute(
         ]
         return offers
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def run_job(
         self,
@@ -86,6 +89,7 @@ class RunpodCompute(
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        assert run.run_spec.ssh_key_pub is not None
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
             instance_name=get_job_instance_name(run, job),
@@ -228,9 +232,12 @@ class RunpodCompute(
     def create_volume(self, volume: Volume) -> VolumeProvisioningData:
         volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
         size_gb = volume.configuration.size_gb
+        # Runpod regions must be uppercase.
+        # Lowercase regions are accepted in the API but they break Runpod in several ways.
+        region = volume.configuration.region.upper()
         volume_id = self.api_client.create_network_volume(
             name=volume_name,
-            region=volume.configuration.region,
+            region=region,
             size=size_gb,
         )
         return VolumeProvisioningData(

dstack/_internal/core/backends/vastai/compute.py
@@ -47,7 +47,7 @@ class VastAICompute(
                 "reliability2": {"gte": 0.9},
                 "inet_down": {"gt": 128},
                 "verified": {"eq": True},
-                "cuda_max_good": {"gte": 12.1},
+                "cuda_max_good": {"gte": 12.8},
                 "compute_cap": {"gte": 600},
             }
         )
@@ -58,6 +58,7 @@ class VastAICompute(
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,
+            locations=self.config.regions or None,
             requirements=requirements,
             # TODO(egor-s): spots currently not supported
             extra_filter=lambda offer: not offer.instance.resources.spot,
@@ -85,6 +86,7 @@ class VastAICompute(
         instance_name = generate_unique_instance_name_for_job(
             run, job, max_length=MAX_INSTANCE_NAME_LEN
         )
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
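
Minor note on the VastAI hunks above: offers are now also narrowed by the configured regions, and the `or None` keeps the previous behavior when no regions are set, since passing locations=None presumably leaves offers unfiltered by location. A tiny sketch of the idiom (region codes are illustrative):

regions = []                        # no regions configured
locations = regions or None         # -> None, i.e., no location filtering
regions = ["us-east", "eu-west"]    # hypothetical region codes
locations = regions or None         # -> ["us-east", "eu-west"]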

dstack/_internal/core/backends/vastai/configurator.py
@@ -18,7 +18,6 @@ from dstack._internal.core.models.backends.base import (
     BackendType,
 )
 
-# VastAI regions are dynamic, currently we don't offer any filtering
 REGIONS = []
 