dstack 0.19.32__py3-none-any.whl → 0.19.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic.

Files changed (46)
  1. dstack/_internal/cli/commands/offer.py +1 -1
  2. dstack/_internal/cli/services/configurators/run.py +1 -5
  3. dstack/_internal/core/backends/aws/compute.py +8 -5
  4. dstack/_internal/core/backends/azure/compute.py +9 -6
  5. dstack/_internal/core/backends/base/compute.py +40 -17
  6. dstack/_internal/core/backends/base/offers.py +5 -1
  7. dstack/_internal/core/backends/datacrunch/compute.py +9 -6
  8. dstack/_internal/core/backends/gcp/compute.py +137 -7
  9. dstack/_internal/core/backends/gcp/models.py +7 -0
  10. dstack/_internal/core/backends/gcp/resources.py +87 -5
  11. dstack/_internal/core/backends/hotaisle/compute.py +11 -1
  12. dstack/_internal/core/backends/kubernetes/compute.py +161 -83
  13. dstack/_internal/core/backends/kubernetes/models.py +4 -2
  14. dstack/_internal/core/backends/nebius/compute.py +9 -6
  15. dstack/_internal/core/backends/oci/compute.py +9 -6
  16. dstack/_internal/core/backends/runpod/compute.py +10 -6
  17. dstack/_internal/core/backends/vastai/compute.py +3 -1
  18. dstack/_internal/core/backends/vastai/configurator.py +0 -1
  19. dstack/_internal/core/models/fleets.py +1 -1
  20. dstack/_internal/core/models/profiles.py +1 -1
  21. dstack/_internal/core/models/runs.py +3 -2
  22. dstack/_internal/core/models/users.py +10 -0
  23. dstack/_internal/core/services/configs/__init__.py +1 -0
  24. dstack/_internal/server/background/tasks/process_instances.py +5 -1
  25. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
  26. dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
  27. dstack/_internal/server/models.py +3 -0
  28. dstack/_internal/server/routers/runs.py +5 -1
  29. dstack/_internal/server/routers/users.py +14 -2
  30. dstack/_internal/server/services/runs.py +9 -4
  31. dstack/_internal/server/services/users.py +35 -2
  32. dstack/_internal/server/statics/index.html +1 -1
  33. dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
  34. dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-97c7e184573ca23f9fe4.js} +12218 -7625
  35. dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-97c7e184573ca23f9fe4.js.map} +1 -1
  36. dstack/api/_public/__init__.py +9 -12
  37. dstack/api/_public/repos.py +0 -21
  38. dstack/api/_public/runs.py +64 -9
  39. dstack/api/server/_users.py +17 -2
  40. dstack/version.py +2 -2
  41. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/METADATA +2 -2
  42. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/RECORD +45 -44
  43. dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
  44. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/WHEEL +0 -0
  45. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/entry_points.txt +0 -0
  46. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/licenses/LICENSE.md +0 -0
@@ -52,7 +52,7 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
-    "4x MI300X 52x Xeon Platinum 8462Y": {
+    "4x MI300X 52x Xeon Platinum 8470": {
         "cpu_model": "Xeon Platinum 8470",
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
@@ -62,6 +62,16 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
+    "8x MI300X 104x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
 
 
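Note: this hunk extends a static spec table (by the +11/-1 line counts it appears to be dstack/_internal/core/backends/hotaisle/compute.py), which maps an instance type name to CPU details the provider does not otherwise report. A minimal sketch of how such a table is typically consumed; the helper name is illustrative and not part of the diff:

    # Illustrative helper, not from the diff: look up static CPU specs for an instance type.
    def cpu_specs_for(instance_type: str, specs: dict[str, dict]) -> dict:
        try:
            return specs[instance_type]
        except KeyError:
            raise ValueError(f"no CPU specs known for instance type {instance_type!r}")

    # cpu_specs_for("8x MI300X 104x Xeon Platinum 8470", INSTANCE_TYPE_SPECS)
    # -> {"cpu_model": "Xeon Platinum 8470", "cpu_frequency": 2000000000, "cpu_manufacturer": "Intel"}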
@@ -5,7 +5,7 @@ import time
 from enum import Enum
 from typing import List, Optional, Tuple
 
-from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
+from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
@@ -59,19 +59,31 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 JUMP_POD_SSH_PORT = 22
-
-NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
-NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+DUMMY_REGION = "-"
 
 NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
-NVIDIA_GPU_COUNT_LABEL = f"{NVIDIA_GPU_RESOURCE}.count"
-NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
 NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+
+AMD_GPU_RESOURCE = "amd.com/gpu"
+AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE
+# The oldest but still supported label format, the safest option, see the commit message:
+# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48
+# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs
+# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue:
+# https://github.com/ROCm/k8s-device-plugin/issues/112
+AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id."
 
 # Taints we know and tolerate when creating our objects, e.g., the jump pod.
-TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT,)
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT)
 
-DUMMY_REGION = "-"
+NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
+NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+
+AMD_GPU_DEVICE_ID_TO_GPU_INFO = {
+    device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids
+}
+AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS}
 
 
 class Operator(str, Enum):
@@ -112,21 +124,15 @@ class KubernetesCompute(
         nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
         for node in nodes:
             try:
-                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
                 name = get_value(node, ".metadata.name", str, required=True)
-                cpus = _parse_cpu(
-                    get_value(node, ".status.allocatable['cpu']", str, required=True)
-                )
                 cpu_arch = normalize_arch(
                     get_value(node, ".status.node_info.architecture", str)
                 ).to_cpu_architecture()
-                memory_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['memory']", str, required=True)
-                )
-                gpus, _ = _get_gpus_from_node_labels(labels)
-                disk_size_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
-                )
+                allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+                cpus = _parse_cpu(allocatable["cpu"])
+                memory_mib = _parse_memory(allocatable["memory"])
+                disk_size_mib = _parse_memory(allocatable["ephemeral-storage"])
+                gpus = _get_node_gpus(node)
             except (AttributeError, KeyError, ValueError) as e:
                 logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
                 continue
@@ -161,6 +167,7 @@ class KubernetesCompute(
         volumes: List[Volume],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name_for_job(run, job)
+        assert run.run_spec.ssh_key_pub is not None
        commands = get_docker_commands(
            [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
        )
@@ -217,59 +224,18 @@ class KubernetesCompute(
                    "GPU is requested but the offer has no GPUs:"
                    f" {gpu_spec=} {instance_offer=}",
                )
-            offer_gpu = offer_gpus[0]
-            matching_gpu_label_values: set[str] = set()
-            # We cannot generate an expected GPU label value from the Gpu model instance
-            # as the actual values may have additional components (socket, memory type, etc.)
-            # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
-            # Moreover, a single Gpu may match multiple label values.
-            # As a workaround, we iterate and process all node labels once again (we already
-            # processed them in `get_offers_by_requirements()`).
-            node_list = call_api_method(
-                self.api.list_node,
-                client.V1NodeList,
-            )
-            nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
-            for node in nodes:
-                labels = get_value(node, ".metadata.labels", dict[str, str])
-                if not labels:
-                    continue
-                gpus, gpu_label_value = _get_gpus_from_node_labels(labels)
-                if not gpus or gpu_label_value is None:
-                    continue
-                if gpus[0] == offer_gpu:
-                    matching_gpu_label_values.add(gpu_label_value)
-            if not matching_gpu_label_values:
-                raise ComputeError(
-                    f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
-                )
-            logger.debug(
-                "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
-            )
-            # TODO: support other GPU vendors
-            resources_requests[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            resources_limits[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            node_affinity = client.V1NodeAffinity(
-                required_during_scheduling_ignored_during_execution=[
-                    client.V1NodeSelectorTerm(
-                        match_expressions=[
-                            client.V1NodeSelectorRequirement(
-                                key=NVIDIA_GPU_PRODUCT_LABEL,
-                                operator=Operator.IN,
-                                values=list(matching_gpu_label_values),
-                            ),
-                        ],
-                    ),
-                ],
+            gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu(
+                self.api, offer_gpus[0]
            )
+            logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min)
+            resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min)
            # It should be NoSchedule, but we also add NoExecute toleration just in case.
            for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
                tolerations.append(
                    client.V1Toleration(
-                        key=NVIDIA_GPU_NODE_TAINT, operator=Operator.EXISTS, effect=effect
+                        key=node_taint, operator=Operator.EXISTS, effect=effect
                    )
                )
-
            if (memory_min := resources_spec.memory.min) is not None:
                resources_requests["memory"] = _render_memory(memory_min)
            if (
@@ -331,7 +297,9 @@ class KubernetesCompute(
                        volume_mounts=volume_mounts,
                    )
                ],
-                affinity=node_affinity,
+                affinity=client.V1Affinity(
+                    node_affinity=node_affinity,
+                ),
                tolerations=tolerations,
                volumes=volumes_,
            ),
@@ -550,34 +518,144 @@ def _render_memory(memory: Memory) -> str:
     return f"{float(memory)}Gi"
 
 
-def _get_gpus_from_node_labels(labels: dict[str, str]) -> tuple[list[Gpu], Optional[str]]:
+def _get_node_gpus(node: client.V1Node) -> list[Gpu]:
+    node_name = get_value(node, ".metadata.name", str, required=True)
+    allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+    labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+    for gpu_resource, gpu_getter in (
+        (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels),
+        (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels),
+    ):
+        _gpu_count = allocatable.get(gpu_resource)
+        if not _gpu_count:
+            continue
+        gpu_count = int(_gpu_count)
+        if gpu_count < 1:
+            continue
+        gpu = gpu_getter(labels)
+        if gpu is None:
+            logger.warning(
+                "Node %s: GPU resource found, but failed to detect its model: %s=%d",
+                node_name,
+                gpu_resource,
+                gpu_count,
+            )
+            return []
+        return [gpu] * gpu_count
+    logger.debug("Node %s: no GPU resource found", node_name)
+    return []
+
+
+def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
     # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
     # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
-    # TODO: support other GPU vendors
-    gpu_count = labels.get(NVIDIA_GPU_COUNT_LABEL)
     gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
-    if gpu_count is None or gpu_product is None:
-        return [], None
-    gpu_count = int(gpu_count)
-    gpu_name = None
-    for known_gpu_name in NVIDIA_GPU_NAMES:
-        if known_gpu_name.lower() in gpu_product.lower().split("-"):
-            gpu_name = known_gpu_name
+    if gpu_product is None:
+        return None
+    for gpu_name in NVIDIA_GPU_NAMES:
+        if gpu_name.lower() in gpu_product.lower().split("-"):
            break
-    if gpu_name is None:
-        return [], None
+    else:
+        return None
    gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
    gpu_memory = gpu_info.memory * 1024
    # A100 may come in two variants
    if "40GB" in gpu_product:
        gpu_memory = 40 * 1024
-    gpus = [
-        Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
-        for _ in range(gpu_count)
-    ]
-    return gpus, gpu_product
+    return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
+
+
+def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
+    # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs
+    gpus: set[tuple[str, int]] = set()
+    for label in labels:
+        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
+            continue
+        _, _, _device_id = label.rpartition(".")
+        device_id = int(_device_id, 16)
+        gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)
+        if gpu_info is None:
+            logger.warning("Unknown AMD GPU device id: %X", device_id)
+            continue
+        gpus.add((gpu_info.name, gpu_info.memory))
+    if not gpus:
+        return None
+    if len(gpus) == 1:
+        gpu_name, gpu_memory_gib = next(iter(gpus))
+        return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024)
+    logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus)
+    return None
+
+
+def _get_pod_spec_parameters_for_gpu(
+    api: client.CoreV1Api, gpu: Gpu
+) -> tuple[str, client.V1NodeAffinity, str]:
+    gpu_vendor = gpu.vendor
+    assert gpu_vendor is not None
+    if gpu_vendor == AcceleratorVendor.NVIDIA:
+        node_affinity = _get_nvidia_gpu_node_affinity(api, gpu)
+        return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT
+    if gpu_vendor == AcceleratorVendor.AMD:
+        node_affinity = _get_amd_gpu_node_affinity(gpu)
+        return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT
+    raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}")
+
+
+def _get_nvidia_gpu_node_affinity(api: client.CoreV1Api, gpu: Gpu) -> client.V1NodeAffinity:
+    matching_gpu_label_values: set[str] = set()
+    # We cannot generate an expected GPU label value from the Gpu model instance
+    # as the actual values may have additional components (socket, memory type, etc.)
+    # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+    # Moreover, a single Gpu may match multiple label values.
+    # As a workaround, we iterate and process all node labels once again (we already
+    # processed them in `get_offers_by_requirements()`).
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    for node in nodes:
+        labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+        if _get_nvidia_gpu_from_node_labels(labels) == gpu:
+            matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL])
+    if not matching_gpu_label_values:
+        raise ComputeError(f"NVIDIA GPU is requested but no matching GPU labels found: {gpu=}")
+    logger.debug("Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu.name)
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=NVIDIA_GPU_PRODUCT_LABEL,
+                            operator=Operator.IN,
+                            values=list(matching_gpu_label_values),
+                        ),
+                    ],
+                ),
+            ],
+        ),
+    )
+
+
+def _get_amd_gpu_node_affinity(gpu: Gpu) -> client.V1NodeAffinity:
+    device_ids = AMD_GPU_NAME_TO_DEVICE_IDS.get(gpu.name)
+    if device_ids is None:
+        raise ComputeError(f"AMD GPU is requested but no matching device ids found: {gpu=}")
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}",
+                            operator=Operator.EXISTS,
+                        ),
+                    ],
+                )
+                for device_id in device_ids
+            ],
+        ),
+    )
 
 
 def _continue_setup_jump_pod(
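The Kubernetes backend now detects AMD GPUs from the ROCm device plugin's beta.amd.com/gpu.device-id.<id> node labels rather than a product-name label. A self-contained sketch of that detection logic, runnable outside the backend; the tiny _AMDGPUInfo stand-in replaces gpuhunt's KNOWN_AMD_GPUS data, and the 0x74b5 device id follows the MI300X VF example from the code comment above:

    from dataclasses import dataclass
    from typing import Optional

    AMD_GPU_DEVICE_ID_LABEL_PREFIX = "beta.amd.com/gpu.device-id."

    @dataclass(frozen=True)
    class _AMDGPUInfo:  # stand-in for gpuhunt's AMD GPU info entries
        name: str
        memory: int  # GiB
        device_ids: tuple[int, ...]

    _KNOWN_AMD_GPUS = [_AMDGPUInfo(name="MI300X", memory=192, device_ids=(0x74A1, 0x74B5))]
    _DEVICE_ID_TO_GPU = {d: g for g in _KNOWN_AMD_GPUS for d in g.device_ids}

    def detect_amd_gpu(labels: dict[str, str]) -> Optional[tuple[str, int]]:
        """Return (gpu_name, memory_mib) if exactly one known AMD model is labeled on the node."""
        found = set()
        for label in labels:
            if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
                continue
            device_id = int(label.rpartition(".")[2], 16)
            gpu = _DEVICE_ID_TO_GPU.get(device_id)
            if gpu is not None:
                found.add((gpu.name, gpu.memory))
        if len(found) != 1:
            return None  # unknown device ids or mixed models: treat as no usable GPU
        name, memory_gib = next(iter(found))
        return name, memory_gib * 1024

    # A node with four MI300X VF GPUs, per the label format referenced above:
    print(detect_amd_gpu({"beta.amd.com/gpu.device-id.74b5": "4"}))  # ('MI300X', 196608)

The GPU count itself comes from the node's allocatable amd.com/gpu resource rather than the label value, so the sketch only resolves the model.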
@@ -37,7 +37,7 @@ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
 
 
 class KubeconfigFileConfig(CoreModel):
-    filename: Annotated[str, Field(description="The path to the kubeconfig file")]
+    filename: Annotated[str, Field(description="The path to the kubeconfig file")] = ""
     data: Annotated[
         Optional[str],
         Field(
@@ -50,7 +50,9 @@ class KubeconfigFileConfig(CoreModel):
     ] = None
 
     @root_validator
-    def fill_data(cls, values):
+    def fill_data(cls, values: dict) -> dict:
+        if values.get("filename") == "" and values.get("data") is None:
+            raise ValueError("filename or data must be specified")
         return fill_data(values)
 
 
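With the new default and validator, KubeconfigFileConfig accepts inline kubeconfig contents without a filename and rejects a config that provides neither. A quick sketch of the resulting behavior, assuming the model keeps standard pydantic validation semantics (the import path matches file 13 in the list above):

    import pydantic

    from dstack._internal.core.backends.kubernetes.models import KubeconfigFileConfig

    # Inline contents only - filename now defaults to "":
    KubeconfigFileConfig(data="apiVersion: v1\nkind: Config\n")

    # Neither filename nor data - the new root validator should reject this:
    try:
        KubeconfigFileConfig()
    except pydantic.ValidationError as e:
        print(e)  # reports "filename or data must be specified"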
@@ -2,8 +2,9 @@ import json
 import random
 import shlex
 import time
+from collections.abc import Iterable
 from functools import cached_property
-from typing import Callable, List, Optional
+from typing import List, Optional
 
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -21,7 +22,11 @@ from dstack._internal.core.backends.base.compute import (
     get_user_data,
     merge_tags,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -125,10 +130,8 @@
             for offer in offers
         ]
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def create_instance(
         self,
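The get_offers_modifiers() change in the Nebius hunks above is repeated for OCI and RunPod below: the single get_offers_modifier() hook becomes a method returning an iterable, so a backend can stack several offer post-processing steps. The OfferModifier signature can be read off the removed return annotation; how base/offers.py applies the modifiers is not shown in this diff, so the apply step below is only an assumed sketch:

    from collections.abc import Iterable
    from typing import Callable, Optional

    # Per the removed annotation: a modifier tweaks an offer (e.g. clamps disk size)
    # or drops it by returning None.
    OfferModifier = Callable[
        ["InstanceOfferWithAvailability"], Optional["InstanceOfferWithAvailability"]
    ]

    def apply_modifiers(
        offers: Iterable["InstanceOfferWithAvailability"],
        modifiers: Iterable[OfferModifier],
    ) -> list["InstanceOfferWithAvailability"]:
        result = []
        for offer in offers:
            for modifier in modifiers:
                offer = modifier(offer)
                if offer is None:
                    break
            if offer is not None:
                result.append(offer)
        return result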
@@ -1,6 +1,7 @@
+from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property
-from typing import Callable, List, Optional
+from typing import List, Optional
 
 import oci
 
@@ -13,7 +14,11 @@ from dstack._internal.core.backends.base.compute import (
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.models import OCIConfig
 from dstack._internal.core.backends.oci.region import make_region_clients_map
@@ -96,10 +101,8 @@
 
         return offers_with_availability
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
@@ -1,7 +1,8 @@
 import json
 import uuid
+from collections.abc import Iterable
 from datetime import timedelta
-from typing import Callable, List, Optional
+from typing import List, Optional
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
@@ -12,7 +13,11 @@ from dstack._internal.core.backends.base.compute import (
     get_docker_commands,
     get_job_instance_name,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -72,10 +77,8 @@
         ]
         return offers
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def run_job(
         self,
@@ -86,6 +89,7 @@
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        assert run.run_spec.ssh_key_pub is not None
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
             instance_name=get_job_instance_name(run, job),
@@ -47,7 +47,7 @@
                 "reliability2": {"gte": 0.9},
                 "inet_down": {"gt": 128},
                 "verified": {"eq": True},
-                "cuda_max_good": {"gte": 12.1},
+                "cuda_max_good": {"gte": 12.8},
                 "compute_cap": {"gte": 600},
             }
         )
@@ -58,6 +58,7 @@
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,
+            locations=self.config.regions or None,
             requirements=requirements,
             # TODO(egor-s): spots currently not supported
             extra_filter=lambda offer: not offer.instance.resources.spot,
@@ -85,6 +86,7 @@
         instance_name = generate_unique_instance_name_for_job(
             run, job, max_length=MAX_INSTANCE_NAME_LEN
         )
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
@@ -18,7 +18,6 @@ from dstack._internal.core.models.backends.base import (
     BackendType,
 )
 
-# VastAI regions are dynamic, currently we don't offer any filtering
 REGIONS = []
 
 
@@ -244,7 +244,7 @@ class InstanceGroupParams(CoreModel):
         Field(
             description=(
                 "The existing reservation to use for instance provisioning."
-                " Supports AWS Capacity Reservations and Capacity Blocks"
+                " Supports AWS Capacity Reservations, AWS Capacity Blocks, and GCP reservations"
             )
         ),
     ] = None
@@ -283,7 +283,7 @@ class ProfileParams(CoreModel):
         Field(
             description=(
                 "The existing reservation to use for instance provisioning."
-                " Supports AWS Capacity Reservations and Capacity Blocks"
+                " Supports AWS Capacity Reservations, AWS Capacity Blocks, and GCP reservations"
             )
         ),
     ] = None
@@ -462,11 +462,12 @@ class RunSpec(generate_dual_core_model(RunSpecConfig)):
     configuration: Annotated[AnyRunConfiguration, Field(discriminator="type")]
     profile: Annotated[Optional[Profile], Field(description="The profile parameters")] = None
     ssh_key_pub: Annotated[
-        str,
+        Optional[str],
         Field(
             description="The contents of the SSH public key that will be used to connect to the run."
+            " Can be empty only before the run is submitted."
         ),
-    ]
+    ] = None
     # merged_profile stores profile parameters merged from profile and configuration.
     # Read profile parameters from merged_profile instead of profile directly.
     # TODO: make merged_profile a computed field after migrating to pydanticV2
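Making ssh_key_pub Optional with a None default means consumers of a RunSpec can no longer assume the key is present before the run is submitted, which is why the backend and background-task hunks in this diff add assert run.run_spec.ssh_key_pub is not None guards right before using it. A hypothetical call site following the same pattern:

    def public_keys_for_job(run, project_ssh_public_key: str) -> list[str]:
        # The run must already be submitted here; fail loudly instead of passing None along.
        assert run.run_spec.ssh_key_pub is not None
        return [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]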
@@ -30,6 +30,7 @@ class User(CoreModel):
     email: Optional[str]
     active: bool
     permissions: UserPermissions
+    ssh_public_key: Optional[str] = None
 
 
 class UserTokenCreds(CoreModel):
@@ -38,3 +39,12 @@ class UserTokenCreds(CoreModel):
 
 class UserWithCreds(User):
     creds: UserTokenCreds
+    ssh_private_key: Optional[str] = None
+
+
+class UserHookConfig(CoreModel):
+    """
+    This class can be inherited to extend the user creation configuration passed to the hooks.
+    """
+
+    pass
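The new ssh_public_key and ssh_private_key fields line up with the users table columns added by the migration further down. UserHookConfig is introduced as an extension point: per its docstring, server deployments can subclass it to extend the user-creation configuration passed to hooks. A hypothetical subclass; the extra fields are illustrative and not part of dstack:

    from typing import Optional

    from dstack._internal.core.models.users import UserHookConfig

    class MyUserHookConfig(UserHookConfig):
        # Deployment-specific extras a user-creation hook might want to receive:
        default_team: Optional[str] = None
        send_welcome_email: bool = False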
@@ -117,6 +117,7 @@ class ConfigManager:
 
     @property
     def dstack_key_path(self) -> Path:
+        # TODO: Remove since 0.19.40
         return self.dstack_ssh_dir / "id_rsa"
 
     @property
@@ -558,10 +558,14 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
     if (
         _is_fleet_master_instance(instance)
         and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
+        and isinstance(compute, ComputeWithPlacementGroupSupport)
+        and (
+            compute.are_placement_groups_compatible_with_reservations(instance_offer.backend)
+            or instance_configuration.reservation is None
+        )
         and instance.fleet
         and _is_cloud_cluster(instance.fleet)
     ):
-        assert isinstance(compute, ComputeWithPlacementGroupSupport)
         placement_group_model = _find_suitable_placement_group(
             placement_groups=placement_group_models,
             instance_offer=instance_offer,
@@ -243,6 +243,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         job_submission.age,
     )
     ssh_user = job_provisioning_data.username
+    assert run.run_spec.ssh_key_pub is not None
     user_ssh_key = run.run_spec.ssh_key_pub.strip()
     public_keys = [project.ssh_public_key.strip(), user_ssh_key]
     if job_provisioning_data.backend == BackendType.LOCAL:
@@ -0,0 +1,34 @@
+"""user.ssh_key
+
+Revision ID: ff1d94f65b08
+Revises: 2498ab323443
+Create Date: 2025-10-09 20:31:31.166786
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "ff1d94f65b08"
+down_revision = "2498ab323443"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("users", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("ssh_private_key", sa.Text(), nullable=True))
+        batch_op.add_column(sa.Column("ssh_public_key", sa.Text(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("users", schema=None) as batch_op:
+        batch_op.drop_column("ssh_public_key")
+        batch_op.drop_column("ssh_private_key")
+
+    # ### end Alembic commands ###
@@ -190,6 +190,9 @@ class UserModel(BaseModel):
     # deactivated users cannot access API
     active: Mapped[bool] = mapped_column(Boolean, default=True)
 
+    ssh_private_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
+    ssh_public_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
+
     email: Mapped[Optional[str]] = mapped_column(String(200), nullable=True)
 
     projects_quota: Mapped[int] = mapped_column(