dstack 0.19.32__py3-none-any.whl → 0.19.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack has been flagged as possibly problematic by the registry diff service.
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +1 -5
- dstack/_internal/core/backends/aws/compute.py +8 -5
- dstack/_internal/core/backends/azure/compute.py +9 -6
- dstack/_internal/core/backends/base/compute.py +40 -17
- dstack/_internal/core/backends/base/offers.py +5 -1
- dstack/_internal/core/backends/datacrunch/compute.py +9 -6
- dstack/_internal/core/backends/gcp/compute.py +137 -7
- dstack/_internal/core/backends/gcp/models.py +7 -0
- dstack/_internal/core/backends/gcp/resources.py +87 -5
- dstack/_internal/core/backends/hotaisle/compute.py +11 -1
- dstack/_internal/core/backends/kubernetes/compute.py +161 -83
- dstack/_internal/core/backends/kubernetes/models.py +4 -2
- dstack/_internal/core/backends/nebius/compute.py +9 -6
- dstack/_internal/core/backends/oci/compute.py +9 -6
- dstack/_internal/core/backends/runpod/compute.py +10 -6
- dstack/_internal/core/backends/vastai/compute.py +3 -1
- dstack/_internal/core/backends/vastai/configurator.py +0 -1
- dstack/_internal/core/models/fleets.py +1 -1
- dstack/_internal/core/models/profiles.py +1 -1
- dstack/_internal/core/models/runs.py +3 -2
- dstack/_internal/core/models/users.py +10 -0
- dstack/_internal/core/services/configs/__init__.py +1 -0
- dstack/_internal/server/background/tasks/process_instances.py +5 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/models.py +3 -0
- dstack/_internal/server/routers/runs.py +5 -1
- dstack/_internal/server/routers/users.py +14 -2
- dstack/_internal/server/services/runs.py +9 -4
- dstack/_internal/server/services/users.py +35 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-97c7e184573ca23f9fe4.js} +12218 -7625
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-97c7e184573ca23f9fe4.js.map} +1 -1
- dstack/api/_public/__init__.py +9 -12
- dstack/api/_public/repos.py +0 -21
- dstack/api/_public/runs.py +64 -9
- dstack/api/server/_users.py +17 -2
- dstack/version.py +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/METADATA +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/RECORD +45 -44
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/WHEEL +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/hotaisle/compute.py

@@ -52,7 +52,7 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
-    "4x MI300X 52x Xeon Platinum
+    "4x MI300X 52x Xeon Platinum 8470": {
         "cpu_model": "Xeon Platinum 8470",
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
@@ -62,6 +62,16 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
+    "8x MI300X 104x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
dstack/_internal/core/backends/kubernetes/compute.py

@@ -5,7 +5,7 @@ import time
 from enum import Enum
 from typing import List, Optional, Tuple

-from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
+from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client

 from dstack._internal.core.backends.base.compute import (
@@ -59,19 +59,31 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)

 JUMP_POD_SSH_PORT = 22
-
-NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
-NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+DUMMY_REGION = "-"

 NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
-NVIDIA_GPU_COUNT_LABEL = f"{NVIDIA_GPU_RESOURCE}.count"
-NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
 NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+
+AMD_GPU_RESOURCE = "amd.com/gpu"
+AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE
+# The oldest but still supported label format, the safest option, see the commit message:
+# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48
+# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs
+# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue:
+# https://github.com/ROCm/k8s-device-plugin/issues/112
+AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id."

 # Taints we know and tolerate when creating our objects, e.g., the jump pod.
-TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT,)
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT)

-
+NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
+NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+
+AMD_GPU_DEVICE_ID_TO_GPU_INFO = {
+    device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids
+}
+AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS}


 class Operator(str, Enum):
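The device-id labels described in the comments above can be resolved to a GPU model by stripping the label prefix and looking the hex id up in a device-id table. A minimal, self-contained sketch of that lookup (the one-entry table below is illustrative; in the diff the real table, AMD_GPU_DEVICE_ID_TO_GPU_INFO, is built from gpuhunt's KNOWN_AMD_GPUS):

# Illustrative sketch only: a hand-written device-id table standing in for the
# AMD_GPU_DEVICE_ID_TO_GPU_INFO mapping that the diff builds from gpuhunt.
from typing import Optional

AMD_GPU_DEVICE_ID_LABEL_PREFIX = "beta.amd.com/gpu.device-id."
DEVICE_ID_TO_NAME = {0x74B5: "MI300X"}  # 0x74b5 is the MI300X VF id mentioned in the comment above


def amd_gpu_from_labels(labels: dict[str, str]) -> Optional[str]:
    """Return the GPU model name for the first recognized device-id label, if any."""
    for key in labels:
        if not key.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
            continue
        device_id = int(key.rpartition(".")[-1], 16)  # hex suffix of the label key
        name = DEVICE_ID_TO_NAME.get(device_id)
        if name is not None:
            return name
    return None


# A node labeled beta.amd.com/gpu.device-id.74b5=4 resolves to "MI300X";
# per the comment above, the label value ("4") is the GPU count in the plugin's scheme.
print(amd_gpu_from_labels({"beta.amd.com/gpu.device-id.74b5": "4"}))

In the diff itself, the GPU count is taken from the node's allocatable amd.com/gpu resource rather than from the label value, as shown in _get_node_gpus further below.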
@@ -112,21 +124,15 @@ class KubernetesCompute(
         nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
         for node in nodes:
             try:
-                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
                 name = get_value(node, ".metadata.name", str, required=True)
-                cpus = _parse_cpu(
-                    get_value(node, ".status.allocatable['cpu']", str, required=True)
-                )
                 cpu_arch = normalize_arch(
                     get_value(node, ".status.node_info.architecture", str)
                 ).to_cpu_architecture()
-
-
-                )
-
-
-                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
-                )
+                allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+                cpus = _parse_cpu(allocatable["cpu"])
+                memory_mib = _parse_memory(allocatable["memory"])
+                disk_size_mib = _parse_memory(allocatable["ephemeral-storage"])
+                gpus = _get_node_gpus(node)
             except (AttributeError, KeyError, ValueError) as e:
                 logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
                 continue
@@ -161,6 +167,7 @@ class KubernetesCompute(
         volumes: List[Volume],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name_for_job(run, job)
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
@@ -217,59 +224,18 @@ class KubernetesCompute(
                     "GPU is requested but the offer has no GPUs:"
                     f" {gpu_spec=} {instance_offer=}",
                 )
-
-
-            # We cannot generate an expected GPU label value from the Gpu model instance
-            # as the actual values may have additional components (socket, memory type, etc.)
-            # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
-            # Moreover, a single Gpu may match multiple label values.
-            # As a workaround, we iterate and process all node labels once again (we already
-            # processed them in `get_offers_by_requirements()`).
-            node_list = call_api_method(
-                self.api.list_node,
-                client.V1NodeList,
-            )
-            nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
-            for node in nodes:
-                labels = get_value(node, ".metadata.labels", dict[str, str])
-                if not labels:
-                    continue
-                gpus, gpu_label_value = _get_gpus_from_node_labels(labels)
-                if not gpus or gpu_label_value is None:
-                    continue
-                if gpus[0] == offer_gpu:
-                    matching_gpu_label_values.add(gpu_label_value)
-            if not matching_gpu_label_values:
-                raise ComputeError(
-                    f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
-                )
-            logger.debug(
-                "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
-            )
-            # TODO: support other GPU vendors
-            resources_requests[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            resources_limits[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            node_affinity = client.V1NodeAffinity(
-                required_during_scheduling_ignored_during_execution=[
-                    client.V1NodeSelectorTerm(
-                        match_expressions=[
-                            client.V1NodeSelectorRequirement(
-                                key=NVIDIA_GPU_PRODUCT_LABEL,
-                                operator=Operator.IN,
-                                values=list(matching_gpu_label_values),
-                            ),
-                        ],
-                    ),
-                ],
+            gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu(
+                self.api, offer_gpus[0]
             )
+            logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min)
+            resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min)
             # It should be NoSchedule, but we also add NoExecute toleration just in case.
             for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
                 tolerations.append(
                     client.V1Toleration(
-                        key=
+                        key=node_taint, operator=Operator.EXISTS, effect=effect
                     )
                 )
-
         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
         if (
@@ -331,7 +297,9 @@ class KubernetesCompute(
                         volume_mounts=volume_mounts,
                     )
                 ],
-                affinity=
+                affinity=client.V1Affinity(
+                    node_affinity=node_affinity,
+                ),
                 tolerations=tolerations,
                 volumes=volumes_,
             ),
@@ -550,34 +518,144 @@ def _render_memory(memory: Memory) -> str:
     return f"{float(memory)}Gi"


-def
+def _get_node_gpus(node: client.V1Node) -> list[Gpu]:
+    node_name = get_value(node, ".metadata.name", str, required=True)
+    allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+    labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+    for gpu_resource, gpu_getter in (
+        (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels),
+        (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels),
+    ):
+        _gpu_count = allocatable.get(gpu_resource)
+        if not _gpu_count:
+            continue
+        gpu_count = int(_gpu_count)
+        if gpu_count < 1:
+            continue
+        gpu = gpu_getter(labels)
+        if gpu is None:
+            logger.warning(
+                "Node %s: GPU resource found, but failed to detect its model: %s=%d",
+                node_name,
+                gpu_resource,
+                gpu_count,
+            )
+            return []
+        return [gpu] * gpu_count
+    logger.debug("Node %s: no GPU resource found", node_name)
+    return []
+
+
+def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
     # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
     # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
-    # TODO: support other GPU vendors
-    gpu_count = labels.get(NVIDIA_GPU_COUNT_LABEL)
     gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
-    if
-    return
-
-
-    for known_gpu_name in NVIDIA_GPU_NAMES:
-        if known_gpu_name.lower() in gpu_product.lower().split("-"):
-            gpu_name = known_gpu_name
+    if gpu_product is None:
+        return None
+    for gpu_name in NVIDIA_GPU_NAMES:
+        if gpu_name.lower() in gpu_product.lower().split("-"):
             break
-
-    return
+    else:
+        return None
     gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
     gpu_memory = gpu_info.memory * 1024
     # A100 may come in two variants
     if "40GB" in gpu_product:
         gpu_memory = 40 * 1024
-
-
-
-
-
+    return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
+
+
+def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
+    # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs
+    gpus: set[tuple[str, int]] = set()
+    for label in labels:
+        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
+            continue
+        _, _, _device_id = label.rpartition(".")
+        device_id = int(_device_id, 16)
+        gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)
+        if gpu_info is None:
+            logger.warning("Unknown AMD GPU device id: %X", device_id)
+            continue
+        gpus.add((gpu_info.name, gpu_info.memory))
+    if not gpus:
+        return None
+    if len(gpus) == 1:
+        gpu_name, gpu_memory_gib = next(iter(gpus))
+        return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024)
+    logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus)
+    return None
+
+
+def _get_pod_spec_parameters_for_gpu(
+    api: client.CoreV1Api, gpu: Gpu
+) -> tuple[str, client.V1NodeAffinity, str]:
+    gpu_vendor = gpu.vendor
+    assert gpu_vendor is not None
+    if gpu_vendor == AcceleratorVendor.NVIDIA:
+        node_affinity = _get_nvidia_gpu_node_affinity(api, gpu)
+        return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT
+    if gpu_vendor == AcceleratorVendor.AMD:
+        node_affinity = _get_amd_gpu_node_affinity(gpu)
+        return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT
+    raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}")
+
+
+def _get_nvidia_gpu_node_affinity(api: client.CoreV1Api, gpu: Gpu) -> client.V1NodeAffinity:
+    matching_gpu_label_values: set[str] = set()
+    # We cannot generate an expected GPU label value from the Gpu model instance
+    # as the actual values may have additional components (socket, memory type, etc.)
+    # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+    # Moreover, a single Gpu may match multiple label values.
+    # As a workaround, we iterate and process all node labels once again (we already
+    # processed them in `get_offers_by_requirements()`).
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    for node in nodes:
+        labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+        if _get_nvidia_gpu_from_node_labels(labels) == gpu:
+            matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL])
+    if not matching_gpu_label_values:
+        raise ComputeError(f"NVIDIA GPU is requested but no matching GPU labels found: {gpu=}")
+    logger.debug("Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu.name)
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=NVIDIA_GPU_PRODUCT_LABEL,
+                            operator=Operator.IN,
+                            values=list(matching_gpu_label_values),
+                        ),
+                    ],
+                ),
+            ],
+        ),
+    )
+
+
+def _get_amd_gpu_node_affinity(gpu: Gpu) -> client.V1NodeAffinity:
+    device_ids = AMD_GPU_NAME_TO_DEVICE_IDS.get(gpu.name)
+    if device_ids is None:
+        raise ComputeError(f"AMD GPU is requested but no matching device ids found: {gpu=}")
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}",
+                            operator=Operator.EXISTS,
+                        ),
+                    ],
+                )
+                for device_id in device_ids
+            ],
+        ),
+    )


 def _continue_setup_jump_pod(
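For context on the two affinity helpers above: Kubernetes ORs the nodeSelectorTerms of a required node affinity and ANDs the matchExpressions within a single term, so emitting one term per device id with an Exists operator (as _get_amd_gpu_node_affinity does) selects any node that carries at least one of those device-id labels. A rough sketch of the serialized structure for a hypothetical GPU that maps to two device ids (the ids below are illustrative, not values taken from gpuhunt):

# Approximate JSON shape of the V1NodeAffinity that _get_amd_gpu_node_affinity would
# produce for a hypothetical GPU with device ids 0x1234 and 0x74b5 (illustrative values).
node_affinity = {
    "requiredDuringSchedulingIgnoredDuringExecution": {
        "nodeSelectorTerms": [  # terms are ORed by the scheduler
            {
                "matchExpressions": [  # expressions within one term are ANDed
                    {"key": "beta.amd.com/gpu.device-id.1234", "operator": "Exists"},
                ],
            },
            {
                "matchExpressions": [
                    {"key": "beta.amd.com/gpu.device-id.74b5", "operator": "Exists"},
                ],
            },
        ],
    },
}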
dstack/_internal/core/backends/kubernetes/models.py

@@ -37,7 +37,7 @@ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):


 class KubeconfigFileConfig(CoreModel):
-    filename: Annotated[str, Field(description="The path to the kubeconfig file")]
+    filename: Annotated[str, Field(description="The path to the kubeconfig file")] = ""
     data: Annotated[
         Optional[str],
         Field(
@@ -50,7 +50,9 @@ class KubeconfigFileConfig(CoreModel):
     ] = None

     @root_validator
-    def fill_data(cls, values):
+    def fill_data(cls, values: dict) -> dict:
+        if values.get("filename") == "" and values.get("data") is None:
+            raise ValueError("filename or data must be specified")
         return fill_data(values)

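With filename now defaulting to an empty string, the root validator is what enforces that at least one of filename or data is provided. A minimal standalone sketch of the same check using pydantic v1's root_validator (the class and field names below are illustrative, not the actual dstack models, and dstack's fill_data() helper is not reproduced):

# Standalone illustration using pydantic v1; not the dstack KubeconfigFileConfig itself.
from typing import Optional

from pydantic import BaseModel, ValidationError, root_validator


class FileOrInlineConfig(BaseModel):
    filename: str = ""
    data: Optional[str] = None

    @root_validator
    def check_one_of(cls, values: dict) -> dict:
        # Same rule as the diff: reject configs where neither field is set.
        if values.get("filename") == "" and values.get("data") is None:
            raise ValueError("filename or data must be specified")
        return values


FileOrInlineConfig(filename="~/.kube/config")  # ok
FileOrInlineConfig(data="apiVersion: v1\n...")  # ok
try:
    FileOrInlineConfig()  # neither field set
except ValidationError as e:
    print(e)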
dstack/_internal/core/backends/nebius/compute.py

@@ -2,8 +2,9 @@ import json
 import random
 import shlex
 import time
+from collections.abc import Iterable
 from functools import cached_property
-from typing import
+from typing import List, Optional

 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -21,7 +22,11 @@ from dstack._internal.core.backends.base.compute import (
     get_user_data,
     merge_tags,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -125,10 +130,8 @@ class NebiusCompute(
             for offer in offers
         ]

-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]

     def create_instance(
         self,
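get_offers_modifiers now returns an iterable of modifiers instead of a single callable; judging by the removed return annotation, each OfferModifier maps an offer to a modified offer or to None to drop it. A rough sketch of how a caller might fold such a list over offers (the actual application lives in dstack/_internal/core/backends/base/offers.py, which is not shown in this diff, so apply_modifiers below is a hypothetical helper):

# Sketch only: apply_modifiers is illustrative, not the function dstack uses.
from typing import Callable, Iterable, List, Optional, TypeVar

Offer = TypeVar("Offer")
# Mirrors the removed annotation: an offer in, a (possibly modified) offer or None out.
OfferModifier = Callable[[Offer], Optional[Offer]]


def apply_modifiers(offers: Iterable[Offer], modifiers: Iterable[OfferModifier]) -> List[Offer]:
    result: List[Offer] = []
    for offer in offers:
        modified: Optional[Offer] = offer
        for modify in modifiers:
            modified = modify(modified)
            if modified is None:
                break  # a modifier filtered this offer out
        if modified is not None:
            result.append(modified)
    return result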
dstack/_internal/core/backends/oci/compute.py

@@ -1,6 +1,7 @@
+from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property
-from typing import
+from typing import List, Optional

 import oci

@@ -13,7 +14,11 @@ from dstack._internal.core.backends.base.compute import (
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.models import OCIConfig
 from dstack._internal.core.backends.oci.region import make_region_clients_map
@@ -96,10 +101,8 @@ class OCICompute(

         return offers_with_availability

-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]

     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
dstack/_internal/core/backends/runpod/compute.py

@@ -1,7 +1,8 @@
 import json
 import uuid
+from collections.abc import Iterable
 from datetime import timedelta
-from typing import
+from typing import List, Optional

 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
@@ -12,7 +13,11 @@ from dstack._internal.core.backends.base.compute import (
     get_docker_commands,
     get_job_instance_name,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -72,10 +77,8 @@ class RunpodCompute(
         ]
         return offers

-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]

     def run_job(
         self,
@@ -86,6 +89,7 @@ class RunpodCompute(
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        assert run.run_spec.ssh_key_pub is not None
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
             instance_name=get_job_instance_name(run, job),
dstack/_internal/core/backends/vastai/compute.py

@@ -47,7 +47,7 @@ class VastAICompute(
             "reliability2": {"gte": 0.9},
            "inet_down": {"gt": 128},
            "verified": {"eq": True},
-            "cuda_max_good": {"gte": 12.
+            "cuda_max_good": {"gte": 12.8},
            "compute_cap": {"gte": 600},
         }
     )
@@ -58,6 +58,7 @@ class VastAICompute(
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,
+            locations=self.config.regions or None,
             requirements=requirements,
             # TODO(egor-s): spots currently not supported
             extra_filter=lambda offer: not offer.instance.resources.spot,
@@ -85,6 +86,7 @@ class VastAICompute(
         instance_name = generate_unique_instance_name_for_job(
             run, job, max_length=MAX_INSTANCE_NAME_LEN
         )
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
dstack/_internal/core/models/fleets.py

@@ -244,7 +244,7 @@ class InstanceGroupParams(CoreModel):
         Field(
             description=(
                 "The existing reservation to use for instance provisioning."
-                " Supports AWS Capacity Reservations
+                " Supports AWS Capacity Reservations, AWS Capacity Blocks, and GCP reservations"
             )
         ),
     ] = None

dstack/_internal/core/models/profiles.py

@@ -283,7 +283,7 @@ class ProfileParams(CoreModel):
         Field(
             description=(
                 "The existing reservation to use for instance provisioning."
-                " Supports AWS Capacity Reservations
+                " Supports AWS Capacity Reservations, AWS Capacity Blocks, and GCP reservations"
             )
         ),
     ] = None
dstack/_internal/core/models/runs.py

@@ -462,11 +462,12 @@ class RunSpec(generate_dual_core_model(RunSpecConfig)):
     configuration: Annotated[AnyRunConfiguration, Field(discriminator="type")]
     profile: Annotated[Optional[Profile], Field(description="The profile parameters")] = None
     ssh_key_pub: Annotated[
-        str,
+        Optional[str],
         Field(
             description="The contents of the SSH public key that will be used to connect to the run."
+            " Can be empty only before the run is submitted."
         ),
-    ]
+    ] = None
     # merged_profile stores profile parameters merged from profile and configuration.
     # Read profile parameters from merged_profile instead of profile directly.
     # TODO: make merged_profile a computed field after migrating to pydanticV2
dstack/_internal/core/models/users.py

@@ -30,6 +30,7 @@ class User(CoreModel):
     email: Optional[str]
     active: bool
     permissions: UserPermissions
+    ssh_public_key: Optional[str] = None


 class UserTokenCreds(CoreModel):
@@ -38,3 +39,12 @@ class UserTokenCreds(CoreModel):

 class UserWithCreds(User):
     creds: UserTokenCreds
+    ssh_private_key: Optional[str] = None
+
+
+class UserHookConfig(CoreModel):
+    """
+    This class can be inherited to extend the user creation configuration passed to the hooks.
+    """
+
+    pass
dstack/_internal/server/background/tasks/process_instances.py

@@ -558,10 +558,14 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
     if (
         _is_fleet_master_instance(instance)
         and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
+        and isinstance(compute, ComputeWithPlacementGroupSupport)
+        and (
+            compute.are_placement_groups_compatible_with_reservations(instance_offer.backend)
+            or instance_configuration.reservation is None
+        )
         and instance.fleet
         and _is_cloud_cluster(instance.fleet)
     ):
-        assert isinstance(compute, ComputeWithPlacementGroupSupport)
         placement_group_model = _find_suitable_placement_group(
             placement_groups=placement_group_models,
             instance_offer=instance_offer,
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -243,6 +243,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             job_submission.age,
         )
     ssh_user = job_provisioning_data.username
+    assert run.run_spec.ssh_key_pub is not None
     user_ssh_key = run.run_spec.ssh_key_pub.strip()
     public_keys = [project.ssh_public_key.strip(), user_ssh_key]
     if job_provisioning_data.backend == BackendType.LOCAL:
dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py

@@ -0,0 +1,34 @@
+"""user.ssh_key
+
+Revision ID: ff1d94f65b08
+Revises: 2498ab323443
+Create Date: 2025-10-09 20:31:31.166786
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "ff1d94f65b08"
+down_revision = "2498ab323443"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("users", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("ssh_private_key", sa.Text(), nullable=True))
+        batch_op.add_column(sa.Column("ssh_public_key", sa.Text(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("users", schema=None) as batch_op:
+        batch_op.drop_column("ssh_public_key")
+        batch_op.drop_column("ssh_private_key")
+
+    # ### end Alembic commands ###
dstack/_internal/server/models.py

@@ -190,6 +190,9 @@ class UserModel(BaseModel):
     # deactivated users cannot access API
     active: Mapped[bool] = mapped_column(Boolean, default=True)

+    ssh_private_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
+    ssh_public_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
+
     email: Mapped[Optional[str]] = mapped_column(String(200), nullable=True)

     projects_quota: Mapped[int] = mapped_column(
|