dstack 0.19.32__py3-none-any.whl → 0.19.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic.
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +1 -5
- dstack/_internal/core/backends/aws/compute.py +8 -5
- dstack/_internal/core/backends/azure/compute.py +9 -6
- dstack/_internal/core/backends/base/compute.py +40 -17
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/datacrunch/compute.py +9 -6
- dstack/_internal/core/backends/gcp/compute.py +151 -6
- dstack/_internal/core/backends/gcp/models.py +10 -0
- dstack/_internal/core/backends/gcp/resources.py +87 -5
- dstack/_internal/core/backends/hotaisle/compute.py +11 -1
- dstack/_internal/core/backends/kubernetes/compute.py +161 -83
- dstack/_internal/core/backends/kubernetes/models.py +4 -2
- dstack/_internal/core/backends/nebius/compute.py +9 -6
- dstack/_internal/core/backends/oci/compute.py +9 -6
- dstack/_internal/core/backends/runpod/compute.py +14 -7
- dstack/_internal/core/backends/vastai/compute.py +3 -1
- dstack/_internal/core/backends/vastai/configurator.py +0 -1
- dstack/_internal/core/compatibility/runs.py +25 -4
- dstack/_internal/core/models/fleets.py +1 -1
- dstack/_internal/core/models/instances.py +2 -1
- dstack/_internal/core/models/profiles.py +1 -1
- dstack/_internal/core/models/runs.py +4 -2
- dstack/_internal/core/models/users.py +10 -0
- dstack/_internal/core/services/configs/__init__.py +1 -0
- dstack/_internal/core/services/ssh/key_manager.py +56 -0
- dstack/_internal/server/background/tasks/process_instances.py +5 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/models.py +6 -0
- dstack/_internal/server/routers/metrics.py +6 -2
- dstack/_internal/server/routers/runs.py +5 -1
- dstack/_internal/server/routers/users.py +21 -2
- dstack/_internal/server/services/jobs/__init__.py +18 -9
- dstack/_internal/server/services/offers.py +1 -0
- dstack/_internal/server/services/runs.py +13 -4
- dstack/_internal/server/services/users.py +35 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-e79754c136f1d8e4e7e6.js} +12632 -8039
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-e79754c136f1d8e4e7e6.js.map} +1 -1
- dstack/_internal/server/testing/common.py +4 -0
- dstack/api/_public/__init__.py +8 -11
- dstack/api/_public/repos.py +0 -21
- dstack/api/_public/runs.py +61 -9
- dstack/api/server/__init__.py +4 -0
- dstack/api/server/_users.py +17 -2
- dstack/version.py +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/METADATA +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/RECORD +53 -51
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/WHEEL +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/gcp/resources.py

@@ -26,9 +26,35 @@ supported_accelerators = [
     {"accelerator_name": "nvidia-tesla-t4", "gpu_name": "T4", "memory_mb": 1024 * 16},
     {"accelerator_name": "nvidia-tesla-v100", "gpu_name": "V100", "memory_mb": 1024 * 16},
     {"accelerator_name": "nvidia-tesla-p100", "gpu_name": "P100", "memory_mb": 1024 * 16},
+    {"accelerator_name": "nvidia-rtx-pro-6000", "gpu_name": "RTXPRO6000", "memory_mb": 1024 * 96},
 ]
 
 
+def find_accelerator_name(gpu_name: str, memory_mib: int) -> Optional[str]:
+    for acc in supported_accelerators:
+        if gpu_name == acc["gpu_name"] and memory_mib == acc["memory_mb"]:
+            return acc["accelerator_name"]
+    return None
+
+
+def sanitize_filter_value(value: str) -> str:
+    """
+    Escape characters that could break the Compute Engine API filter string.
+    """
+    return value.replace("\\", "\\\\").replace('"', '\\"')
+
+
+def get_resource_project(resource_url: str) -> str:
+    """
+    Extract the project ID from a URL like
+    https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name
+    """
+    matches = re.findall(r"/projects/(?P<project_id>[a-z0-9-]+)/", resource_url)
+    if not matches:
+        raise BackendError(f"Invalid resource URL {resource_url}")
+    return matches[0]
+
+
 def get_availability_zones(
     regions_client: compute_v1.RegionsClient,
     project_id: str,
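For orientation, a minimal sketch of how these new helpers behave, assuming they are exported from dstack/_internal/core/backends/gcp/resources.py as the file list above suggests (the values are illustrative, not taken from the package):

from dstack._internal.core.backends.gcp.resources import (
    find_accelerator_name,
    get_resource_project,
    sanitize_filter_value,
)

# Map a GPU model and memory size to a GCP accelerator type, or None if unsupported.
find_accelerator_name("T4", 16 * 1024)       # "nvidia-tesla-t4"
find_accelerator_name("MI300X", 192 * 1024)  # None, not a GCP accelerator

# Escape a value before embedding it in a Compute Engine API filter expression.
sanitize_filter_value('my"reservation')      # 'my\\"reservation'

# Pull the project ID out of a resource self-link.
get_resource_project(
    "https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name"
)                                            # "proj-id"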
@@ -123,6 +149,7 @@ def create_instance_struct(
     roce_subnetworks: Optional[List[Tuple[str, str]]] = None,
     allocate_public_ip: bool = True,
     placement_policy: Optional[str] = None,
+    reservation: Optional[compute_v1.Reservation] = None,
 ) -> compute_v1.Instance:
     instance = compute_v1.Instance()
     instance.name = instance_name
@@ -147,6 +174,25 @@ def create_instance_struct(
         initialize_params.disk_type = f"zones/{zone}/diskTypes/hyperdisk-balanced"
     disk.initialize_params = initialize_params
     instance.disks = [disk]
+    if (
+        reservation is not None
+        and reservation.specific_reservation is not None
+        and reservation.specific_reservation.instance_properties is not None
+        and reservation.specific_reservation.instance_properties.local_ssds is not None
+    ):
+        for local_ssd in reservation.specific_reservation.instance_properties.local_ssds:
+            instance.disks.append(
+                compute_v1.AttachedDisk(
+                    auto_delete=True,
+                    boot=False,
+                    type_="SCRATCH",
+                    initialize_params=compute_v1.AttachedDiskInitializeParams(
+                        disk_type=f"zones/{zone}/diskTypes/local-ssd",
+                        disk_size_gb=local_ssd.disk_size_gb,
+                    ),
+                    interface=local_ssd.interface,
+                )
+            )
 
     if accelerators:
         instance.guest_accelerators = accelerators
@@ -162,6 +208,8 @@ def create_instance_struct(
 
     if placement_policy is not None:
         instance.resource_policies = [placement_policy]
+    elif reservation is not None and "placement" in reservation.resource_policies:
+        instance.resource_policies = [reservation.resource_policies["placement"]]
 
     if spot:
         instance.scheduling = compute_v1.Scheduling()
@@ -187,6 +235,17 @@ def create_instance_struct(
         )
     ]
 
+    if reservation is not None:
+        reservation_project = get_resource_project(reservation.self_link)
+        instance.reservation_affinity = compute_v1.ReservationAffinity()
+        instance.reservation_affinity.consume_reservation_type = (
+            compute_v1.ReservationAffinity.ConsumeReservationType.SPECIFIC_RESERVATION.name
+        )
+        instance.reservation_affinity.key = "compute.googleapis.com/reservation-name"
+        instance.reservation_affinity.values = [
+            f"projects/{reservation_project}/reservations/{reservation.name}"
+        ]
+
     return instance
 
 
@@ -350,11 +409,8 @@ def get_accelerators(
         return []
     accelerator_config = compute_v1.AcceleratorConfig()
     accelerator_config.accelerator_count = len(gpus)
-    for acc in supported_accelerators:
-        if gpus[0].name == acc["gpu_name"] and gpus[0].memory_mib == acc["memory_mb"]:
-            accelerator_name = acc["accelerator_name"]
-            break
-    else:
+    accelerator_name = find_accelerator_name(gpus[0].name, gpus[0].memory_mib)
+    if accelerator_name is None:
         raise ValueError(f"Unsupported GPU: {gpus[0].name} {gpus[0].memory_mib} MiB")
     accelerator_config.accelerator_type = (
         f"projects/{project_id}/zones/{zone}/acceleratorTypes/{accelerator_name}"
@@ -362,6 +418,31 @@ def get_accelerators(
     return [accelerator_config]
 
 
+def find_reservation(
+    reservations_client: compute_v1.ReservationsClient,
+    project_id: str,
+    name: str,
+) -> dict[str, compute_v1.Reservation]:
+    request = compute_v1.AggregatedListReservationsRequest(
+        project=project_id,
+        filter=(
+            f'(name = "{sanitize_filter_value(name)}")'
+            ' AND (status = "READY")'
+            " AND (specificReservationRequired = true)"
+        ),
+    )
+    try:
+        aggregated_reservations = reservations_client.aggregated_list(request=request)
+    except (google.api_core.exceptions.NotFound, google.api_core.exceptions.Forbidden) as e:
+        logger.warning("Could not find reservation: %s", e)
+        return {}
+    zone_to_reservation = {}
+    for zone, zone_reservations in aggregated_reservations:
+        if zone_reservations.reservations:
+            zone_to_reservation[zone.split("/")[-1]] = zone_reservations.reservations[0]
+    return zone_to_reservation
+
+
 def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
     filtered_labels = {}
     for k, v in labels.items():
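A rough sketch of how the reservation pieces fit together; the actual call site is in gcp/compute.py (also changed in this release but not shown here), so the wiring below is an assumption:

from google.cloud import compute_v1

from dstack._internal.core.backends.gcp.resources import find_reservation  # path per the file list

# Returns {zone: Reservation} for a READY reservation with specificReservationRequired=true.
zone_to_reservation = find_reservation(
    reservations_client=compute_v1.ReservationsClient(),
    project_id="my-project",        # assumed value
    name="my-reservation",          # assumed value
)
reservation = zone_to_reservation.get("us-central1-a")
# The reservation (if any) can then be passed to create_instance_struct(reservation=...),
# which, per the hunks above, attaches matching SCRATCH local SSDs, reuses the reservation's
# "placement" resource policy, and sets reservation_affinity to SPECIFIC_RESERVATION.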
@@ -499,5 +580,6 @@ def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
             "h3-",
             "v6e",
             "a4-",
+            "g4-",
         ]
     )
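A hedged illustration of the effect of adding "g4-" to the prefix list; the full function body is not shown in this hunk, so the expected results assume the list above is what it matches against, and the machine type names are illustrative:

from dstack._internal.core.backends.gcp.resources import instance_type_supports_persistent_disk

instance_type_supports_persistent_disk("g4-standard-48")  # expected False after this change
instance_type_supports_persistent_disk("a4-highgpu-8g")   # already False before it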
dstack/_internal/core/backends/hotaisle/compute.py

@@ -52,7 +52,7 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
-    "4x MI300X 52x Xeon Platinum
+    "4x MI300X 52x Xeon Platinum 8470": {
         "cpu_model": "Xeon Platinum 8470",
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
@@ -62,6 +62,16 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2800000000,
         "cpu_manufacturer": "Intel",
     },
+    "8x MI300X 104x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
 
 
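These hunks are data-only; for reference, a lookup against the expanded table (the consuming code in hotaisle/compute.py is not part of this diff):

from dstack._internal.core.backends.hotaisle.compute import INSTANCE_TYPE_SPECS

specs = INSTANCE_TYPE_SPECS["8x MI300X 104x Xeon Platinum 8462Y+"]
assert specs["cpu_model"] == "Xeon Platinum 8462Y+"
assert specs["cpu_frequency"] == 2_800_000_000  # Hz
assert specs["cpu_manufacturer"] == "Intel"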
dstack/_internal/core/backends/kubernetes/compute.py

@@ -5,7 +5,7 @@ import time
 from enum import Enum
 from typing import List, Optional, Tuple
 
-from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
+from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
@@ -59,19 +59,31 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 JUMP_POD_SSH_PORT = 22
-
-NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
-NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+DUMMY_REGION = "-"
 
 NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
-NVIDIA_GPU_COUNT_LABEL = f"{NVIDIA_GPU_RESOURCE}.count"
-NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
 NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+
+AMD_GPU_RESOURCE = "amd.com/gpu"
+AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE
+# The oldest but still supported label format, the safest option, see the commit message:
+# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48
+# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs
+# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue:
+# https://github.com/ROCm/k8s-device-plugin/issues/112
+AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id."
 
 # Taints we know and tolerate when creating our objects, e.g., the jump pod.
-TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT,)
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT)
 
-
+NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
+NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
+
+AMD_GPU_DEVICE_ID_TO_GPU_INFO = {
+    device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids
+}
+AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS}
 
 
 class Operator(str, Enum):
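A small sketch of how the new AMD constants are meant to be read, using the example label from the comment above; whether a given device id resolves depends on gpuhunt's KNOWN_AMD_GPUS tables:

from dstack._internal.core.backends.kubernetes.compute import (  # path per the file list
    AMD_GPU_DEVICE_ID_LABEL_PREFIX,
    AMD_GPU_DEVICE_ID_TO_GPU_INFO,
)

label = "beta.amd.com/gpu.device-id.74b5"  # hypothetical node label from the ROCm device plugin
assert label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX)
device_id = int(label.rpartition(".")[-1], 16)           # 0x74b5
gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)  # gpuhunt info for MI300X VF, or None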
@@ -112,21 +124,15 @@ class KubernetesCompute(
         nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
         for node in nodes:
             try:
-                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
                 name = get_value(node, ".metadata.name", str, required=True)
-                cpus = _parse_cpu(
-                    get_value(node, ".status.allocatable['cpu']", str, required=True)
-                )
                 cpu_arch = normalize_arch(
                     get_value(node, ".status.node_info.architecture", str)
                 ).to_cpu_architecture()
-                memory_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['memory']", str, required=True)
-                )
-
-                disk_size_mib = _parse_memory(
-                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
-                )
+                allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+                cpus = _parse_cpu(allocatable["cpu"])
+                memory_mib = _parse_memory(allocatable["memory"])
+                disk_size_mib = _parse_memory(allocatable["ephemeral-storage"])
+                gpus = _get_node_gpus(node)
             except (AttributeError, KeyError, ValueError) as e:
                 logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
                 continue
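For context, .status.allocatable on a node is a plain string-to-string map, so the consolidated read above works against entries like these (values illustrative; _parse_cpu/_parse_memory are this module's own parsers and are not shown in the diff):

allocatable = {
    "cpu": "31850m",                   # millicores
    "memory": "196608000Ki",
    "ephemeral-storage": "426453735699",
    "amd.com/gpu": "4",                # GPU counts surface as extended resources
}
# cpus = _parse_cpu(allocatable["cpu"])
# memory_mib = _parse_memory(allocatable["memory"])
# gpus = _get_node_gpus(node)  # also consults node labels; see the helpers added below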
@@ -161,6 +167,7 @@ class KubernetesCompute(
         volumes: List[Volume],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name_for_job(run, job)
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
@@ -217,59 +224,18 @@ class KubernetesCompute(
                     "GPU is requested but the offer has no GPUs:"
                     f" {gpu_spec=} {instance_offer=}",
                 )
-
-
-            # We cannot generate an expected GPU label value from the Gpu model instance
-            # as the actual values may have additional components (socket, memory type, etc.)
-            # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
-            # Moreover, a single Gpu may match multiple label values.
-            # As a workaround, we iterate and process all node labels once again (we already
-            # processed them in `get_offers_by_requirements()`).
-            node_list = call_api_method(
-                self.api.list_node,
-                client.V1NodeList,
-            )
-            nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
-            for node in nodes:
-                labels = get_value(node, ".metadata.labels", dict[str, str])
-                if not labels:
-                    continue
-                gpus, gpu_label_value = _get_gpus_from_node_labels(labels)
-                if not gpus or gpu_label_value is None:
-                    continue
-                if gpus[0] == offer_gpu:
-                    matching_gpu_label_values.add(gpu_label_value)
-            if not matching_gpu_label_values:
-                raise ComputeError(
-                    f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
-                )
-            logger.debug(
-                "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
-            )
-            # TODO: support other GPU vendors
-            resources_requests[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            resources_limits[NVIDIA_GPU_RESOURCE] = str(gpu_min)
-            node_affinity = client.V1NodeAffinity(
-                required_during_scheduling_ignored_during_execution=[
-                    client.V1NodeSelectorTerm(
-                        match_expressions=[
-                            client.V1NodeSelectorRequirement(
-                                key=NVIDIA_GPU_PRODUCT_LABEL,
-                                operator=Operator.IN,
-                                values=list(matching_gpu_label_values),
-                            ),
-                        ],
-                    ),
-                ],
+            gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu(
+                self.api, offer_gpus[0]
             )
+            logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min)
+            resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min)
             # It should be NoSchedule, but we also add NoExecute toleration just in case.
             for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
                 tolerations.append(
                     client.V1Toleration(
-                        key=
+                        key=node_taint, operator=Operator.EXISTS, effect=effect
                     )
                 )
-
         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
         if (
@@ -331,7 +297,9 @@ class KubernetesCompute(
                         volume_mounts=volume_mounts,
                     )
                 ],
-                affinity=
+                affinity=client.V1Affinity(
+                    node_affinity=node_affinity,
+                ),
                 tolerations=tolerations,
                 volumes=volumes_,
             ),
@@ -550,34 +518,144 @@ def _render_memory(memory: Memory) -> str:
     return f"{float(memory)}Gi"
 
 
-def
+def _get_node_gpus(node: client.V1Node) -> list[Gpu]:
+    node_name = get_value(node, ".metadata.name", str, required=True)
+    allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+    labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+    for gpu_resource, gpu_getter in (
+        (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels),
+        (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels),
+    ):
+        _gpu_count = allocatable.get(gpu_resource)
+        if not _gpu_count:
+            continue
+        gpu_count = int(_gpu_count)
+        if gpu_count < 1:
+            continue
+        gpu = gpu_getter(labels)
+        if gpu is None:
+            logger.warning(
+                "Node %s: GPU resource found, but failed to detect its model: %s=%d",
+                node_name,
+                gpu_resource,
+                gpu_count,
+            )
+            return []
+        return [gpu] * gpu_count
+    logger.debug("Node %s: no GPU resource found", node_name)
+    return []
+
+
+def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
     # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
     # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
-    # TODO: support other GPU vendors
-    gpu_count = labels.get(NVIDIA_GPU_COUNT_LABEL)
     gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
-    if
-    return
-
-
-    for known_gpu_name in NVIDIA_GPU_NAMES:
-        if known_gpu_name.lower() in gpu_product.lower().split("-"):
-            gpu_name = known_gpu_name
+    if gpu_product is None:
+        return None
+    for gpu_name in NVIDIA_GPU_NAMES:
+        if gpu_name.lower() in gpu_product.lower().split("-"):
             break
-
-    return
+    else:
+        return None
     gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
     gpu_memory = gpu_info.memory * 1024
     # A100 may come in two variants
     if "40GB" in gpu_product:
         gpu_memory = 40 * 1024
-
-
-
-
-
+    return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
+
+
+def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
+    # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs
+    gpus: set[tuple[str, int]] = set()
+    for label in labels:
+        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
+            continue
+        _, _, _device_id = label.rpartition(".")
+        device_id = int(_device_id, 16)
+        gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)
+        if gpu_info is None:
+            logger.warning("Unknown AMD GPU device id: %X", device_id)
+            continue
+        gpus.add((gpu_info.name, gpu_info.memory))
+    if not gpus:
+        return None
+    if len(gpus) == 1:
+        gpu_name, gpu_memory_gib = next(iter(gpus))
+        return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024)
+    logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus)
+    return None
+
+
+def _get_pod_spec_parameters_for_gpu(
+    api: client.CoreV1Api, gpu: Gpu
+) -> tuple[str, client.V1NodeAffinity, str]:
+    gpu_vendor = gpu.vendor
+    assert gpu_vendor is not None
+    if gpu_vendor == AcceleratorVendor.NVIDIA:
+        node_affinity = _get_nvidia_gpu_node_affinity(api, gpu)
+        return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT
+    if gpu_vendor == AcceleratorVendor.AMD:
+        node_affinity = _get_amd_gpu_node_affinity(gpu)
+        return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT
+    raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}")
+
+
+def _get_nvidia_gpu_node_affinity(api: client.CoreV1Api, gpu: Gpu) -> client.V1NodeAffinity:
+    matching_gpu_label_values: set[str] = set()
+    # We cannot generate an expected GPU label value from the Gpu model instance
+    # as the actual values may have additional components (socket, memory type, etc.)
+    # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+    # Moreover, a single Gpu may match multiple label values.
+    # As a workaround, we iterate and process all node labels once again (we already
+    # processed them in `get_offers_by_requirements()`).
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    for node in nodes:
+        labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+        if _get_nvidia_gpu_from_node_labels(labels) == gpu:
+            matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL])
+    if not matching_gpu_label_values:
+        raise ComputeError(f"NVIDIA GPU is requested but no matching GPU labels found: {gpu=}")
+    logger.debug("Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu.name)
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=NVIDIA_GPU_PRODUCT_LABEL,
+                            operator=Operator.IN,
+                            values=list(matching_gpu_label_values),
+                        ),
+                    ],
+                ),
+            ],
+        ),
+    )
+
+
+def _get_amd_gpu_node_affinity(gpu: Gpu) -> client.V1NodeAffinity:
+    device_ids = AMD_GPU_NAME_TO_DEVICE_IDS.get(gpu.name)
+    if device_ids is None:
+        raise ComputeError(f"AMD GPU is requested but no matching device ids found: {gpu=}")
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}",
+                            operator=Operator.EXISTS,
+                        ),
+                    ],
+                )
+                for device_id in device_ids
+            ],
+        ),
+    )
 
 
 def _continue_setup_jump_pod(
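Putting the new helpers together, a sketch of the per-vendor wiring that run_job now relies on (the GPU name and memory below are assumptions for illustration, using the helpers defined just above):

from gpuhunt import AcceleratorVendor

from dstack._internal.core.models.instances import Gpu  # model path assumed

gpu = Gpu(vendor=AcceleratorVendor.AMD, name="MI300X", memory_mib=192 * 1024)
affinity = _get_amd_gpu_node_affinity(gpu)
# -> V1NodeAffinity requiring a node that carries one of the beta.amd.com/gpu.device-id.<id>
#    labels for the device ids gpuhunt knows for "MI300X"; raises ComputeError if none are known.
# For NVIDIA offers, _get_pod_spec_parameters_for_gpu() instead returns the "nvidia.com/gpu"
# resource/taint and an IN-match affinity on nvidia.com/gpu.product label values.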
dstack/_internal/core/backends/kubernetes/models.py

@@ -37,7 +37,7 @@ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
 
 
 class KubeconfigFileConfig(CoreModel):
-    filename: Annotated[str, Field(description="The path to the kubeconfig file")]
+    filename: Annotated[str, Field(description="The path to the kubeconfig file")] = ""
     data: Annotated[
         Optional[str],
         Field(
@@ -50,7 +50,9 @@ class KubeconfigFileConfig(CoreModel):
     ] = None
 
     @root_validator
-    def fill_data(cls, values):
+    def fill_data(cls, values: dict) -> dict:
+        if values.get("filename") == "" and values.get("data") is None:
+            raise ValueError("filename or data must be specified")
         return fill_data(values)
 
 
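A sketch of the resulting validation behavior (assuming pydantic-style CoreModel semantics and that the shared fill_data helper leaves explicitly provided data untouched):

from dstack._internal.core.backends.kubernetes.models import KubeconfigFileConfig  # path per the file list

KubeconfigFileConfig(data="<kubeconfig contents>")  # ok: filename now defaults to ""
KubeconfigFileConfig()                              # validation error: "filename or data must be specified"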
dstack/_internal/core/backends/nebius/compute.py

@@ -2,8 +2,9 @@ import json
 import random
 import shlex
 import time
+from collections.abc import Iterable
 from functools import cached_property
-from typing import
+from typing import List, Optional
 
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -21,7 +22,11 @@ from dstack._internal.core.backends.base.compute import (
     get_user_data,
     merge_tags,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -125,10 +130,8 @@ class NebiusCompute(
             for offer in offers
         ]
 
-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def create_instance(
         self,
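The removed return annotation hints at what OfferModifier is; a hedged reading (the real alias lives in base/offers.py, which is not part of this diff):

from typing import Callable, Optional

from dstack._internal.core.models.instances import InstanceOfferWithAvailability  # path assumed

OfferModifier = Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
# get_offers_modifiers() now returns a list of such callables instead of a single modifier,
# e.g. [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)].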
dstack/_internal/core/backends/oci/compute.py

@@ -1,6 +1,7 @@
+from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property
-from typing import
+from typing import List, Optional
 
 import oci
 
@@ -13,7 +14,11 @@ from dstack._internal.core.backends.base.compute import (
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.models import OCIConfig
 from dstack._internal.core.backends.oci.region import make_region_clients_map
@@ -96,10 +101,8 @@ class OCICompute(
 
         return offers_with_availability
 
-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
dstack/_internal/core/backends/runpod/compute.py

@@ -1,7 +1,8 @@
 import json
 import uuid
+from collections.abc import Iterable
 from datetime import timedelta
-from typing import
+from typing import List, Optional
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
@@ -12,7 +13,11 @@ from dstack._internal.core.backends.base.compute import (
     get_docker_commands,
     get_job_instance_name,
 )
-from dstack._internal.core.backends.base.offers import
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -72,10 +77,8 @@ class RunpodCompute(
         ]
         return offers
 
-    def
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def run_job(
         self,
@@ -86,6 +89,7 @@ class RunpodCompute(
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        assert run.run_spec.ssh_key_pub is not None
        instance_config = InstanceConfiguration(
             project_name=run.project_name,
             instance_name=get_job_instance_name(run, job),
@@ -228,9 +232,12 @@ class RunpodCompute(
     def create_volume(self, volume: Volume) -> VolumeProvisioningData:
         volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
         size_gb = volume.configuration.size_gb
+        # Runpod regions must be uppercase.
+        # Lowercase regions are accepted in the API but they break Runpod in several ways.
+        region = volume.configuration.region.upper()
         volume_id = self.api_client.create_network_volume(
             name=volume_name,
-            region=
+            region=region,
             size=size_gb,
         )
         return VolumeProvisioningData(
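The practical effect of the create_volume change, sketched with an illustrative region value:

region = "eu-se-1".upper()  # -> "EU-SE-1"; lowercase regions were previously sent as-is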
dstack/_internal/core/backends/vastai/compute.py

@@ -47,7 +47,7 @@ class VastAICompute(
                 "reliability2": {"gte": 0.9},
                 "inet_down": {"gt": 128},
                 "verified": {"eq": True},
-                "cuda_max_good": {"gte": 12.
+                "cuda_max_good": {"gte": 12.8},
                 "compute_cap": {"gte": 600},
             }
         )
@@ -58,6 +58,7 @@ class VastAICompute(
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,
+            locations=self.config.regions or None,
             requirements=requirements,
             # TODO(egor-s): spots currently not supported
             extra_filter=lambda offer: not offer.instance.resources.spot,
@@ -85,6 +86,7 @@ class VastAICompute(
         instance_name = generate_unique_instance_name_for_job(
             run, job, max_length=MAX_INSTANCE_NAME_LEN
         )
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )