dstack 0.19.31__py3-none-any.whl → 0.19.33__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +1 -5
- dstack/_internal/core/backends/aws/compute.py +8 -5
- dstack/_internal/core/backends/azure/compute.py +9 -6
- dstack/_internal/core/backends/base/compute.py +40 -17
- dstack/_internal/core/backends/base/offers.py +5 -1
- dstack/_internal/core/backends/datacrunch/compute.py +9 -6
- dstack/_internal/core/backends/gcp/compute.py +137 -7
- dstack/_internal/core/backends/gcp/models.py +7 -0
- dstack/_internal/core/backends/gcp/resources.py +87 -5
- dstack/_internal/core/backends/hotaisle/compute.py +30 -0
- dstack/_internal/core/backends/kubernetes/compute.py +218 -77
- dstack/_internal/core/backends/kubernetes/models.py +4 -2
- dstack/_internal/core/backends/nebius/compute.py +24 -6
- dstack/_internal/core/backends/nebius/configurator.py +15 -0
- dstack/_internal/core/backends/nebius/models.py +57 -5
- dstack/_internal/core/backends/nebius/resources.py +45 -2
- dstack/_internal/core/backends/oci/compute.py +9 -6
- dstack/_internal/core/backends/runpod/compute.py +10 -6
- dstack/_internal/core/backends/vastai/compute.py +3 -1
- dstack/_internal/core/backends/vastai/configurator.py +0 -1
- dstack/_internal/core/compatibility/runs.py +8 -0
- dstack/_internal/core/models/fleets.py +1 -1
- dstack/_internal/core/models/profiles.py +12 -5
- dstack/_internal/core/models/runs.py +3 -2
- dstack/_internal/core/models/users.py +10 -0
- dstack/_internal/core/services/configs/__init__.py +1 -0
- dstack/_internal/server/background/tasks/process_fleets.py +75 -17
- dstack/_internal/server/background/tasks/process_instances.py +6 -4
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
- dstack/_internal/server/background/tasks/process_runs.py +27 -23
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +63 -20
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/models.py +3 -0
- dstack/_internal/server/routers/runs.py +5 -1
- dstack/_internal/server/routers/users.py +14 -2
- dstack/_internal/server/services/runs.py +9 -4
- dstack/_internal/server/services/users.py +35 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-97c7e184573ca23f9fe4.js} +12218 -7625
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-97c7e184573ca23f9fe4.js.map} +1 -1
- dstack/api/_public/__init__.py +9 -12
- dstack/api/_public/repos.py +0 -21
- dstack/api/_public/runs.py +64 -9
- dstack/api/server/_users.py +17 -2
- dstack/version.py +2 -2
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/METADATA +12 -14
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/RECORD +52 -51
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/WHEEL +0 -0
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/hotaisle/compute.py

@@ -42,6 +42,36 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
     },
+    "2x MI300X 26x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "2x MI300X 26x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "8x MI300X 104x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
 
 
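The entries above extend Hot Aisle's static per-instance-type CPU spec table with the 2x/4x/8x MI300X shapes. As a rough, standalone sketch of how such a table can be consumed (the truncated spec dict and the describe_instance_type helper below are illustrative, not dstack's actual code):

# Illustrative sketch only: a spec table keyed by instance type name, mirroring the hunk above.
INSTANCE_TYPE_SPECS = {
    "8x MI300X 104x Xeon Platinum 8462Y+": {
        "cpu_model": "Xeon Platinum 8462Y+",
        "cpu_frequency": 2_800_000_000,  # Hz
        "cpu_manufacturer": "Intel",
    },
}


def describe_instance_type(instance_type: str) -> str:
    # Hypothetical helper: fall back gracefully when the instance type is unknown.
    spec = INSTANCE_TYPE_SPECS.get(instance_type)
    if spec is None:
        return f"{instance_type}: no CPU details available"
    ghz = spec["cpu_frequency"] / 1e9
    return f"{instance_type}: {spec['cpu_manufacturer']} {spec['cpu_model']} @ {ghz:.1f} GHz"


print(describe_instance_type("8x MI300X 104x Xeon Platinum 8462Y+"))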
dstack/_internal/core/backends/kubernetes/compute.py

@@ -2,9 +2,10 @@ import subprocess
 import tempfile
 import threading
 import time
+from enum import Enum
 from typing import List, Optional, Tuple
 
-from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
+from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
 from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
@@ -58,11 +59,42 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 JUMP_POD_SSH_PORT = 22
+DUMMY_REGION = "-"
+
+NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
+NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+
+AMD_GPU_RESOURCE = "amd.com/gpu"
+AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE
+# The oldest but still supported label format, the safest option, see the commit message:
+# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48
+# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs
+# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue:
+# https://github.com/ROCm/k8s-device-plugin/issues/112
+AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id."
+
+# Taints we know and tolerate when creating our objects, e.g., the jump pod.
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT)
 
 NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
 NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
 
-
+AMD_GPU_DEVICE_ID_TO_GPU_INFO = {
+    device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids
+}
+AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS}
+
+
+class Operator(str, Enum):
+    EXISTS = "Exists"
+    IN = "In"
+
+
+class TaintEffect(str, Enum):
+    NO_EXECUTE = "NoExecute"
+    NO_SCHEDULE = "NoSchedule"
+    PREFER_NO_SCHEDULE = "PreferNoSchedule"
 
 
 class KubernetesCompute(
@@ -92,21 +124,15 @@ class KubernetesCompute(
         nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
         for node in nodes:
             try:
-                labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
                 name = get_value(node, ".metadata.name", str, required=True)
-                cpus = _parse_cpu(
-                    get_value(node, ".status.allocatable['cpu']", str, required=True)
-                )
                 cpu_arch = normalize_arch(
                     get_value(node, ".status.node_info.architecture", str)
                 ).to_cpu_architecture()
-
-
-                )
-
-
-                    get_value(node, ".status.allocatable['ephemeral-storage']", str, required=True)
-                )
+                allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+                cpus = _parse_cpu(allocatable["cpu"])
+                memory_mib = _parse_memory(allocatable["memory"])
+                disk_size_mib = _parse_memory(allocatable["ephemeral-storage"])
+                gpus = _get_node_gpus(node)
             except (AttributeError, KeyError, ValueError) as e:
                 logger.exception("Failed to process node: %s: %s", type(e).__name__, e)
                 continue
@@ -141,6 +167,7 @@ class KubernetesCompute(
         volumes: List[Volume],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name_for_job(run, job)
+        assert run.run_spec.ssh_key_pub is not None
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
@@ -181,6 +208,7 @@ class KubernetesCompute(
         resources_requests: dict[str, str] = {}
         resources_limits: dict[str, str] = {}
         node_affinity: Optional[client.V1NodeAffinity] = None
+        tolerations: list[client.V1Toleration] = []
         volumes_: list[client.V1Volume] = []
         volume_mounts: list[client.V1VolumeMount] = []
 
@@ -196,52 +224,18 @@ class KubernetesCompute(
                     "GPU is requested but the offer has no GPUs:"
                     f" {gpu_spec=} {instance_offer=}",
                 )
-
-
-            # We cannot generate an expected GPU label value from the Gpu model instance
-            # as the actual values may have additional components (socket, memory type, etc.)
-            # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
-            # Moreover, a single Gpu may match multiple label values.
-            # As a workaround, we iterate and process all node labels once again (we already
-            # processed them in `get_offers_by_requirements()`).
-            node_list = call_api_method(
-                self.api.list_node,
-                client.V1NodeList,
+            gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu(
+                self.api, offer_gpus[0]
             )
-
-
-
-
-
-
-
-
-                if gpus[0] == offer_gpu:
-                    matching_gpu_label_values.add(gpu_label_value)
-            if not matching_gpu_label_values:
-                raise ComputeError(
-                    f"GPU is requested but no matching GPU labels found: {gpu_spec=}"
+            logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min)
+            resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min)
+            # It should be NoSchedule, but we also add NoExecute toleration just in case.
+            for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
+                tolerations.append(
+                    client.V1Toleration(
+                        key=node_taint, operator=Operator.EXISTS, effect=effect
+                    )
                 )
-            logger.debug(
-                "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
-            )
-            # TODO: support other GPU vendors
-            resources_requests["nvidia.com/gpu"] = str(gpu_min)
-            resources_limits["nvidia.com/gpu"] = str(gpu_min)
-            node_affinity = client.V1NodeAffinity(
-                required_during_scheduling_ignored_during_execution=[
-                    client.V1NodeSelectorTerm(
-                        match_expressions=[
-                            client.V1NodeSelectorRequirement(
-                                key="nvidia.com/gpu.product",
-                                operator="In",
-                                values=list(matching_gpu_label_values),
-                            ),
-                        ],
-                    ),
-                ],
-            )
-
         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
             if (
@@ -303,7 +297,10 @@ class KubernetesCompute(
                         volume_mounts=volume_mounts,
                     )
                 ],
-                affinity=
+                affinity=client.V1Affinity(
+                    node_affinity=node_affinity,
+                ),
+                tolerations=tolerations,
                 volumes=volumes_,
             ),
         )
@@ -521,34 +518,144 @@ def _render_memory(memory: Memory) -> str:
     return f"{float(memory)}Gi"
 
 
-def 
+def _get_node_gpus(node: client.V1Node) -> list[Gpu]:
+    node_name = get_value(node, ".metadata.name", str, required=True)
+    allocatable = get_value(node, ".status.allocatable", dict[str, str], required=True)
+    labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+    for gpu_resource, gpu_getter in (
+        (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels),
+        (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels),
+    ):
+        _gpu_count = allocatable.get(gpu_resource)
+        if not _gpu_count:
+            continue
+        gpu_count = int(_gpu_count)
+        if gpu_count < 1:
+            continue
+        gpu = gpu_getter(labels)
+        if gpu is None:
+            logger.warning(
+                "Node %s: GPU resource found, but failed to detect its model: %s=%d",
+                node_name,
+                gpu_resource,
+                gpu_count,
+            )
+            return []
+        return [gpu] * gpu_count
+    logger.debug("Node %s: no GPU resource found", node_name)
+    return []
+
+
+def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
     # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery
     # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
-
-
-
-
-
-    gpu_count = int(gpu_count)
-    gpu_name = None
-    for known_gpu_name in NVIDIA_GPU_NAMES:
-        if known_gpu_name.lower() in gpu_product.lower().split("-"):
-            gpu_name = known_gpu_name
+    gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
+    if gpu_product is None:
+        return None
+    for gpu_name in NVIDIA_GPU_NAMES:
+        if gpu_name.lower() in gpu_product.lower().split("-"):
             break
-
-    return
+    else:
+        return None
     gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name]
     gpu_memory = gpu_info.memory * 1024
     # A100 may come in two variants
     if "40GB" in gpu_product:
         gpu_memory = 40 * 1024
-
-
-
-
-
+    return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory)
+
+
+def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]:
+    # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs
+    gpus: set[tuple[str, int]] = set()
+    for label in labels:
+        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
+            continue
+        _, _, _device_id = label.rpartition(".")
+        device_id = int(_device_id, 16)
+        gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id)
+        if gpu_info is None:
+            logger.warning("Unknown AMD GPU device id: %X", device_id)
+            continue
+        gpus.add((gpu_info.name, gpu_info.memory))
+    if not gpus:
+        return None
+    if len(gpus) == 1:
+        gpu_name, gpu_memory_gib = next(iter(gpus))
+        return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024)
+    logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus)
+    return None
+
+
+def _get_pod_spec_parameters_for_gpu(
+    api: client.CoreV1Api, gpu: Gpu
+) -> tuple[str, client.V1NodeAffinity, str]:
+    gpu_vendor = gpu.vendor
+    assert gpu_vendor is not None
+    if gpu_vendor == AcceleratorVendor.NVIDIA:
+        node_affinity = _get_nvidia_gpu_node_affinity(api, gpu)
+        return NVIDIA_GPU_RESOURCE, node_affinity, NVIDIA_GPU_NODE_TAINT
+    if gpu_vendor == AcceleratorVendor.AMD:
+        node_affinity = _get_amd_gpu_node_affinity(gpu)
+        return AMD_GPU_RESOURCE, node_affinity, AMD_GPU_NODE_TAINT
+    raise ComputeError(f"Unsupported GPU vendor: {gpu_vendor}")
+
+
+def _get_nvidia_gpu_node_affinity(api: client.CoreV1Api, gpu: Gpu) -> client.V1NodeAffinity:
+    matching_gpu_label_values: set[str] = set()
+    # We cannot generate an expected GPU label value from the Gpu model instance
+    # as the actual values may have additional components (socket, memory type, etc.)
+    # that we don't preserve in the Gpu model, e.g., "NVIDIA-H100-80GB-HBM3".
+    # Moreover, a single Gpu may match multiple label values.
+    # As a workaround, we iterate and process all node labels once again (we already
+    # processed them in `get_offers_by_requirements()`).
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    for node in nodes:
+        labels = get_value(node, ".metadata.labels", dict[str, str]) or {}
+        if _get_nvidia_gpu_from_node_labels(labels) == gpu:
+            matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL])
+    if not matching_gpu_label_values:
+        raise ComputeError(f"NVIDIA GPU is requested but no matching GPU labels found: {gpu=}")
+    logger.debug("Selecting nodes by labels %s for NVIDIA %s", matching_gpu_label_values, gpu.name)
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=NVIDIA_GPU_PRODUCT_LABEL,
+                            operator=Operator.IN,
+                            values=list(matching_gpu_label_values),
+                        ),
+                    ],
+                ),
+            ],
+        ),
+    )
+
+
+def _get_amd_gpu_node_affinity(gpu: Gpu) -> client.V1NodeAffinity:
+    device_ids = AMD_GPU_NAME_TO_DEVICE_IDS.get(gpu.name)
+    if device_ids is None:
+        raise ComputeError(f"AMD GPU is requested but no matching device ids found: {gpu=}")
+    return client.V1NodeAffinity(
+        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
+            node_selector_terms=[
+                client.V1NodeSelectorTerm(
+                    match_expressions=[
+                        client.V1NodeSelectorRequirement(
+                            key=f"{AMD_GPU_DEVICE_ID_LABEL_PREFIX}{device_id:x}",
+                            operator=Operator.EXISTS,
+                        ),
+                    ],
+                )
+                for device_id in device_ids
+            ],
+        ),
+    )
 
 
 def _continue_setup_jump_pod(
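The helpers added above detect node GPUs from device-plugin labels: NVIDIA models via the nvidia.com/gpu.product label, AMD models via the beta.amd.com/gpu.device-id.<hex> labels described in the comments. A standalone sketch of the AMD device-id parsing, with a tiny stand-in table instead of gpuhunt's KNOWN_AMD_GPUS (the 0x74b5 id comes from the comment in the hunk; everything else is illustrative):

from typing import Optional

AMD_GPU_DEVICE_ID_LABEL_PREFIX = "beta.amd.com/gpu.device-id."
# Stand-in table for this sketch; the real mapping is built from gpuhunt's KNOWN_AMD_GPUS.
DEVICE_ID_TO_NAME = {0x74B5: "MI300X"}


def detect_amd_gpu(labels: dict[str, str]) -> Optional[str]:
    # Collect every GPU model advertised by the ROCm device plugin's device-id labels.
    names = set()
    for label in labels:
        if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX):
            continue
        device_id = int(label.rpartition(".")[2], 16)  # e.g. "74b5" -> 0x74b5
        name = DEVICE_ID_TO_NAME.get(device_id)
        if name is not None:
            names.add(name)
    # Mirror the behavior above: only report a model when exactly one is detected.
    return names.pop() if len(names) == 1 else None


print(detect_amd_gpu({"beta.amd.com/gpu.device-id.74b5": "4"}))  # MI300X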
@@ -647,6 +754,39 @@ def _create_jump_pod_service(
         namespace=namespace,
         name=pod_name,
     )
+
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    # False if we found at least one node without any "hard" taint, that is, if we don't need to
+    # specify the toleration.
+    toleration_required = True
+    # (key, effect) pairs.
+    tolerated_taints: set[tuple[str, str]] = set()
+    for node in nodes:
+        # True if the node has at least one NoExecute or NoSchedule taint.
+        has_hard_taint = False
+        taints = get_value(node, ".spec.taints", list[client.V1Taint]) or []
+        for taint in taints:
+            effect = get_value(taint, ".effect", str, required=True)
+            # A "soft" taint, ignore.
+            if effect == TaintEffect.PREFER_NO_SCHEDULE:
+                continue
+            has_hard_taint = True
+            key = get_value(taint, ".key", str, required=True)
+            if key in TOLERATED_NODE_TAINTS:
+                tolerated_taints.add((key, effect))
+        if not has_hard_taint:
+            toleration_required = False
+            break
+    tolerations: list[client.V1Toleration] = []
+    if toleration_required:
+        for key, effect in tolerated_taints:
+            tolerations.append(
+                client.V1Toleration(key=key, operator=Operator.EXISTS, effect=effect)
+            )
+        if not tolerations:
+            logger.warning("No appropriate node found, the jump pod may never be scheduled")
+
     commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
     pod = client.V1Pod(
         metadata=client.V1ObjectMeta(
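The jump pod logic above adds tolerations only when every node carries a "hard" (NoSchedule or NoExecute) taint, and then only for the known GPU taints. A standalone sketch of that decision, with plain (key, effect) tuples standing in for kubernetes.client.V1Taint objects:

TOLERATED_NODE_TAINTS = ("nvidia.com/gpu", "amd.com/gpu")


def pick_tolerations(nodes: list[list[tuple[str, str]]]) -> list[tuple[str, str]]:
    tolerated: set[tuple[str, str]] = set()
    for taints in nodes:
        hard = [(key, effect) for key, effect in taints if effect != "PreferNoSchedule"]
        if not hard:
            return []  # at least one schedulable node exists, so no toleration is needed
        tolerated.update((key, effect) for key, effect in hard if key in TOLERATED_NODE_TAINTS)
    return sorted(tolerated)


# A single GPU node with a hard taint and no untainted nodes -> tolerate the GPU taint.
print(pick_tolerations([[("nvidia.com/gpu", "NoSchedule")]]))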
@@ -667,7 +807,8 @@ def _create_jump_pod_service(
                         )
                     ],
                 )
-            ]
+            ],
+            tolerations=tolerations,
         ),
     )
     call_api_method(
dstack/_internal/core/backends/kubernetes/models.py

@@ -37,7 +37,7 @@ class KubernetesBackendConfigWithCreds(KubernetesBackendConfig):
 
 
 class KubeconfigFileConfig(CoreModel):
-    filename: Annotated[str, Field(description="The path to the kubeconfig file")]
+    filename: Annotated[str, Field(description="The path to the kubeconfig file")] = ""
     data: Annotated[
         Optional[str],
         Field(
@@ -50,7 +50,9 @@ class KubeconfigFileConfig(CoreModel):
     ] = None
 
     @root_validator
-    def fill_data(cls, values):
+    def fill_data(cls, values: dict) -> dict:
+        if values.get("filename") == "" and values.get("data") is None:
+            raise ValueError("filename or data must be specified")
         return fill_data(values)
 
 
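With this change, filename defaults to an empty string and the validator rejects configurations where neither filename nor data is set. A minimal sketch of the same rule, assuming pydantic v1 semantics (the bare @root_validator used in the diff); KubeconfigSketch is an illustrative model, not dstack's CoreModel:

from typing import Optional

from pydantic import BaseModel, root_validator


class KubeconfigSketch(BaseModel):
    filename: str = ""
    data: Optional[str] = None

    @root_validator
    def check_source(cls, values: dict) -> dict:
        # Reject configs that provide neither a kubeconfig path nor inline contents.
        if values.get("filename") == "" and values.get("data") is None:
            raise ValueError("filename or data must be specified")
        return values


KubeconfigSketch(filename="~/.kube/config")  # ok
# KubeconfigSketch()  # raises: filename or data must be specified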
dstack/_internal/core/backends/nebius/compute.py

@@ -2,8 +2,9 @@ import json
 import random
 import shlex
 import time
+from collections.abc import Iterable
 from functools import cached_property
-from typing import 
+from typing import List, Optional
 
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -19,8 +20,13 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
+    merge_tags,
+)
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -124,10 +130,8 @@ class NebiusCompute(
             for offer in offers
         ]
 
-    def 
-
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def create_instance(
         self,
@@ -150,6 +154,18 @@ class NebiusCompute(
         if backend_data.cluster is not None:
             cluster_id = backend_data.cluster.id
 
+        labels = {
+            "owner": "dstack",
+            "dstack_project": instance_config.project_name.lower(),
+            "dstack_name": instance_config.instance_name,
+            "dstack_user": instance_config.user.lower(),
+        }
+        labels = merge_tags(
+            base_tags=labels,
+            backend_tags=self.config.tags,
+            resource_tags=instance_config.tags,
+        )
+        labels = resources.filter_invalid_labels(labels)
         gpus = instance_offer.instance.resources.gpus
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
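The Nebius backend now attaches dstack-owned labels to the disk and instance it creates, merged with backend-level and per-resource tags and filtered for validity. A standalone sketch of one plausible merge order; the actual precedence is defined by dstack's merge_tags, which this diff does not show:

from typing import Optional


def merge_tags_sketch(
    base_tags: dict[str, str],
    backend_tags: Optional[dict[str, str]],
    resource_tags: Optional[dict[str, str]],
) -> dict[str, str]:
    # Assumed precedence: resource tags override backend tags, which override base labels.
    merged = dict(base_tags)
    merged.update(backend_tags or {})
    merged.update(resource_tags or {})
    return merged


labels = merge_tags_sketch(
    base_tags={"owner": "dstack", "dstack_project": "main", "dstack_name": "my-instance"},
    backend_tags={"team": "ml"},
    resource_tags={"env": "dev"},
)
print(labels)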
@@ -159,6 +175,7 @@ class NebiusCompute(
             image_family="ubuntu24.04-cuda12"
             if gpus and gpus[0].name == "B200"
             else "ubuntu22.04-cuda12",
+            labels=labels,
         )
         create_instance_op = None
         try:
@@ -184,6 +201,7 @@ class NebiusCompute(
                 disk_id=create_disk_op.resource_id,
                 subnet_id=self._get_subnet_id(instance_offer.region),
                 preemptible=instance_offer.instance.resources.spot,
+                labels=labels,
             )
             _wait_for_instance(self._sdk, create_instance_op)
         except BaseException:
dstack/_internal/core/backends/nebius/configurator.py

@@ -3,6 +3,7 @@ import json
 from nebius.aio.service_error import RequestError
 
 from dstack._internal.core.backends.base.configurator import (
+    TAGS_MAX_NUM,
     BackendRecord,
     Configurator,
     raise_invalid_credentials_error,
@@ -18,6 +19,7 @@ from dstack._internal.core.backends.nebius.models import (
     NebiusServiceAccountCreds,
     NebiusStoredConfig,
 )
+from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.backends.base import BackendType
 
 
@@ -53,6 +55,19 @@ class NebiusConfigurator(
                     f" some of the valid options: {sorted(valid_fabrics)}"
                 ),
             )
+        self._check_config_tags(config)
+
+    def _check_config_tags(self, config: NebiusBackendConfigWithCreds):
+        if not config.tags:
+            return
+        if len(config.tags) > TAGS_MAX_NUM:
+            raise ServerClientError(
+                f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed."
+            )
+        try:
+            resources.validate_labels(config.tags)
+        except BackendError as e:
+            raise ServerClientError(e.args[0])
 
     def create_backend(
         self, project_name: str, config: NebiusBackendConfigWithCreds
dstack/_internal/core/backends/nebius/models.py

@@ -1,4 +1,6 @@
-
+import json
+from pathlib import Path
+from typing import Annotated, Dict, Literal, Optional, Union
 
 from pydantic import Field, root_validator
 
@@ -27,16 +29,38 @@ class NebiusServiceAccountCreds(CoreModel):
             )
         ),
     ]
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None
 
 
 class NebiusServiceAccountFileCreds(CoreModel):
     type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
         "service_account"
    )
-    service_account_id: Annotated[
-
+    service_account_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "Service account ID. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
+    public_key_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "ID of the service account public key. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
     private_key_file: Annotated[
-        Optional[str],
+        Optional[str],
+        Field(
+            description=(
+                "Path to the service account private key. Set automatically if `filename` or `private_key_content` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
     ] = None
     private_key_content: Annotated[
         Optional[str],
@@ -44,13 +68,35 @@ class NebiusServiceAccountFileCreds(CoreModel):
             description=(
                 "Content of the service account private key. When configuring via"
                 " `server/config.yml`, it's automatically filled from `private_key_file`."
-                " When configuring via UI, it has to be specified explicitly
+                " When configuring via UI, it has to be specified explicitly"
             )
         ),
     ] = None
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None
 
     @root_validator
     def fill_data(cls, values):
+        if filename := values.get("filename"):
+            try:
+                with open(Path(filename).expanduser()) as f:
+                    data = json.load(f)
+                from nebius.base.service_account.credentials_file import (
+                    ServiceAccountCredentials,
+                )
+
+                credentials = ServiceAccountCredentials.from_json(data)
+                subject = credentials.subject_credentials
+                values["service_account_id"] = subject.sub
+                values["public_key_id"] = subject.kid
+                values["private_key_content"] = subject.private_key
+            except OSError:
+                raise ValueError(f"No such file {filename}")
+            except Exception as e:
+                raise ValueError(f"Failed to parse credentials file {filename}: {e}")
+            return values
+
         return fill_data(
             values, filename_field="private_key_file", data_field="private_key_content"
         )
@@ -95,6 +141,12 @@ class NebiusBackendConfig(CoreModel):
             )
         ),
     ] = None
+    tags: Annotated[
+        Optional[Dict[str, str]],
+        Field(
+            description="The tags (labels) that will be assigned to resources created by `dstack`"
+        ),
+    ] = None
 
 
 class NebiusBackendConfigWithCreds(NebiusBackendConfig):