gpustack-runtime 0.1.41.post2__py3-none-any.whl → 0.1.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +3 -1
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  6. gpustack_runtime/deployer/docker.py +109 -148
  7. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +21 -3
  8. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
  9. gpustack_runtime/deployer/kuberentes.py +91 -126
  10. gpustack_runtime/deployer/podman.py +89 -122
  11. gpustack_runtime/detector/__init__.py +2 -0
  12. gpustack_runtime/detector/__types__.py +26 -0
  13. gpustack_runtime/detector/amd.py +28 -8
  14. gpustack_runtime/detector/ascend.py +49 -4
  15. gpustack_runtime/detector/cambricon.py +3 -0
  16. gpustack_runtime/detector/hygon.py +16 -1
  17. gpustack_runtime/detector/iluvatar.py +6 -0
  18. gpustack_runtime/detector/metax.py +8 -0
  19. gpustack_runtime/detector/mthreads.py +11 -0
  20. gpustack_runtime/detector/nvidia.py +139 -134
  21. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  22. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  23. gpustack_runtime/detector/thead.py +135 -127
  24. gpustack_runtime/envs.py +7 -6
  25. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
  26. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
  27. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
  28. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
  29. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/deployer/cdi/__utils__.py
@@ -147,6 +147,7 @@ def path_to_cdi_mount(
     path: str,
     container_path: str | None = None,
     options: list[str] | None = None,
+    ignore_notfound: bool = False,
 ) -> ConfigMount | None:
     """
     Convert a file/directory path to a ConfigMount.
@@ -158,13 +159,15 @@ def path_to_cdi_mount(
             Path to the file or directory inside the container.
         options:
             Mount options.
+        ignore_notfound:
+            Whether to ignore if the path does not exist.

     Returns:
         The ConfigMount object.
         None if the path does not exist.

     """
-    if not Path(path).exists():
+    if not Path(path).exists() and not ignore_notfound:
         return None

     if container_path is None:
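Note (not part of the diff): a minimal usage sketch of the new flag; the import path and the device path below are assumptions for illustration only.

    # Hypothetical caller of the patched helper above (assumed module path).
    from gpustack_runtime.deployer.cdi.__utils__ import path_to_cdi_mount

    # Default behaviour: returns None when the host path does not exist.
    mount = path_to_cdi_mount("/dev/kfd")

    # With ignore_notfound=True the existence check is skipped, so a ConfigMount
    # is still built even though the host path is missing.
    mount = path_to_cdi_mount("/dev/kfd", ignore_notfound=True)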
gpustack_runtime/deployer/docker.py
@@ -4,13 +4,11 @@ import contextlib
 import io
 import json
 import logging
-import operator
 import os
 import socket
 import sys
 import tarfile
 from dataclasses import dataclass, field
-from functools import reduce
 from math import ceil
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
@@ -81,17 +79,6 @@ class DockerWorkloadPlan(WorkloadPlan):
             Image used for the pause container.
         unhealthy_restart_image (str):
             Image used for unhealthy restart container.
-        resource_key_runtime_env_mapping: (dict[str, str]):
-            Mapping from resource names to environment variable names for device allocation,
-            which is used to tell the Container Runtime which GPUs to mount into the container.
-            For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
-            which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
-            With privileged mode, the container can access all GPUs even if specified.
-        resource_key_backend_env_mapping: (dict[str, list[str]]):
-            Mapping from resource names to environment variable names for device runtime,
-            which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
-            For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
-            which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
         namespace (str | None):
             Namespace of the workload.
         name (str):
@@ -845,7 +832,7 @@ class DockerDeployer(EndoscopicDeployer):
             msg = f"Failed to upload ephemeral files to container {container.name}"
             raise OperationError(msg)

-    def _create_containers(  # noqa: C901
+    def _create_containers(
         self,
         workload: DockerWorkloadPlan,
         ephemeral_volume_name_mapping: dict[str, str],
@@ -955,146 +942,120 @@ class DockerDeployer(EndoscopicDeployer):
                 envs.GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY.lower()
                 == "cdi"
             )
+            fmt = "plain" if not cdi else "cdi"

-            r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
-            r_k_backend_env = workload.resource_key_backend_env_mapping or {}
-            vd_manus, vd_env, vd_cdis, vd_values = (
-                self.get_visible_devices_materials()
-            )
             for r_k, r_v in c.resources.items():
-                match r_k:
-                    case "cpu":
-                        if isinstance(r_v, int | float):
-                            create_options["cpu_shares"] = ceil(r_v * 1024)
-                        elif isinstance(r_v, str) and r_v.isdigit():
-                            create_options["cpu_shares"] = ceil(float(r_v) * 1024)
-                    case "memory":
-                        if isinstance(r_v, int):
-                            create_options["mem_limit"] = r_v
-                            create_options["mem_reservation"] = r_v
-                            create_options["memswap_limit"] = r_v
-                        elif isinstance(r_v, str):
-                            v = r_v.lower().removesuffix("i")
-                            create_options["mem_limit"] = v
-                            create_options["mem_reservation"] = v
-                            create_options["memswap_limit"] = v
-                    case _:
-                        if r_k in r_k_runtime_env:
-                            # Set env if resource key is mapped.
-                            runtime_env = [r_k_runtime_env[r_k]]
-                        elif (
-                            r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
-                        ):
-                            # Set env if auto-mapping key is matched.
-                            runtime_env = list(vd_env.keys())
-                        else:
-                            continue
+                if r_k == "cpu":
+                    if isinstance(r_v, int | float):
+                        create_options["cpu_shares"] = ceil(r_v * 1024)
+                    elif isinstance(r_v, str) and r_v.isdigit():
+                        create_options["cpu_shares"] = ceil(float(r_v) * 1024)
+                    continue
+                if r_k == "memory":
+                    if isinstance(r_v, int):
+                        create_options["mem_limit"] = r_v
+                        create_options["mem_reservation"] = r_v
+                        create_options["memswap_limit"] = r_v
+                    elif isinstance(r_v, str):
+                        v = r_v.lower().removesuffix("i")
+                        create_options["mem_limit"] = v
+                        create_options["mem_reservation"] = v
+                        create_options["memswap_limit"] = v
+                    continue

-                        if r_k in r_k_backend_env:
-                            # Set env if resource key is mapped.
-                            backend_env = r_k_backend_env[r_k]
-                        else:
-                            # Otherwise, use the default backend env names.
-                            backend_env = reduce(
-                                operator.add,
-                                list(vd_env.values()),
-                            )
+                if (
+                    r_k
+                    in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+                ):
+                    # Set env if resource key is mapped.
+                    runtime_envs = [
+                        envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+                            r_k
+                        ],
+                    ]
+                elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+                    # Set env if auto-mapping key is matched.
+                    runtime_envs = self.get_runtime_envs()
+                else:
+                    continue

-                        privileged = create_options.get("privileged", False)
-
-                        # Generate CDI config if not yet.
-                        if cdi and envs.GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE:
-                            for re in runtime_env:
-                                cdi_dump_config(
-                                    manufacturer=vd_manus[re],
-                                    output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
-                                )
-
-                        # Configure device access environment variable.
-                        if r_v == "all" and backend_env:
-                            # Configure privileged if requested all devices.
-                            create_options["privileged"] = True
-                            # Then, set container backend visible devices env to all devices,
-                            # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                            # and mount corresponding libs if needed.
-                            for re in runtime_env:
-                                # Request device via CDI.
-                                if cdi:
-                                    rv = [
-                                        f"{vd_cdis[re]}={v}"
-                                        for v in (vd_values.get(re) or ["all"])
-                                    ]
-                                    if "device_requests" not in create_options:
-                                        create_options["device_requests"] = []
-                                    create_options["device_requests"].append(
-                                        docker.types.DeviceRequest(
-                                            driver="cdi",
-                                            count=0,
-                                            device_ids=rv,
-                                        ),
-                                    )
-                                    continue
-                                # Request device via visible devices env.
-                                rv = ",".join(vd_values.get(re) or ["all"])
-                                create_options["environment"][re] = rv
+                privileged = create_options.get("privileged", False)
+                resource_values = [x.strip() for x in r_v.split(",")]
+
+                # Generate CDI config if not yet.
+                if cdi and envs.GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE:
+                    for ren in runtime_envs:
+                        r_m = self.get_manufacturer(ren)
+                        cdi_dump_config(
+                            manufacturer=r_m,
+                            output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+                        )
+
+                # Request devices.
+                if r_v == "all":
+                    # Configure privileged.
+                    create_options["privileged"] = True
+                    # Request all devices.
+                    for ren in runtime_envs:
+                        r_vs = self.get_runtime_visible_devices(ren, fmt)
+                        # Request device via CDI.
+                        if cdi:
+                            if "device_requests" not in create_options:
+                                create_options["device_requests"] = []
+                            create_options["device_requests"].append(
+                                docker.types.DeviceRequest(
+                                    driver="cdi",
+                                    count=0,
+                                    device_ids=r_vs,
+                                ),
+                            )
+                            continue
+                        # Request device via visible devices env.
+                        create_options["environment"][ren] = ",".join(r_vs)
+                else:
+                    # Request specific devices.
+                    for ren in runtime_envs:
+                        # Request all devices if privileged,
+                        # otherwise, normalize requested devices.
+                        if privileged:
+                            r_vs = self.get_runtime_visible_devices(ren, fmt)
                         else:
-                            # Set env to the allocated device IDs if no privileged,
-                            # otherwise, set container backend visible devices env to all devices,
-                            # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                            # and mount corresponding libs if needed.
-                            for re in runtime_env:
-                                # Request device via CDI.
-                                if cdi:
-                                    if not privileged:
-                                        rv = [
-                                            f"{vd_cdis[re]}={v.strip()}"
-                                            for v in r_v.split(",")
-                                        ]
-                                    else:
-                                        rv = [
-                                            f"{vd_cdis[re]}={v}"
-                                            for v in (vd_values.get(re) or ["all"])
-                                        ]
-                                    if "device_requests" not in create_options:
-                                        create_options["device_requests"] = []
-                                    create_options["device_requests"].append(
-                                        docker.types.DeviceRequest(
-                                            driver="cdi",
-                                            count=0,
-                                            device_ids=rv,
-                                        ),
-                                    )
-                                    continue
-                                # Request device via visible devices env.
-                                if not privileged:
-                                    rv = str(r_v)
-                                else:
-                                    rv = ",".join(vd_values.get(re) or ["all"])
-                                create_options["environment"][re] = rv
-
-                        # Configure runtime device access environment variables.
-                        if r_v != "all" and privileged:
-                            for be in backend_env:
-                                create_options["environment"][be] = (
-                                    self.align_backend_visible_devices_env_values(
-                                        be,
-                                        str(r_v),
-                                    )
-                                )
-
-                        # Configure affinity if applicable.
-                        if (
-                            envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
-                            or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
-                        ):
-                            cpus, numas = self.get_visible_devices_affinities(
-                                runtime_env,
-                                r_v,
+                            r_vs = self.map_runtime_visible_devices(
+                                ren,
+                                resource_values,
+                                fmt,
                             )
-                            if cpus:
-                                create_options["cpuset_cpus"] = cpus
-                            if numas and envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
-                                create_options["cpuset_mems"] = numas
+                        # Request device via CDI.
+                        if cdi:
+                            if "device_requests" not in create_options:
+                                create_options["device_requests"] = []
+                            create_options["device_requests"].append(
+                                docker.types.DeviceRequest(
+                                    driver="cdi",
+                                    count=0,
+                                    device_ids=r_vs,
+                                ),
+                            )
+                            continue
+                        # Request device via visible devices env.
+                        create_options["environment"][ren] = ",".join(r_vs)
+
+                # If not requesting all devices but privileged,
+                # must configure visible devices.
+                if r_v != "all" and privileged:
+                    b_vs = self.map_backend_visible_devices(
+                        runtime_envs,
+                        resource_values,
+                    )
+                    create_options["environment"].update(b_vs)
+
+                # Configure affinity if applicable.
+                create_options.update(
+                    self.map_visible_devices_affinities(
+                        runtime_envs,
+                        resource_values,
+                    ),
+                )

             # Parameterize mounts.
             self._append_container_mounts(
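Note (not part of the diff): the rewritten branch above keeps two device-injection routes. The sketch below shows both with docker-py; the CDI device names, the NVIDIA_VISIBLE_DEVICES variable, and the image/runtime are illustrative assumptions, not values the package hard-codes here.

    # Minimal sketch of the two injection routes (docker-py), with example values.
    import docker

    client = docker.from_env()

    # Route 1: CDI - a DeviceRequest with driver="cdi" carrying
    # "<cdi-kind>=<device>" identifiers.
    cdi_request = docker.types.DeviceRequest(
        driver="cdi",
        count=0,
        device_ids=["nvidia.com/gpu=0", "nvidia.com/gpu=1"],
    )
    client.containers.run(
        "ubuntu:24.04",
        "nvidia-smi",
        device_requests=[cdi_request],
        detach=True,
    )

    # Route 2: a visible-devices environment variable, left for the container
    # toolkit to resolve (example variable name and values).
    client.containers.run(
        "ubuntu:24.04",
        "nvidia-smi",
        environment={"NVIDIA_VISIBLE_DEVICES": "0,1"},
        runtime="nvidia",  # assumption: NVIDIA Container Runtime is configured
        detach=True,
    )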
gpustack_runtime/deployer/k8s/deviceplugin/__init__.py
@@ -65,7 +65,7 @@ async def serve_async(
         if not devices:
             continue

-        allocation_policy = _get_device_allocation_policy(manu)
+        allocation_policy = get_device_allocation_policy(manu)
         logger.info(
             "Using device allocation policy '%s' for manufacturer '%s'",
             allocation_policy,
@@ -277,7 +277,23 @@ def is_kubelet_socket_accessible(


 @lru_cache
-def _get_device_allocation_policy(
+def get_resource_injection_policy() -> Literal["env", "kdp"]:
+    """
+    Get the resource injection policy (in lowercase) for the deployer.
+
+    Returns:
+        The resource injection policy.
+
+    """
+    policy = envs.GPUSTACK_RUNTIME_KUBERNETES_RESOURCE_INJECTION_POLICY.lower()
+    if policy != "auto":
+        return policy
+
+    return "kdp" if is_kubelet_socket_accessible() else "env"
+
+
+@lru_cache
+def get_device_allocation_policy(
     manufacturer: ManufacturerEnum,
 ) -> Literal["env", "cdi", "opaque"]:
     """
@@ -307,7 +323,7 @@ def _get_device_allocation_policy(

     if manufacturer in [
         ManufacturerEnum.AMD,
-        # ManufacturerEnum.ASCEND,  # Prioritize using Env policy for Ascend.
+        ManufacturerEnum.ASCEND,
         ManufacturerEnum.HYGON,
         ManufacturerEnum.ILUVATAR,
         ManufacturerEnum.METAX,
@@ -320,6 +336,8 @@ def _get_device_allocation_policy(

 __all__ = [
     "cdi_kind_to_kdp_resource",
+    "get_device_allocation_policy",
+    "get_resource_injection_policy",
     "is_kubelet_socket_accessible",
     "serve",
     "serve_async",
gpustack_runtime/deployer/k8s/deviceplugin/plugin.py
@@ -11,7 +11,7 @@ import grpc
 from grpc_interceptor import AsyncServerInterceptor
 from grpc_interceptor.exceptions import GrpcException

-from ....detector import Device, str_range_to_list
+from ....detector import Device, DeviceMemoryStatusEnum, str_range_to_list
 from ...cdi import (
     generate_config,
     manufacturer_to_cdi_kind,
@@ -40,6 +40,7 @@ from ..types.kubelet.deviceplugin.v1beta1 import (
     RegisterRequest,
     RegistrationStub,
     TopologyInfo,
+    Unhealthy,
     Version,
     add_DevicePluginServicer_to_server,
 )
@@ -159,7 +160,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         self._runtime_env = manufacturer_to_runtime_env(device.manufacturer)
         self._kdp_resource = cdi_kind_to_kdp_resource(
             cdi_kind=self._cdi_kind,
-            device_index=device.index,
+            device_index=str(device.index),
         )

         super().__init__(self._kdp_resource)
@@ -334,12 +335,12 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
             The response containing the list of devices.

         """
-        device_id = (
-            self._device.uuid if self._id_by == "uuid" else str(self._device.index)
-        )
-
         dp_devices: list[DevicePluginDevice] = []
-        dp_device_health = Healthy
+        dp_device_health = (
+            Healthy
+            if self._device.memory_status == DeviceMemoryStatusEnum.HEALTHY
+            else Unhealthy
+        )
         dp_device_topo = TopologyInfo(
             nodes=[
                 NUMANode(
@@ -352,7 +353,10 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         )

         for device_replica in range(1, self._max_allocations + 1):
-            dp_device_id = _to_device_plugin_device_id(device_id, device_replica)
+            dp_device_id = _to_device_plugin_device_id(
+                str(self._device.index),
+                device_replica,
+            )
             dp_devices.append(
                 DevicePluginDevice(
                     ID=dp_device_id,
@@ -419,28 +423,25 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         req: ContainerAllocateRequest,
     ) -> ContainerAllocateResponse:
         policy = self._allocation_policy
-        request_dp_device_ids = req.devices_ids
+        device_id = self._device.uuid
+        if self._id_by == "index":
+            device_id = str(self._device.index)

         # CDI device allocation.
         if policy == "cdi":
-            cdi_devices: list[CDIDevice] = []
-            for dp_device_id in request_dp_device_ids:
-                device_id, _ = _from_device_plugin_device_id(dp_device_id)
-                cdi_devices.append(
+            return ContainerAllocateResponse(
+                cdi_devices=[
                     CDIDevice(
                         name=f"{self._cdi_kind}={device_id}",
                     ),
-                )
-
-            return ContainerAllocateResponse(
-                cdi_devices=cdi_devices,
+                ],
             )

         # Environment variable device allocation.
         if policy == "env":
             return ContainerAllocateResponse(
                 envs={
-                    self._runtime_env: ",".join(request_dp_device_ids),
+                    self._runtime_env: device_id,
                 },
             )

@@ -509,7 +510,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
 @lru_cache
 def cdi_kind_to_kdp_resource(
     cdi_kind: str,
-    device_index: int,
+    device_index: str,
 ) -> str:
     """
     Map CDI kind and device index to a Kubernetes Device Plugin resource name.