gpustack-runtime 0.1.41.post2__py3-none-any.whl → 0.1.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +3 -1
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/docker.py +109 -148
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +21 -3
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
- gpustack_runtime/deployer/kuberentes.py +91 -126
- gpustack_runtime/deployer/podman.py +89 -122
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/amd.py +28 -8
- gpustack_runtime/detector/ascend.py +49 -4
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +16 -1
- gpustack_runtime/detector/iluvatar.py +6 -0
- gpustack_runtime/detector/metax.py +8 -0
- gpustack_runtime/detector/mthreads.py +11 -0
- gpustack_runtime/detector/nvidia.py +139 -134
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +135 -127
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/pyixml/__init__.py
CHANGED

@@ -960,6 +960,14 @@ NVML_HOST_VGPU_MODE_SRIOV = 1
 # GSP firmware
 NVML_GSP_FIRMWARE_VERSION_BUF_SIZE = 0x40

+# Health
+IXML_HEALTH_SYSHUB_ERROR = 0x0000000000000001
+IXML_HEALTH_MC_ERROR = 0x0000000000000002
+IXML_HEALTH_ECC_ERROR = 0x0000000000000010
+IXML_HEALTH_MEMORY_ERROR = 0x0000000000000020
+IXML_HEALTH_PCIE_ERROR = 0x0000000000000040
+IXML_HEALTH_OK = 0x0000000000000000
+

 ## Error Checking ##
 class NVMLError(Exception):

@@ -5267,3 +5275,11 @@ def nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo):
     ret = fn(device, gpuFabricInfo)
     _nvmlCheckReturn(ret)
     return ret
+
+
+def ixmlDeviceGetHealth(device):
+    c_health = c_longlong()
+    fn = _nvmlGetFunctionPointer("ixmlDeviceGetHealth")
+    ret = fn(device, byref(c_health))
+    _nvmlCheckReturn(ret)
+    return c_health.value
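The new ixmlDeviceGetHealth wrapper returns a bitmask that can be tested against the IXML_HEALTH_* constants added above. A minimal usage sketch follows; it assumes pyixml also exposes the usual pynvml-style nvmlInit/nvmlDeviceGetCount/nvmlDeviceGetHandleByIndex/nvmlShutdown helpers, which this diff does not show.

# Hedged sketch: iterate devices and decode the health bitmask.
from gpustack_runtime.detector import pyixml

pyixml.nvmlInit()  # assumed pynvml-style initializer
try:
    for idx in range(pyixml.nvmlDeviceGetCount()):
        handle = pyixml.nvmlDeviceGetHandleByIndex(idx)
        health = pyixml.ixmlDeviceGetHealth(handle)  # added in 0.1.42
        if health == pyixml.IXML_HEALTH_OK:
            print(f"device {idx}: healthy")
            continue
        # The value is a bitmask; test the individual error bits.
        if health & pyixml.IXML_HEALTH_ECC_ERROR:
            print(f"device {idx}: ECC error reported")
        if health & pyixml.IXML_HEALTH_MEMORY_ERROR:
            print(f"device {idx}: memory error reported")
        if health & pyixml.IXML_HEALTH_PCIE_ERROR:
            print(f"device {idx}: PCIe error reported")
finally:
    pyixml.nvmlShutdown()  # assumed pynvml-style teardown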
gpustack_runtime/detector/pyrocmsmi/__init__.py
CHANGED

@@ -393,3 +393,17 @@ def rsmi_is_p2p_accessible(device_a=0, device_b=0):
     )
     _rocmsmiCheckReturn(ret)
     return c_accessible.value
+
+
+def rsmi_dev_ecc_count_get(device=0, gpu_block=None):
+    if gpu_block is None:
+        gpu_block = rsmi_gpu_block_t.RSMI_GPU_BLOCK_UMC
+    c_error_count = rsmi_error_count_t()
+    fn = _rocmsmiGetFunctionPointer("rsmi_dev_ecc_count_get")
+    ret = fn(
+        device,
+        gpu_block,
+        byref(c_error_count),
+    )
+    _rocmsmiCheckReturn(ret)
+    return c_error_count
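The new rsmi_dev_ecc_count_get wrapper defaults to the UMC (memory-controller) block and returns an rsmi_error_count_t structure. A hedged usage sketch follows; it assumes pyrocmsmi also wraps rsmi_init/rsmi_shut_down and rsmi_num_monitor_devices, and that the returned structure carries the ROCm SMI correctable_err/uncorrectable_err fields — none of which appear in this hunk.

# Hedged sketch: report ECC counters for every visible AMD GPU.
from gpustack_runtime.detector import pyrocmsmi

pyrocmsmi.rsmi_init()  # assumed wrapper around rsmi_init(0)
try:
    for dev in range(pyrocmsmi.rsmi_num_monitor_devices()):  # assumed helper
        ecc = pyrocmsmi.rsmi_dev_ecc_count_get(dev)  # UMC block by default
        print(
            f"device {dev}: correctable={ecc.correctable_err} "
            f"uncorrectable={ecc.uncorrectable_err}"
        )
finally:
    pyrocmsmi.rsmi_shut_down()  # assumed teardown wrapper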
gpustack_runtime/detector/thead.py
CHANGED

@@ -12,6 +12,7 @@ from . import pyhgml
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,

@@ -138,17 +139,33 @@ class THeadDetector(Detector):
     )
     dev_numa = bitmask_to_str(list(dev_node_affinity))

+    dev_temp = None
+    with contextlib.suppress(pyhgml.HGMLError):
+        dev_temp = pyhgml.hgmlDeviceGetTemperature(
+            dev,
+            pyhgml.HGML_TEMPERATURE_GPU,
+        )
+
+    dev_power = None
+    dev_power_used = None
+    with contextlib.suppress(pyhgml.HGMLError):
+        dev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(dev)
+        dev_power = dev_power // 1000  # mW to W
+        dev_power_used = (
+            pyhgml.hgmlDeviceGetPowerUsage(dev) // 1000
+        )  # mW to W
+
     dev_mig_mode = pyhgml.HGML_DEVICE_MIG_DISABLE
     with contextlib.suppress(pyhgml.HGMLError):
         dev_mig_mode, _ = pyhgml.hgmlDeviceGetMigMode(dev)

+    dev_index = dev_idx
+    if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+        dev_index = pyhgml.hgmlDeviceGetMinorNumber(dev)
+
     # With MIG disabled, treat as a single device.

     if dev_mig_mode == pyhgml.HGML_DEVICE_MIG_DISABLE:
-        dev_index = dev_idx
-        if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
-            dev_index = pyhgml.hgmlDeviceGetMinorNumber(dev)
-
         dev_name = pyhgml.hgmlDeviceGetName(dev)

         dev_uuid = pyhgml.hgmlDeviceGetUUID(dev)

@@ -171,6 +188,7 @@ class THeadDetector(Detector):

         dev_mem = 0
         dev_mem_used = 0
+        dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
         with contextlib.suppress(pyhgml.HGMLError):
             dev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(dev)
             dev_mem = byte_to_mebibyte(  # byte to MiB

@@ -179,22 +197,14 @@ class THeadDetector(Detector):
             dev_mem_used = byte_to_mebibyte(  # byte to MiB
                 dev_mem_info.used,
             )
-
-        dev_temp = None
-        with contextlib.suppress(pyhgml.HGMLError):
-            dev_temp = pyhgml.hgmlDeviceGetTemperature(
+            dev_mem_ecc_errors = pyhgml.hgmlDeviceGetMemoryErrorCounter(
                 dev,
-                pyhgml.HGML_TEMPERATURE_GPU,
+                pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                pyhgml.HGML_VOLATILE_ECC,
+                pyhgml.HGML_MEMORY_LOCATION_DRAM,
             )
-
-        dev_power = None
-        dev_power_used = None
-        with contextlib.suppress(pyhgml.HGMLError):
-            dev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(dev)
-            dev_power = dev_power // 1000  # mW to W
-            dev_power_used = (
-                pyhgml.hgmlDeviceGetPowerUsage(dev) // 1000
-            )  # mW to W
+            if dev_mem_ecc_errors > 0:
+                dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY

         dev_is_vgpu = False
         if dev_bdf:

@@ -221,6 +231,7 @@ class THeadDetector(Detector):
                 memory=dev_mem,
                 memory_used=dev_mem_used,
                 memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                memory_status=dev_mem_status,
                 temperature=dev_temp,
                 power=dev_power,
                 power_used=dev_power_used,

@@ -236,35 +247,34 @@ class THeadDetector(Detector):
         mdev_cores = None
         mdev_count = pyhgml.hgmlDeviceGetMaxMigDeviceCount(dev)
         for mdev_idx in range(mdev_count):
-            mdev =
+            mdev = None
+            with contextlib.suppress(pyhgml.HGMLError):
+                mdev = pyhgml.hgmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+            if not mdev:
+                continue

-            mdev_index = mdev_idx
+            mdev_index = mdev_idx + dev_count * (dev_idx + 1)
             mdev_uuid = pyhgml.hgmlDeviceGetUUID(mdev)

-            mdev_mem
+            mdev_mem = 0
+            mdev_mem_used = 0
+            mdev_mem_status = DeviceMemoryStatusEnum.HEALTHY
             with contextlib.suppress(pyhgml.HGMLError):
                 mdev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(mdev)
-                byte_to_mebibyte(  # byte to MiB
+                mdev_mem = byte_to_mebibyte(  # byte to MiB
                     mdev_mem_info.total,
                 )
-                byte_to_mebibyte(  # byte to MiB
+                mdev_mem_used = byte_to_mebibyte(  # byte to MiB
                     mdev_mem_info.used,
                 )
-
-                mdev_temp = pyhgml.hgmlDeviceGetTemperature(
-                    mdev,
-                    pyhgml.HGML_TEMPERATURE_GPU,
-                )
-
-                mdev_power = None
-                with contextlib.suppress(pyhgml.HGMLError):
-                    mdev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(
+                mdev_mem_ecc_errors = pyhgml.hgmlDeviceGetMemoryErrorCounter(
                     mdev,
+                    pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                    pyhgml.HGML_AGGREGATE_ECC,
+                    pyhgml.HGML_MEMORY_LOCATION_SRAM,
                 )
-
-
-                    pyhgml.hgmlDeviceGetPowerUsage(mdev) // 1000
-                )  # mW to W
+                if mdev_mem_ecc_errors > 0:
+                    mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY

             mdev_appendix = {
                 "vgpu": True,

@@ -279,63 +289,64 @@ class THeadDetector(Detector):

             mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if dev_gi_prf.id != mdev_gi_info.profileId:
-                continue
-            except pyhgml.HGMLError:
+            mdev_gi = pyhgml.hgmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
+            mdev_ci = pyhgml.hgmlGpuInstanceGetComputeInstanceById(
+                mdev_gi,
+                mdev_ci_id,
+            )
+            mdev_gi_info = pyhgml.hgmlGpuInstanceGetInfo(mdev_gi)
+            mdev_ci_info = pyhgml.hgmlComputeInstanceGetInfo(mdev_ci)
+            for dev_gi_prf_id in range(
+                pyhgml.HGML_GPU_INSTANCE_PROFILE_COUNT,
+            ):
+                try:
+                    dev_gi_prf = pyhgml.hgmlDeviceGetGpuInstanceProfileInfo(
+                        dev,
+                        dev_gi_prf_id,
+                    )
+                    if dev_gi_prf.id != mdev_gi_info.profileId:
                         continue
+                except pyhgml.HGMLError:
+                    continue

-
-
+                for dev_ci_prf_id in range(
+                    pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_COUNT,
+                ):
+                    for dev_cig_prf_id in range(
+                        pyhgml.HGML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
                     ):
-
-                        pyhgml.
-
-                        try:
-                            mdev_ci_prf = pyhgml.hgmlGpuInstanceGetComputeInstanceProfileInfo(
-                                mdev_gi,
-                                dev_ci_prf_id,
-                                dev_cig_prf_id,
-                            )
-                            if mdev_ci_prf.id != mdev_ci_info.profileId:
-                                continue
-                        except pyhgml.HGMLError:
-                            continue
-
-                        ci_slice = _get_compute_instance_slice(
+                        try:
+                            mdev_ci_prf = pyhgml.hgmlGpuInstanceGetComputeInstanceProfileInfo(
+                                mdev_gi,
                                 dev_ci_prf_id,
+                                dev_cig_prf_id,
                             )
-
-
-
-
-                            )
+                            if mdev_ci_prf.id != mdev_ci_info.profileId:
+                                continue
+                        except pyhgml.HGMLError:
+                            continue

-
-
+                        ci_slice = _get_compute_instance_slice(dev_ci_prf_id)
+                        gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+                        if ci_slice == gi_slice:
+                            if hasattr(dev_gi_prf, "name"):
+                                mdev_name = dev_gi_prf.name
                             else:
-
-
+                                gi_mem = round(
+                                    math.ceil(dev_gi_prf.memorySizeMB >> 10),
                                 )
+                                mdev_name = f"{gi_slice}g.{gi_mem}gb"
+                        elif hasattr(mdev_ci_prf, "name"):
+                            mdev_name = mdev_ci_prf.name
+                        else:
+                            gi_mem = round(
+                                math.ceil(dev_gi_prf.memorySizeMB >> 10),
+                            )
+                            mdev_name = f"{ci_slice}u.{gi_slice}g.{gi_mem}gb"

-
+                        mdev_cores = mdev_ci_prf.multiprocessorCount

-
+                        break

             ret.append(
                 Device(

@@ -352,9 +363,10 @@ class THeadDetector(Detector):
                     memory=mdev_mem,
                     memory_used=mdev_mem_used,
                     memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
-
-
-
+                    memory_status=mdev_mem_status,
+                    temperature=dev_temp,
+                    power=dev_power,
+                    power_used=dev_power_used,
                     appendix=mdev_appendix,
                 ),
             )

@@ -392,11 +404,17 @@ class THeadDetector(Detector):
            devices_count=len(devices),
        )

+        get_links_cache = {}
+
        try:
            pyhgml.hgmlInit()

            for i, dev_i in enumerate(devices):
-
+                dev_i_bdf = dev_i.appendix.get("bdf")
+                if dev_i.appendix.get("vgpu", False):
+                    dev_i_handle = pyhgml.hgmlDeviceGetHandleByPciBusId(dev_i_bdf)
+                else:
+                    dev_i_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_i.uuid)

                # Get NUMA and CPU affinities.
                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")

@@ -405,7 +423,12 @@ class THeadDetector(Detector):
                )

                # Get links state if applicable.
-                if
+                if dev_i_bdf in get_links_cache:
+                    dev_i_links_state = get_links_cache[dev_i_bdf]
+                else:
+                    dev_i_links_state = _get_links_state(dev_i_handle)
+                    get_links_cache[dev_i_bdf] = dev_i_links_state
+                if dev_i_links_state:
                    ret.appendices[i].update(dev_i_links_state)
                    # In practice, if a card has an active *Link,
                    # then other cards in the same machine should be interconnected with it through the *Link.

@@ -422,21 +445,30 @@ class THeadDetector(Detector):
                    if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                        continue

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    dev_j_bdf = dev_j.appendix.get("bdf")
+                    if dev_i_bdf == dev_j_bdf:
+                        distance = TopologyDistanceEnum.SELF
+                    else:
+                        if dev_j.appendix.get("vgpu", False):
+                            dev_j_handle = pyhgml.hgmlDeviceGetHandleByPciBusId(
+                                dev_j_bdf,
+                            )
+                        else:
+                            dev_j_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_j.uuid)
+
+                        distance = TopologyDistanceEnum.UNK
+                        try:
+                            distance = pyhgml.hgmlDeviceGetTopologyCommonAncestor(
+                                dev_i_handle,
+                                dev_j_handle,
+                            )
+                        except pyhgml.HGMLError:
+                            debug_log_exception(
+                                logger,
+                                "Failed to get distance between device %d and %d",
+                                dev_i.index,
+                                dev_j.index,
+                            )

                    ret.devices_distances[i][j] = distance
                    ret.devices_distances[j][i] = distance

@@ -655,30 +687,6 @@ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
    raise AttributeError(msg)


-def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
-    """
-    Compute the memory size of a MIG compute instance in GiB.
-
-    Args:
-        dev_mem:
-            The total memory info of the parent GPU device.
-        dev_gi_prf:
-            The profile info of the GPU instance.
-
-    Returns:
-        The memory size in GiB.
-
-    """
-    mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte
-
-    gib = round(
-        math.ceil(mem / dev_mem.total * 8)
-        / 8
-        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
-    )
-    return gib
-
-
 def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
    """
    Get the number of slice for a given Compute Instance Profile ID.
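The recurring pattern in the thead.py hunks above is: query the uncorrected ECC error counter for a device (or MIG device) and flip its memory status from HEALTHY to UNHEALTHY when the count is non-zero. Below is a standalone sketch of that rule; the enum here is a stand-in for the real DeviceMemoryStatusEnum imported from gpustack_runtime/detector/__types__.py.

# Standalone sketch of the ECC-to-memory-status mapping added in 0.1.42.
from enum import Enum


class DeviceMemoryStatusEnum(str, Enum):  # stand-in for the real enum
    HEALTHY = "healthy"
    UNHEALTHY = "unhealthy"


def memory_status_from_ecc(uncorrected_ecc_errors: int) -> DeviceMemoryStatusEnum:
    """Any uncorrected ECC error marks the device memory as unhealthy."""
    if uncorrected_ecc_errors > 0:
        return DeviceMemoryStatusEnum.UNHEALTHY
    return DeviceMemoryStatusEnum.HEALTHY


assert memory_status_from_ecc(0) is DeviceMemoryStatusEnum.HEALTHY
assert memory_status_from_ecc(3) is DeviceMemoryStatusEnum.UNHEALTHY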
gpustack_runtime/envs.py
CHANGED

@@ -246,7 +246,7 @@ if TYPE_CHECKING:
     GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE: bool = True
     """
     Generate CDI specifications during deployment when using CDI resource injection policy,
-    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to
+    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
     Works only when `GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY` is set to `CDI`.
     Using internal knowledge to generate the CDI specifications for deployer,
     if the output file conflicts with other tools generating CDI specifications(e.g., NVIDIA Container Toolkit),

@@ -283,7 +283,7 @@
     Resource injection policy for the Kubernetes deployer (e.g., Auto, Env, KDP).
     `Auto`: Automatically choose the resource injection policy based on the environment.
     `Env`: Injects resources using standard environment variable, depends on underlying Container Toolkit, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES`.
-    `KDP`: Injects resources using Kubernetes Device Plugin
+    `KDP`: Injects resources using Kubernetes Device Plugin.
     """
     GPUSTACK_RUNTIME_KUBERNETES_KDP_PER_DEVICE_MAX_ALLOCATIONS: int | None = None
     """

@@ -294,14 +294,14 @@
     """
     Device allocation policy for the Kubernetes Device Plugin (e.g., CDI, Env, Opaque).
     `Auto`: Automatically choose the device allocation policy based on the environment.
-    `Env`: Allocates devices using runtime-visible environment variables
-    `CDI`: Allocates devices using generated CDI specifications, making it easy to debug and troubleshoot; requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
+    `Env`: Allocates devices using runtime-visible environment variables, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES`; requires Container Toolkit support.
+    `CDI`: Allocates devices using generated CDI specifications, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_CDI`, making it easy to debug and troubleshoot; requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
     `Opaque`: Uses internal logic for allocation, which is convenient for deployment but difficult to troubleshoot.
     """
     GPUSTACK_RUNTIME_KUBERNETES_KDP_CDI_SPECS_GENERATE: bool = True
     """
     Generate CDI specifications during deployment,
-    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to
+    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
     Works only when `GPUSTACK_RUNTIME_KUBERNETES_KDP_DEVICE_ALLOCATION_POLICY` is set to `CDI`.
     Using internal knowledge to generate the CDI specifications for deployer,
     if the output file conflicts with other tools generating CDI specifications(e.g., NVIDIA Container Toolkit),

@@ -344,7 +344,7 @@
     GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE: bool = True
     """
     Generate CDI specifications during deployment,
-    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to
+    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
     Using internal knowledge to generate the CDI specifications for deployer,
     if the output file conflicts with other tools generating CDI specifications(e.g., NVIDIA Container Toolkit),
     please disable this and remove the output file manually.

@@ -577,6 +577,7 @@ variables: dict[str, Callable[[], Any]] = {
     "GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID": lambda: to_set(
         getenv(
             "GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID",
+            "NVIDIA_VISIBLE_DEVICES",
         ),
         sep=",",
     ),
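The last hunk gives GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID a default of NVIDIA_VISIBLE_DEVICES when the variable is unset. A simplified sketch of that fallback behaviour follows; getenv and to_set below are stand-ins for the module's own helpers, not the real implementations.

# Hedged sketch of the new default for the UUID-valued visible-devices key.
import os


def getenv(name, default=None):  # stand-in for envs.getenv
    return os.environ.get(name, default)


def to_set(value, sep=","):  # stand-in for envs.to_set
    return {v.strip() for v in value.split(sep) if v.strip()} if value else set()


uuid_env_keys = to_set(
    getenv(
        "GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID",
        "NVIDIA_VISIBLE_DEVICES",  # new default introduced in 0.1.42
    ),
    sep=",",
)
print(uuid_env_keys)  # {'NVIDIA_VISIBLE_DEVICES'} unless the variable is set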
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpustack-runtime
-Version: 0.1.41.post2
+Version: 0.1.42
 Summary: GPUStack Runtime is library for detecting GPU resources and launching GPU workloads.
 Project-URL: Homepage, https://github.com/gpustack/runtime
 Project-URL: Bug Tracker, https://github.com/gpustack/gpustack/issues

@@ -16,7 +16,7 @@ Requires-Python: >=3.10
 Requires-Dist: argcomplete>=3.6.3
 Requires-Dist: cachetools>=5.5.2
 Requires-Dist: docker>=7.1.0
-Requires-Dist: gpustack-runner>=0.1.24.
+Requires-Dist: gpustack-runner>=0.1.24.post4
 Requires-Dist: grpc-interceptor>=0.15.4
 Requires-Dist: grpcio>=1.76.0
 Requires-Dist: kubernetes>=33.1.0
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD
CHANGED

@@ -1,51 +1,51 @@
 gpustack_runtime/__init__.py,sha256=Xw_PVWneitx-8QmW6sJQeymj6zVbEgEndGhIB_km6TI,186
 gpustack_runtime/__main__.py,sha256=O9yJKcN7vg0Ppgc13qesxHwST2wkH3ccOkTQXPWHnNA,3939
-gpustack_runtime/_version.py,sha256=
+gpustack_runtime/_version.py,sha256=Vvw3zQNp4FXvAmZn_g3ZyhEUudGslUaT_mMkqwv_Tdg,777
 gpustack_runtime/_version.pyi,sha256=A42NoSgcqEXVy2OeNm4LXC9CbyonbooYrSUBlPm2lGY,156
-gpustack_runtime/envs.py,sha256=
+gpustack_runtime/envs.py,sha256=gkr30NnIq3USzSYvbW6ipf8tKdbXx-BrDcY3-4Oc-Hg,38894
 gpustack_runtime/logging.py,sha256=wMPriPpOuVsuClsjMh0qwEPQKyJiJa89ggdDjqkk7i0,6934
 gpustack_runtime/cmds/__init__.py,sha256=-_X2O2lBn6KcdLGUzhL3lEjQC4_cwA36fvWDnFAgtVM,1382
 gpustack_runtime/cmds/__types__.py,sha256=TBnUWUqzTkDtJnsMv363kdw-H8fOf-XQYbOvrmQif-M,815
 gpustack_runtime/cmds/deployer.py,sha256=KvhPhU6ZW-UV6vLykI5adKI1ThgVFFJqWaII3n4OhL8,32846
-gpustack_runtime/cmds/detector.py,sha256=
+gpustack_runtime/cmds/detector.py,sha256=2ORRF53q3goyYRB4T8gu3X8u0VZ4v0xeEndJqtuktyQ,8872
 gpustack_runtime/cmds/images.py,sha256=7tb-D3G4yqLPkbS9aSuWI1bD3DYK8BLbPbgqac56blI,594
 gpustack_runtime/deployer/__init__.py,sha256=impMrmvkMjuCBthsn3QUz3LuwpmmNAymHJKJ2o6SZoc,16249
 gpustack_runtime/deployer/__patches__.py,sha256=cTBge8BT6IsY5MzETKY3kN28k3igYfNj7pcpgDzfDzw,17849
-gpustack_runtime/deployer/__types__.py,sha256=
+gpustack_runtime/deployer/__types__.py,sha256=J2YX8X7EYY_56_L9WL5YMmdsyJ572uOIhMoHCVjPaog,72469
 gpustack_runtime/deployer/__utils__.py,sha256=paQu2M1UeoSfQPsiskmAqJSiln-8qwibTssEoWFMLec,21109
-gpustack_runtime/deployer/docker.py,sha256=
-gpustack_runtime/deployer/kuberentes.py,sha256
-gpustack_runtime/deployer/podman.py,sha256=
+gpustack_runtime/deployer/docker.py,sha256=e48conm3gfu8dlwcIhvTvM5NhlhdgKlvk6Ix8xGYVeI,81448
+gpustack_runtime/deployer/kuberentes.py,sha256=-G7eYuqTDDi3T9u2Jqr6j0Ut-8vkP5u2lxzSyDx0EWM,86776
+gpustack_runtime/deployer/podman.py,sha256=9lo4AvXzD3HUteY17-Fuz9A0ItScPb_D1tweDgm7PVo,79090
 gpustack_runtime/deployer/cdi/__init__.py,sha256=2wHrxkud3GJokE3ytNc3jvjddemXkNuuz_oIKzxD3-I,4000
 gpustack_runtime/deployer/cdi/__types__.py,sha256=04DKvcogk7OoHS7TU2Bmht3VVMu7iOEBWTEOvxpHt4w,18399
-gpustack_runtime/deployer/cdi/__utils__.py,sha256=
+gpustack_runtime/deployer/cdi/__utils__.py,sha256=CAYUv76akZiHJYZO_VY0NXKhEI2jrP7G3OgvQa8Pg4U,4050
 gpustack_runtime/deployer/cdi/amd.py,sha256=-eq_SOlC56VX2QscZXvnoeffWSRindhr8zFZmaIcKrE,4082
 gpustack_runtime/deployer/cdi/ascend.py,sha256=lDs75a9--c0lM34xfJqu-_QbfWNFrf4zE-GXPKReBe4,4538
 gpustack_runtime/deployer/cdi/hygon.py,sha256=h6-vQfv03sgxYjMJAf_JOMq9cHFPaNjK1YbUYIiSXck,4117
 gpustack_runtime/deployer/cdi/iluvatar.py,sha256=6nNECZpU5IPP6-5l-O1rzU-ib-WcuwKvDg7ZV__1NE4,3650
 gpustack_runtime/deployer/cdi/metax.py,sha256=tmJBvr-n9pERAp-dXsa54qv6xmxt0rJoJwY36TFdoWk,4143
 gpustack_runtime/deployer/cdi/thead.py,sha256=SvIDKNYZx7FwMPTTxyJ2RRjlr9LXLN8BUYCUhidmiQk,3671
-gpustack_runtime/deployer/k8s/deviceplugin/__init__.py,sha256=
+gpustack_runtime/deployer/k8s/deviceplugin/__init__.py,sha256=kvjsDx_8kNt3h8a5MOx5A7qPvqRsk1amvFr_ZYDA1l0,10931
 gpustack_runtime/deployer/k8s/deviceplugin/__types__.py,sha256=LCkgPDZ64Mra7bo5jmtsAO2Ypbc4qK99lMl6R_nQhnY,3043
-gpustack_runtime/deployer/k8s/deviceplugin/plugin.py,sha256=
+gpustack_runtime/deployer/k8s/deviceplugin/plugin.py,sha256=20eUDvM_SBFCi5WDR3AfyDJpnL7CJxxcPdW4p626I_M,17671
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py,sha256=3rOYmgDIIJ4idEtwgnumGStH7PaK-J7EYrOnLa9A-8o,118
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto,sha256=rmB8RDe4LN5FCVkQ608uS-pl32mk5tt6iGe-g2lKtPs,7919
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py,sha256=dNkzjTE-2y25q8NF0QRznNJ5r1-5ZxxJS598MHbjx98,45998
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi,sha256=lq1dbSgBYqJ7zyGfoKKHCyfr6R5vcCGzJxteeyQpbuI,8232
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py,sha256=GM6EyCEFeyOjL0XOCisbcHurRoLKqKDUI5obsUyTxpE,17446
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py,sha256=tpNk3e_cvY67C9RwVsdTNl75YuNXBgsn53fSJIzeTR4,828
-gpustack_runtime/detector/__init__.py,sha256=
-gpustack_runtime/detector/__types__.py,sha256=
+gpustack_runtime/detector/__init__.py,sha256=2GaczS5lWLuPsighvq7oPhlPFQSUJfBKLAliXPfl4i0,8162
+gpustack_runtime/detector/__types__.py,sha256=ZiKuYZb0jeXqzyTCUWTDppoHnRmkaZuljAWbnR_ohX8,13626
 gpustack_runtime/detector/__utils__.py,sha256=QdLWXwsU1_EMxXG5Y29psqnttWJyXWMphHDjiC_6Byc,25153
-gpustack_runtime/detector/amd.py,sha256=
-gpustack_runtime/detector/ascend.py,sha256=
-gpustack_runtime/detector/cambricon.py,sha256=
-gpustack_runtime/detector/hygon.py,sha256=
-gpustack_runtime/detector/iluvatar.py,sha256=
-gpustack_runtime/detector/metax.py,sha256=
-gpustack_runtime/detector/mthreads.py,sha256=
-gpustack_runtime/detector/nvidia.py,sha256=
-gpustack_runtime/detector/thead.py,sha256=
+gpustack_runtime/detector/amd.py,sha256=ywJMDKFnmF2REdJc1F8_zGYK6O4K0o5xh5eqWU-X2EE,18294
+gpustack_runtime/detector/ascend.py,sha256=a6QRnJfXdyU2tyGiiUKy0fgsp6NF652Zr2fFQgVg1Xw,19470
+gpustack_runtime/detector/cambricon.py,sha256=5AXILG9NAMYiWjaLRZ5h8lXtFk7FLC7LB_aFQz0ZtYU,4102
+gpustack_runtime/detector/hygon.py,sha256=3AcHBlPXTFiH0JQ0VS_xZcqjX-FXy-cdle6Nc-rNj5w,12795
+gpustack_runtime/detector/iluvatar.py,sha256=klFl5H607w8ksTvYSt21QkHMRzzeg-TkJKfoh9CMzqc,10551
+gpustack_runtime/detector/metax.py,sha256=P24WiqK2Ngjpu6AQt0Fp1wEVNra2Xgs-C8JAAwpYews,10801
+gpustack_runtime/detector/mthreads.py,sha256=mwNdsc42nebnSJMPFo6ue1tbiOwHmvPw6dF2CrLwdIQ,12714
+gpustack_runtime/detector/nvidia.py,sha256=oD3HUPfYWXRIRZ87iidNTW2Tg8CTVNIJh8qW1Z3HBO4,34535
+gpustack_runtime/detector/thead.py,sha256=hIRtlZNPa7xzAT0W_2XgFiDVH3YHSGi8NqCdaFaqQcA,26818
 gpustack_runtime/detector/pyacl/__init__.py,sha256=UQjaBxP7nJNyzr08N8_lH-5wPtnFmUY9pyQhs6vIChU,16232
 gpustack_runtime/detector/pyamdgpu/__init__.py,sha256=x-UO07EpKEgfTLmXQOD6j9f6kibuvDC7riQFof3YGdw,8617
 gpustack_runtime/detector/pyamdsmi/__init__.py,sha256=800-khq2w6HLgXM12RkhcdvXBGeAJ4s1_TWJyHebCMk,955

@@ -55,13 +55,13 @@ gpustack_runtime/detector/pyhgml/__init__.py,sha256=Yp9s-QhHS4ck7Iq9kd4v6a4BruyJ
 gpustack_runtime/detector/pyhgml/libhgml.so,sha256=BPzGVBpzrMX1tSvbXddq8Q0Qhi8w-No2JXX8sRxTioI,2101640
 gpustack_runtime/detector/pyhgml/libuki.so,sha256=EE6v1vIYYT4FSDMMm9rSfAqwrwIPFD-4_6KtP51lSps,702352
 gpustack_runtime/detector/pyhsa/__init__.py,sha256=4DuGnBBMUVOCPa6vTx3XT5mffGrKk6M6CYbUWBoMTJ0,15792
-gpustack_runtime/detector/pyixml/__init__.py,sha256=
+gpustack_runtime/detector/pyixml/__init__.py,sha256=2YmNoYhcIvc4CbRZgORM9o-GKdQ6O05J-5L3JbMZdhA,163157
 gpustack_runtime/detector/pymxsml/__init__.py,sha256=YxfNHq7TWd7CpNroP45BGXhcWNpY_sXgVzNGtx68DII,45409
 gpustack_runtime/detector/pyrocmcore/__init__.py,sha256=rgwIdPS-7GG7_5luRMR1XG9QyNM3lJh5ryD7kfZqpWg,2523
-gpustack_runtime/detector/pyrocmsmi/__init__.py,sha256=
-gpustack_runtime/_version_appendix.py,sha256=
-gpustack_runtime-0.1.
-gpustack_runtime-0.1.
-gpustack_runtime-0.1.
-gpustack_runtime-0.1.
-gpustack_runtime-0.1.
+gpustack_runtime/detector/pyrocmsmi/__init__.py,sha256=Gk4pTadOMzLCZJvQJ2S1N_1ivogtYokfVPHj_9Y874Y,12286
+gpustack_runtime/_version_appendix.py,sha256=2B6zFAHFYbVzMJ1w6ZW4XpqNz2XaMa-cAueeeQ4OfJk,23
+gpustack_runtime-0.1.42.dist-info/METADATA,sha256=sUS5YnNvheiK-tDT-rGSzuOKrOIFIKKhUBH1Jxc7lPE,2358
+gpustack_runtime-0.1.42.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+gpustack_runtime-0.1.42.dist-info/entry_points.txt,sha256=bBO_61GxP6dIT74uZwbSDgW5Vt2pTePUS3CgjUJkUgg,68
+gpustack_runtime-0.1.42.dist-info/licenses/LICENSE,sha256=OiPibowBvB-NHV3TP_NOj18XNBlXcshXZFMpa3uvKVE,10362
+gpustack_runtime-0.1.42.dist-info/RECORD,,
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL
RENAMED
File without changes
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt
RENAMED
File without changes
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE
RENAMED
File without changes