gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42__py3-none-any.whl

Files changed (29)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +3 -1
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  6. gpustack_runtime/deployer/docker.py +109 -148
  7. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +1 -1
  8. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
  9. gpustack_runtime/deployer/kuberentes.py +89 -108
  10. gpustack_runtime/deployer/podman.py +89 -122
  11. gpustack_runtime/detector/__init__.py +2 -0
  12. gpustack_runtime/detector/__types__.py +26 -0
  13. gpustack_runtime/detector/amd.py +28 -8
  14. gpustack_runtime/detector/ascend.py +49 -4
  15. gpustack_runtime/detector/cambricon.py +3 -0
  16. gpustack_runtime/detector/hygon.py +16 -1
  17. gpustack_runtime/detector/iluvatar.py +6 -0
  18. gpustack_runtime/detector/metax.py +8 -0
  19. gpustack_runtime/detector/mthreads.py +11 -0
  20. gpustack_runtime/detector/nvidia.py +139 -134
  21. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  22. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  23. gpustack_runtime/detector/thead.py +135 -127
  24. gpustack_runtime/envs.py +7 -6
  25. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
  26. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
  27. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
  28. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
  29. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/ascend.py

@@ -10,6 +10,7 @@ from . import pyacl, pydcmi
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,
@@ -128,7 +129,9 @@ class AscendDetector(Detector):
                 dev_is_vgpu = True
                 dev_cores_aicore = dev_virt_info.query_info.computing.aic
                 dev_name = dev_virt_info.query_info.name
-                dev_mem, dev_mem_used = 0, 0
+                dev_mem = 0
+                dev_mem_used = 0
+                dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
                 if hasattr(dev_virt_info.query_info.computing, "memory_size"):
                     dev_mem = dev_virt_info.query_info.computing.memory_size
                 dev_index = dev_virt_info.vdev_id
@@ -143,6 +146,10 @@ class AscendDetector(Detector):
                     dev_card_id,
                     dev_device_id,
                 )
+                dev_mem_status = _get_device_memory_status(
+                    dev_card_id,
+                    dev_device_id,
+                )
                 dev_index = pydcmi.dcmi_get_device_logic_id(
                     dev_card_id,
                     dev_device_id,
@@ -239,6 +246,7 @@ class AscendDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power_used=dev_power_used,
                     appendix=dev_appendix,
@@ -332,6 +340,12 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
     """
     Get device memory information.
 
+    Args:
+        dev_card_id:
+            The card ID of the device.
+        dev_device_id:
+            The device ID of the device.
+
     Returns:
         A tuple containing total memory and used memory in MiB.
 
@@ -370,6 +384,37 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
     return dev_mem, dev_mem_used
 
 
+def _get_device_memory_status(dev_card_id, dev_device_id) -> DeviceMemoryStatusEnum:
+    """
+    Get device memory ECC status.
+
+    Args:
+        dev_card_id:
+            The card ID of the device.
+        dev_device_id:
+            The device ID of the device.
+
+    Returns:
+        DeviceMemoryStatusEnum indicating the ECC status.
+
+    """
+    for dev_mem_type in [pydcmi.DCMI_DEVICE_TYPE_HBM, pydcmi.DCMI_DEVICE_TYPE_DDR]:
+        with contextlib.suppress(pydcmi.DCMIError):
+            dev_ecc_info = pydcmi.dcmi_get_device_ecc_info(
+                dev_card_id,
+                dev_device_id,
+                dev_mem_type,
+            )
+            if dev_ecc_info.enable_flag and (
+                dev_ecc_info.single_bit_error_cnt > 0
+                or dev_ecc_info.double_bit_error_cnt > 0
+            ):
+                return DeviceMemoryStatusEnum.UNHEALTHY
+            return DeviceMemoryStatusEnum.HEALTHY
+
+    return DeviceMemoryStatusEnum.HEALTHY
+
+
 def _get_device_roce_network_info(
     dev_card_id,
     dev_device_id,
@@ -528,11 +573,11 @@ def get_ascend_cann_variant(name: str | None) -> str | None:
     if version < 220:
         return "310p"
     if version < 240:
-        return "910b"
+        return "910b"  # 910b/a2
     if version < 250:
         return "310b"
     if version < 260:
-        return "a3"  # 910c
+        return "a3"  # 910c/a3
     if version < 270:
-        return "a5"  # 910d
+        return "a5"  # 910d/a5
     return None
gpustack_runtime/detector/cambricon.py

@@ -6,6 +6,7 @@ from functools import lru_cache
 
 from .. import envs
 from ..logging import debug_log_exception
+from . import DeviceMemoryStatusEnum
 from .__types__ import Detector, Device, Devices, ManufacturerEnum
 from .__utils__ import (
     PCIDevice,
@@ -100,6 +101,7 @@ class CambriconDetector(Detector):
             dev_mem_usage_info = dev_info.get("PhysicalMemUsage", {})
             dev_mem = safe_int(dev_mem_usage_info.get("Total", 0))
             dev_mem_used = safe_int(dev_mem_usage_info.get("Used", 0))
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
 
             dev_temp_info = dev_info.get("Temperature", {})
             dev_temp = safe_float(dev_temp_info.get("Chip", 0))
@@ -118,6 +120,7 @@ class CambriconDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     appendix=dev_appendix,
                 ),
gpustack_runtime/detector/hygon.py

@@ -8,7 +8,14 @@ from pathlib import Path
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
 from . import Topology, pyamdgpu, pyhsa, pyrocmcore, pyrocmsmi
-from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
+from .__types__ import (
+    Detector,
+    Device,
+    DeviceMemoryStatusEnum,
+    Devices,
+    ManufacturerEnum,
+    TopologyDistanceEnum,
+)
 from .__utils__ import (
     PCIDevice,
     byte_to_mebibyte,
@@ -149,6 +156,13 @@ class HygonDetector(Detector):
             dev_mem_used = byte_to_mebibyte(  # byte to MiB
                 pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
             )
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
+            with contextlib.suppress(pyrocmsmi.ROCMSMIError):
+                dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
+                    dev_idx,
+                )
+                if dev_ecc_count.uncorrectable_err > 0:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
             dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
             dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
@@ -184,6 +198,7 @@ class HygonDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power=dev_power,
                     power_used=dev_power_used,
gpustack_runtime/detector/iluvatar.py

@@ -10,6 +10,7 @@ from . import pyixml
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,
@@ -135,6 +136,7 @@ class IluvatarDetector(Detector):
 
             dev_mem = 0
             dev_mem_used = 0
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
             with contextlib.suppress(pyixml.NVMLError):
                 dev_mem_info = pyixml.nvmlDeviceGetMemoryInfo(dev)
                 dev_mem = byte_to_mebibyte(  # byte to MiB
@@ -143,6 +145,9 @@ class IluvatarDetector(Detector):
                 dev_mem_used = byte_to_mebibyte(  # byte to MiB
                     dev_mem_info.used,
                 )
+                dev_health = pyixml.ixmlDeviceGetHealth(dev)
+                if dev_health != pyixml.IXML_HEALTH_OK:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
             dev_cores_util = None
             with contextlib.suppress(pyixml.NVMLError):
@@ -213,6 +218,7 @@ class IluvatarDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power=dev_power,
                     power_used=dev_power_used,
gpustack_runtime/detector/metax.py

@@ -1,5 +1,6 @@
 from __future__ import annotations as __future_annotations__
 
+import contextlib
 import logging
 from functools import lru_cache
 from pathlib import Path
@@ -10,6 +11,7 @@ from . import pymxsml
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,
@@ -145,6 +147,11 @@ class MetaXDetector(Detector):
             dev_mem_used = kibibyte_to_mebibyte(  # KiB to MiB
                 dev_mem_info.vramUse,
             )
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
+            with contextlib.suppress(pymxsml.MXSMLError):
+                dev_ecc_errors = pymxsml.mxSmlGetTotalEccErrors(dev_idx)
+                if dev_ecc_errors.dramUE > 0:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
             dev_temp = (
                 pymxsml.mxSmlGetTemperatureInfo(
@@ -201,6 +208,7 @@ class MetaXDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power=dev_power,
                     power_used=dev_power_used,
gpustack_runtime/detector/mthreads.py

@@ -7,6 +7,7 @@ import pymtml
 
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
+from . import DeviceMemoryStatusEnum
 from .__types__ import (
     Detector,
     Device,
@@ -140,6 +141,7 @@ class MThreadsDetector(Detector):
 
             dev_mem = 0
             dev_mem_used = 0
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
             with pymtml.mtmlMemoryContext(dev) as devmem:
                 dev_mem = byte_to_mebibyte(  # byte to MiB
                     pymtml.mtmlMemoryGetTotal(devmem),
@@ -147,6 +149,14 @@ class MThreadsDetector(Detector):
                 dev_mem_used = byte_to_mebibyte(  # byte to MiB
                     pymtml.mtmlMemoryGetUsed(devmem),
                 )
+                dev_mem_ecc_errors = pymtml.mtmlMemoryGetEccErrorCounter(
+                    devmem,
+                    pymtml.MTML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                    pymtml.MTML_VOLATILE_ECC,
+                    pymtml.MTML_MEMORY_LOCATION_DRAM,
+                )
+                if dev_mem_ecc_errors > 0:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
             dev_cores_util = None
             dev_temp = None
@@ -192,6 +202,7 @@ class MThreadsDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power_used=dev_power_used,
                     appendix=dev_appendix,
gpustack_runtime/detector/nvidia.py

@@ -3,17 +3,17 @@ from __future__ import annotations as __future_annotations__
 import contextlib
 import logging
 import math
+import re
 import time
 from _ctypes import byref
 from functools import lru_cache
 from pathlib import Path
-from typing import re
 
 import pynvml
 
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
-from . import Topology, pycuda
+from . import DeviceMemoryStatusEnum, Topology, pycuda
 from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
 from .__utils__ import (
     PCIDevice,
@@ -78,7 +78,7 @@ class NVIDIADetector(Detector):
     def __init__(self):
         super().__init__(ManufacturerEnum.NVIDIA)
 
-    def detect(self) -> Devices | None:
+    def detect(self) -> Devices | None:  # noqa: PLR0915
         """
         Detect NVIDIA GPUs using pynvml.
 
@@ -141,6 +141,22 @@ class NVIDIADetector(Detector):
             )
             dev_numa = bitmask_to_str(list(dev_node_affinity))
 
+            dev_temp = None
+            with contextlib.suppress(pynvml.NVMLError):
+                dev_temp = pynvml.nvmlDeviceGetTemperature(
+                    dev,
+                    pynvml.NVML_TEMPERATURE_GPU,
+                )
+
+            dev_power = None
+            dev_power_used = None
+            with contextlib.suppress(pynvml.NVMLError):
+                dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
+                dev_power = dev_power // 1000  # mW to W
+                dev_power_used = (
+                    pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
+                )  # mW to W
+
             dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
             with contextlib.suppress(pynvml.NVMLError):
                 dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
@@ -180,6 +196,7 @@ class NVIDIADetector(Detector):
 
             dev_mem = 0
             dev_mem_used = 0
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
             with contextlib.suppress(pynvml.NVMLError):
                 dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
                 dev_mem = byte_to_mebibyte(  # byte to MiB
@@ -188,24 +205,16 @@ class NVIDIADetector(Detector):
                 dev_mem_used = byte_to_mebibyte(  # byte to MiB
                     dev_mem_info.used,
                 )
-            if dev_mem == 0:
-                dev_mem, dev_mem_used = get_memory()
-
-            dev_temp = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_temp = pynvml.nvmlDeviceGetTemperature(
+                dev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
                     dev,
-                    pynvml.NVML_TEMPERATURE_GPU,
+                    pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                    pynvml.NVML_VOLATILE_ECC,
+                    pynvml.NVML_MEMORY_LOCATION_DRAM,
                 )
-
-            dev_power = None
-            dev_power_used = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
-                dev_power = dev_power // 1000  # mW to W
-                dev_power_used = (
-                    pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
-                )  # mW to W
+                if dev_mem_ecc_errors > 0:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
+            if dev_mem == 0:
+                dev_mem, dev_mem_used = get_memory()
 
             dev_is_vgpu = False
             if dev_bdf in pci_devs:
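The hunk above drops the temperature/power reads from this spot (they now run earlier, see the `@@ -141,6 +141,22 @@` hunk) and instead queries the volatile uncorrected DRAM ECC counter to feed the new `memory_status` field. A minimal standalone sketch of that check, assuming pynvml is installed and using a plain string in place of the library's `DeviceMemoryStatusEnum`:

```python
# Hedged sketch of the ECC-based health check added in 0.1.42: a non-zero
# volatile uncorrected DRAM error count is treated as "unhealthy". Device
# index 0 and the string status are illustrative; GPUs without ECC raise
# NVMLError, which is suppressed and left as "healthy", mirroring the
# contextlib.suppress() usage in the diff.
import contextlib

import pynvml

pynvml.nvmlInit()
try:
    dev = pynvml.nvmlDeviceGetHandleByIndex(0)
    memory_status = "healthy"
    with contextlib.suppress(pynvml.NVMLError):
        ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
            dev,
            pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
            pynvml.NVML_VOLATILE_ECC,
            pynvml.NVML_MEMORY_LOCATION_DRAM,
        )
        if ecc_errors > 0:
            memory_status = "unhealthy"
    print(memory_status)
finally:
    pynvml.nvmlShutdown()
```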
@@ -236,6 +245,7 @@ class NVIDIADetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power=dev_power,
                     power_used=dev_power_used,
@@ -254,12 +264,18 @@ class NVIDIADetector(Detector):
             mdev_cores = None
             mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
             for mdev_idx in range(mdev_count):
-                mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+                mdev = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+                if not mdev:
+                    continue
 
-                mdev_index = mdev_idx
+                mdev_index = mdev_idx + dev_count * (dev_idx + 1)
                 mdev_uuid = pynvml.nvmlDeviceGetUUID(mdev)
 
-                mdev_mem, mdev_mem_used = 0, 0
+                mdev_mem = 0
+                mdev_mem_used = 0
+                mdev_mem_status = DeviceMemoryStatusEnum.HEALTHY
                 with contextlib.suppress(pynvml.NVMLError):
                     mdev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(mdev)
                     mdev_mem = byte_to_mebibyte(  # byte to MiB
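The revised `mdev_index` formula above shifts MIG children out of the 0-based range used by the physical GPUs (the old code reused the raw `mdev_idx`). A quick check of the arithmetic, assuming two physical GPUs (`dev_count = 2`) with two MIG devices each:

```python
# Hypothetical layout: dev_count physical GPUs indexed 0..dev_count-1,
# two MIG children per GPU. The new formula places every MIG index above
# the physical indices and groups it under its parent GPU.
dev_count = 2
for dev_idx in range(dev_count):
    for mdev_idx in range(2):
        print(dev_idx, mdev_idx, mdev_idx + dev_count * (dev_idx + 1))
# -> (0, 0, 2), (0, 1, 3), (1, 0, 4), (1, 1, 5)
```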
@@ -268,21 +284,14 @@ class NVIDIADetector(Detector):
                     mdev_mem_used = byte_to_mebibyte(  # byte to MiB
                         mdev_mem_info.used,
                     )
-
-                mdev_temp = pynvml.nvmlDeviceGetTemperature(
-                    mdev,
-                    pynvml.NVML_TEMPERATURE_GPU,
-                )
-
-                mdev_power = None
-                with contextlib.suppress(pynvml.NVMLError):
-                    mdev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(
+                    mdev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
                         mdev,
+                        pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                        pynvml.NVML_AGGREGATE_ECC,
+                        pynvml.NVML_MEMORY_LOCATION_SRAM,
                     )
-                    mdev_power = mdev_power // 1000  # mW to W
-                    mdev_power_used = (
-                        pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
-                    )  # mW to W
+                    if mdev_mem_ecc_errors > 0:
+                        mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
                 mdev_appendix = {
                     "arch_family": _get_arch_family(dev_cc_t),
@@ -305,71 +314,70 @@ class NVIDIADetector(Detector):
 
                 mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
 
-                if not mdev_name:
-                    mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
-                    mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
-                        mdev_gi,
-                        mdev_ci_id,
-                    )
-                    mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
-                    mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
-                    for dev_gi_prf_id in range(
-                        pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
-                    ):
-                        try:
-                            dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
-                                dev,
-                                dev_gi_prf_id,
-                            )
-                            if dev_gi_prf.id != mdev_gi_info.profileId:
-                                continue
-                        except pynvml.NVMLError:
+                mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
+                mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
+                    mdev_gi,
+                    mdev_ci_id,
+                )
+                mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
+                mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
+                for dev_gi_prf_id in range(
+                    pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
+                ):
+                    try:
+                        dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
+                            dev,
+                            dev_gi_prf_id,
+                        )
+                        if dev_gi_prf.id != mdev_gi_info.profileId:
                             continue
+                    except pynvml.NVMLError:
+                        continue
 
-                        for dev_ci_prf_id in range(
-                            pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
+                    for dev_ci_prf_id in range(
+                        pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
+                    ):
+                        for dev_cig_prf_id in range(
+                            pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
                         ):
-                            for dev_cig_prf_id in range(
-                                pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
-                            ):
-                                try:
-                                    mdev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
-                                        mdev_gi,
-                                        dev_ci_prf_id,
-                                        dev_cig_prf_id,
-                                    )
-                                    if mdev_ci_prf.id != mdev_ci_info.profileId:
-                                        continue
-                                except pynvml.NVMLError:
-                                    continue
-
-                                ci_slice = _get_compute_instance_slice(
+                            try:
+                                dev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
+                                    mdev_gi,
                                     dev_ci_prf_id,
+                                    dev_cig_prf_id,
                                 )
-                                gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
-                                gi_mem = _get_gpu_instance_memory(
-                                    dev_mem_info,
-                                    dev_gi_prf,
-                                )
-                                gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
-                                gi_neg_attrs = _get_gpu_instance_negattrs(
-                                    dev_gi_prf_id,
-                                )
+                                if dev_ci_prf.id != mdev_ci_info.profileId:
+                                    continue
+                            except pynvml.NVMLError:
+                                continue
 
-                                if ci_slice == gi_slice:
-                                    mdev_name = f"{gi_slice}g.{gi_mem}gb"
+                            ci_slice = _get_compute_instance_slice(dev_ci_prf_id)
+                            gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+                            if ci_slice == gi_slice:
+                                if hasattr(dev_gi_prf, "name"):
+                                    mdev_name = dev_gi_prf.name
                                 else:
-                                    mdev_name = (
-                                        f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
+                                    gi_mem = round(
+                                        math.ceil(dev_gi_prf.memorySizeMB >> 10),
                                     )
-                                    if gi_attrs:
-                                        mdev_name += f"+{gi_attrs}"
-                                    if gi_neg_attrs:
-                                        mdev_name += f"-{gi_neg_attrs}"
+                                    mdev_name = f"{gi_slice}g.{gi_mem}gb"
+                            elif hasattr(dev_ci_prf, "name"):
+                                mdev_name = dev_ci_prf.name
+                            else:
+                                gi_mem = round(
+                                    math.ceil(dev_gi_prf.memorySizeMB >> 10),
+                                )
+                                mdev_name = f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
+                            gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
+                            if gi_attrs:
+                                mdev_name += f"+{gi_attrs}"
+                            gi_neg_attrs = _get_gpu_instance_negattrs(dev_gi_prf_id)
+                            if gi_neg_attrs:
+                                mdev_name += f"-{gi_neg_attrs}"
 
-                                mdev_cores = mdev_ci_prf.multiprocessorCount
+                            mdev_cores = dev_ci_prf.multiprocessorCount
 
-                                break
+                            break
 
                 ret.append(
                     Device(
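The renaming logic above now prefers the profile name reported by the driver (`dev_gi_prf.name` / `dev_ci_prf.name`) and only composes a `<slices>g.<mem>gb` label when that attribute is missing, with the GiB figure derived directly from the profile's `memorySizeMB`. A worked example of that fallback, using hypothetical profile values:

```python
import math

# Hypothetical profile numbers, only to illustrate the fallback formula;
# drivers that expose a profile name never reach this branch.
memory_size_mb = 20480  # assumed GPU-instance profile memory, in MiB
gi_slice = 3            # assumed GPU-instance slice count
gi_mem = round(math.ceil(memory_size_mb >> 10))  # 20480 >> 10 == 20 (MiB -> GiB)
print(f"{gi_slice}g.{gi_mem}gb")  # -> 3g.20gb
```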
@@ -386,9 +394,10 @@ class NVIDIADetector(Detector):
                         memory=mdev_mem,
                         memory_used=mdev_mem_used,
                         memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
-                        temperature=mdev_temp,
-                        power=mdev_power,
-                        power_used=mdev_power_used,
+                        memory_status=mdev_mem_status,
+                        temperature=dev_temp,
+                        power=dev_power,
+                        power_used=dev_power_used,
                         appendix=mdev_appendix,
                     ),
                 )
@@ -426,11 +435,17 @@ class NVIDIADetector(Detector):
             devices_count=len(devices),
         )
 
+        get_links_cache = {}
+
        try:
            pynvml.nvmlInit()

            for i, dev_i in enumerate(devices):
-                dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
+                dev_i_bdf = dev_i.appendix.get("bdf")
+                if dev_i.appendix.get("vgpu", False):
+                    dev_i_handle = pynvml.nvmlDeviceGetHandleByPciBusId(dev_i_bdf)
+                else:
+                    dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
 
                 # Get NUMA and CPU affinities.
                 ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
@@ -439,7 +454,12 @@ class NVIDIADetector(Detector):
                 )
 
                 # Get links state if applicable.
-                if dev_i_links_state := _get_links_state(dev_i_handle):
+                if dev_i_bdf in get_links_cache:
+                    dev_i_links_state = get_links_cache[dev_i_bdf]
+                else:
+                    dev_i_links_state = _get_links_state(dev_i_handle)
+                    get_links_cache[dev_i_bdf] = dev_i_links_state
+                if dev_i_links_state:
                     ret.appendices[i].update(dev_i_links_state)
                     # In practice, if a card has an active *Link,
                     # then other cards in the same machine should be interconnected with it through the *Link.
@@ -456,21 +476,30 @@ class NVIDIADetector(Detector):
                     if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                         continue
 
-                    dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
-
-                    distance = TopologyDistanceEnum.UNK
-                    try:
-                        distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
-                            dev_i_handle,
-                            dev_j_handle,
-                        )
-                    except pynvml.NVMLError:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get distance between device %d and %d",
-                            dev_i.index,
-                            dev_j.index,
-                        )
+                    dev_j_bdf = dev_j.appendix.get("bdf")
+                    if dev_i_bdf == dev_j_bdf:
+                        distance = TopologyDistanceEnum.SELF
+                    else:
+                        if dev_j.appendix.get("vgpu", False):
+                            dev_j_handle = pynvml.nvmlDeviceGetHandleByPciBusId(
+                                dev_j_bdf,
+                            )
+                        else:
+                            dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
+
+                        distance = TopologyDistanceEnum.UNK
+                        try:
+                            distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
+                                dev_i_handle,
+                                dev_j_handle,
+                            )
+                        except pynvml.NVMLError:
+                            debug_log_exception(
+                                logger,
+                                "Failed to get distance between device %d and %d",
+                                dev_i.index,
+                                dev_j.index,
+                            )
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
@@ -767,30 +796,6 @@ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
     raise AttributeError(msg)
 
 
-def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
-    """
-    Compute the memory size of a MIG compute instance in GiB.
-
-    Args:
-        dev_mem:
-            The total memory info of the parent GPU device.
-        dev_gi_prf:
-            The profile info of the GPU instance.
-
-    Returns:
-        The memory size in GiB.
-
-    """
-    mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte
-
-    gib = round(
-        math.ceil(mem / dev_mem.total * 8)
-        / 8
-        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
-    )
-    return gib
-
-
 def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
     """
     Get the number of slice for a given Compute Instance Profile ID.