PyPI - gpustack-runtime - Versions diffs - 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl - Mend

gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

gpustack_runtime/_version.py +2 -2
gpustack_runtime/_version_appendix.py +1 -1
gpustack_runtime/cmds/detector.py +4 -2
gpustack_runtime/deployer/__types__.py +314 -233
gpustack_runtime/deployer/cdi/__init__.py +1 -1
gpustack_runtime/deployer/cdi/__types__.py +2 -2
gpustack_runtime/deployer/cdi/__utils__.py +4 -1
gpustack_runtime/deployer/cdi/amd.py +6 -8
gpustack_runtime/deployer/cdi/ascend.py +7 -9
gpustack_runtime/deployer/cdi/hygon.py +6 -8
gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
gpustack_runtime/deployer/cdi/metax.py +6 -8
gpustack_runtime/deployer/cdi/thead.py +6 -8
gpustack_runtime/deployer/docker.py +133 -146
gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
gpustack_runtime/deployer/kuberentes.py +89 -108
gpustack_runtime/deployer/podman.py +113 -120
gpustack_runtime/detector/__init__.py +2 -0
gpustack_runtime/detector/__types__.py +26 -0
gpustack_runtime/detector/__utils__.py +3 -0
gpustack_runtime/detector/amd.py +32 -10
gpustack_runtime/detector/ascend.py +67 -13
gpustack_runtime/detector/cambricon.py +3 -0
gpustack_runtime/detector/hygon.py +22 -3
gpustack_runtime/detector/iluvatar.py +15 -7
gpustack_runtime/detector/metax.py +16 -6
gpustack_runtime/detector/mthreads.py +22 -8
gpustack_runtime/detector/nvidia.py +148 -140
gpustack_runtime/detector/pyacl/__init__.py +34 -14
gpustack_runtime/detector/pydcmi/__init__.py +4 -2
gpustack_runtime/detector/pyixml/__init__.py +16 -0
gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
gpustack_runtime/detector/thead.py +145 -134
gpustack_runtime/envs.py +7 -6
{gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
{gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
{gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
{gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0

gpustack_runtime/detector/nvidia.py CHANGED Viewed

@@ -3,17 +3,17 @@ from __future__ import annotations as __future_annotations__
 import contextlib
 import logging
 import math
+import re
 import time
 from _ctypes import byref
 from functools import lru_cache
 from pathlib import Path
-from typing import re
 import pynvml
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
-from . import Topology, pycuda
+from . import DeviceMemoryStatusEnum, Topology, pycuda
 from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
 from .__utils__ import (
     PCIDevice,
@@ -78,7 +78,7 @@ class NVIDIADetector(Detector):
     def __init__(self):
         super().__init__(ManufacturerEnum.NVIDIA)
-    def detect(self) -> Devices | None:
+    def detect(self) -> Devices | None:  # noqa: PLR0915
         """
         Detect NVIDIA GPUs using pynvml.
@@ -134,12 +134,29 @@ class NVIDIADetector(Detector):
                 dev_numa = get_numa_node_by_bdf(dev_bdf)
                 if not dev_numa:
-                    dev_node_affinity = pynvml.nvmlDeviceGetMemoryAffinity(
+                    with contextlib.suppress(pynvml.NVMLError):
+                        dev_node_affinity = pynvml.nvmlDeviceGetMemoryAffinity(
+                            dev,
+                            get_numa_nodeset_size(),
+                            pynvml.NVML_AFFINITY_SCOPE_NODE,
+                        )
+                        dev_numa = bitmask_to_str(list(dev_node_affinity))
+                dev_temp = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_temp = pynvml.nvmlDeviceGetTemperature(
                         dev,
-                        get_numa_nodeset_size(),
-                        pynvml.NVML_AFFINITY_SCOPE_NODE,
+                        pynvml.NVML_TEMPERATURE_GPU,
                     )
-                    dev_numa = bitmask_to_str(list(dev_node_affinity))
+                dev_power = None
+                dev_power_used = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
+                    dev_power = dev_power // 1000  # mW to W
+                    dev_power_used = (
+                        pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
+                    )  # mW to W
                 dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
                 with contextlib.suppress(pynvml.NVMLError):
@@ -180,6 +197,7 @@ class NVIDIADetector(Detector):
                     dev_mem = 0
                     dev_mem_used = 0
+                    dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
                     with contextlib.suppress(pynvml.NVMLError):
                         dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
                         dev_mem = byte_to_mebibyte(  # byte to MiB
@@ -188,24 +206,16 @@ class NVIDIADetector(Detector):
                         dev_mem_used = byte_to_mebibyte(  # byte to MiB
                             dev_mem_info.used,
                         )
-                    if dev_mem == 0:
-                        dev_mem, dev_mem_used = get_memory()
-                    dev_temp = None
-                    with contextlib.suppress(pynvml.NVMLError):
-                        dev_temp = pynvml.nvmlDeviceGetTemperature(
+                        dev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
                             dev,
-                            pynvml.NVML_TEMPERATURE_GPU,
+                            pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                            pynvml.NVML_VOLATILE_ECC,
+                            pynvml.NVML_MEMORY_LOCATION_DRAM,
                         )
-                    dev_power = None
-                    dev_power_used = None
-                    with contextlib.suppress(pynvml.NVMLError):
-                        dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
-                        dev_power = dev_power // 1000  # mW to W
-                        dev_power_used = (
-                            pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
-                        )  # mW to W
+                        if dev_mem_ecc_errors > 0:
+                            dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
+                    if dev_mem == 0:
+                        dev_mem, dev_mem_used = get_memory()
                     dev_is_vgpu = False
                     if dev_bdf in pci_devs:
@@ -215,8 +225,9 @@ class NVIDIADetector(Detector):
                         "arch_family": _get_arch_family(dev_cc_t),
                         "vgpu": dev_is_vgpu,
                         "bdf": dev_bdf,
-                        "numa": dev_numa,
                     }
+                    if dev_numa:
+                        dev_appendix["numa"] = dev_numa
                     if dev_fabric_info := _get_fabric_info(dev):
                         dev_appendix.update(dev_fabric_info)
@@ -236,6 +247,7 @@ class NVIDIADetector(Detector):
                             memory=dev_mem,
                             memory_used=dev_mem_used,
                             memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                            memory_status=dev_mem_status,
                             temperature=dev_temp,
                             power=dev_power,
                             power_used=dev_power_used,
@@ -254,12 +266,18 @@ class NVIDIADetector(Detector):
                 mdev_cores = None
                 mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
                 for mdev_idx in range(mdev_count):
-                    mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+                    mdev = None
+                    with contextlib.suppress(pynvml.NVMLError):
+                        mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+                    if not mdev:
+                        continue
-                    mdev_index = mdev_idx
+                    mdev_index = mdev_idx + dev_count * (dev_idx + 1)
                     mdev_uuid = pynvml.nvmlDeviceGetUUID(mdev)
-                    mdev_mem, mdev_mem_used = 0, 0
+                    mdev_mem = 0
+                    mdev_mem_used = 0
+                    mdev_mem_status = DeviceMemoryStatusEnum.HEALTHY
                     with contextlib.suppress(pynvml.NVMLError):
                         mdev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(mdev)
                         mdev_mem = byte_to_mebibyte(  # byte to MiB
@@ -268,28 +286,22 @@ class NVIDIADetector(Detector):
                         mdev_mem_used = byte_to_mebibyte(  # byte to MiB
                             mdev_mem_info.used,
                         )
-                    mdev_temp = pynvml.nvmlDeviceGetTemperature(
-                        mdev,
-                        pynvml.NVML_TEMPERATURE_GPU,
-                    )
-                    mdev_power = None
-                    with contextlib.suppress(pynvml.NVMLError):
-                        mdev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(
+                        mdev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
                             mdev,
+                            pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                            pynvml.NVML_AGGREGATE_ECC,
+                            pynvml.NVML_MEMORY_LOCATION_SRAM,
                         )
-                        mdev_power = mdev_power // 1000  # mW to W
-                    mdev_power_used = (
-                        pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
-                    )  # mW to W
+                        if mdev_mem_ecc_errors > 0:
+                            mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
                     mdev_appendix = {
                         "arch_family": _get_arch_family(dev_cc_t),
                         "vgpu": True,
                         "bdf": dev_bdf,
-                        "numa": dev_numa,
                     }
+                    if dev_numa:
+                        mdev_appendix["numa"] = dev_numa
                     mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
                     mdev_appendix["gpu_instance_id"] = mdev_gi_id
@@ -305,71 +317,70 @@ class NVIDIADetector(Detector):
                     mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
-                    if not mdev_name:
-                        mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
-                        mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
-                            mdev_gi,
-                            mdev_ci_id,
-                        )
-                        mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
-                        mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
-                        for dev_gi_prf_id in range(
-                            pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
-                        ):
-                            try:
-                                dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
-                                    dev,
-                                    dev_gi_prf_id,
-                                )
-                                if dev_gi_prf.id != mdev_gi_info.profileId:
-                                    continue
-                            except pynvml.NVMLError:
+                    mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
+                    mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
+                        mdev_gi,
+                        mdev_ci_id,
+                    )
+                    mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
+                    mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
+                    for dev_gi_prf_id in range(
+                        pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
+                    ):
+                        try:
+                            dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
+                                dev,
+                                dev_gi_prf_id,
+                            )
+                            if dev_gi_prf.id != mdev_gi_info.profileId:
                                 continue
+                        except pynvml.NVMLError:
+                            continue
-                            for dev_ci_prf_id in range(
-                                pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
+                        for dev_ci_prf_id in range(
+                            pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
+                        ):
+                            for dev_cig_prf_id in range(
+                                pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
                             ):
-                                for dev_cig_prf_id in range(
-                                    pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
-                                ):
-                                    try:
-                                        mdev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
-                                            mdev_gi,
-                                            dev_ci_prf_id,
-                                            dev_cig_prf_id,
-                                        )
-                                        if mdev_ci_prf.id != mdev_ci_info.profileId:
-                                            continue
-                                    except pynvml.NVMLError:
-                                        continue
-                                    ci_slice = _get_compute_instance_slice(
+                                try:
+                                    dev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
+                                        mdev_gi,
                                         dev_ci_prf_id,
+                                        dev_cig_prf_id,
                                     )
-                                    gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
-                                    gi_mem = _get_gpu_instance_memory(
-                                        dev_mem_info,
-                                        dev_gi_prf,
-                                    )
-                                    gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
-                                    gi_neg_attrs = _get_gpu_instance_negattrs(
-                                        dev_gi_prf_id,
-                                    )
+                                    if dev_ci_prf.id != mdev_ci_info.profileId:
+                                        continue
+                                except pynvml.NVMLError:
+                                    continue
-                                    if ci_slice == gi_slice:
-                                        mdev_name = f"{gi_slice}g.{gi_mem}gb"
+                                ci_slice = _get_compute_instance_slice(dev_ci_prf_id)
+                                gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+                                if ci_slice == gi_slice:
+                                    if hasattr(dev_gi_prf, "name"):
+                                        mdev_name = dev_gi_prf.name
                                     else:
-                                        mdev_name = (
-                                            f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
+                                        gi_mem = round(
+                                            math.ceil(dev_gi_prf.memorySizeMB >> 10),
                                         )
-                                    if gi_attrs:
-                                        mdev_name += f"+{gi_attrs}"
-                                    if gi_neg_attrs:
-                                        mdev_name += f"-{gi_neg_attrs}"
+                                        mdev_name = f"{gi_slice}g.{gi_mem}gb"
+                                elif hasattr(dev_ci_prf, "name"):
+                                    mdev_name = dev_ci_prf.name
+                                else:
+                                    gi_mem = round(
+                                        math.ceil(dev_gi_prf.memorySizeMB >> 10),
+                                    )
+                                    mdev_name = f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
+                                gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
+                                if gi_attrs:
+                                    mdev_name += f"+{gi_attrs}"
+                                gi_neg_attrs = _get_gpu_instance_negattrs(dev_gi_prf_id)
+                                if gi_neg_attrs:
+                                    mdev_name += f"-{gi_neg_attrs}"
-                                    mdev_cores = mdev_ci_prf.multiprocessorCount
+                                mdev_cores = dev_ci_prf.multiprocessorCount
-                                    break
+                                break
                     ret.append(
                         Device(
@@ -386,9 +397,10 @@ class NVIDIADetector(Detector):
                             memory=mdev_mem,
                             memory_used=mdev_mem_used,
                             memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
-                            temperature=mdev_temp,
-                            power=mdev_power,
-                            power_used=mdev_power_used,
+                            memory_status=mdev_mem_status,
+                            temperature=dev_temp,
+                            power=dev_power,
+                            power_used=dev_power_used,
                             appendix=mdev_appendix,
                         ),
                     )
@@ -426,11 +438,17 @@ class NVIDIADetector(Detector):
             devices_count=len(devices),
         )
+        get_links_cache = {}
         try:
             pynvml.nvmlInit()
             for i, dev_i in enumerate(devices):
-                dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
+                dev_i_bdf = dev_i.appendix.get("bdf")
+                if dev_i.appendix.get("vgpu", False):
+                    dev_i_handle = pynvml.nvmlDeviceGetHandleByPciBusId(dev_i_bdf)
+                else:
+                    dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
                 # Get NUMA and CPU affinities.
                 ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
@@ -439,7 +457,12 @@ class NVIDIADetector(Detector):
                 )
                 # Get links state if applicable.
-                if dev_i_links_state := _get_links_state(dev_i_handle):
+                if dev_i_bdf in get_links_cache:
+                    dev_i_links_state = get_links_cache[dev_i_bdf]
+                else:
+                    dev_i_links_state = _get_links_state(dev_i_handle)
+                    get_links_cache[dev_i_bdf] = dev_i_links_state
+                if dev_i_links_state:
                     ret.appendices[i].update(dev_i_links_state)
                     # In practice, if a card has an active *Link,
                     # then other cards in the same machine should be interconnected with it through the *Link.
@@ -456,21 +479,30 @@ class NVIDIADetector(Detector):
                     if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                         continue
-                    dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
-                    distance = TopologyDistanceEnum.UNK
-                    try:
-                        distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
-                            dev_i_handle,
-                            dev_j_handle,
-                        )
-                    except pynvml.NVMLError:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get distance between device %d and %d",
-                            dev_i.index,
-                            dev_j.index,
-                        )
+                    dev_j_bdf = dev_j.appendix.get("bdf")
+                    if dev_i_bdf == dev_j_bdf:
+                        distance = TopologyDistanceEnum.SELF
+                    else:
+                        if dev_j.appendix.get("vgpu", False):
+                            dev_j_handle = pynvml.nvmlDeviceGetHandleByPciBusId(
+                                dev_j_bdf,
+                            )
+                        else:
+                            dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
+                        distance = TopologyDistanceEnum.UNK
+                        try:
+                            distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
+                                dev_i_handle,
+                                dev_j_handle,
+                            )
+                        except pynvml.NVMLError:
+                            debug_log_exception(
+                                logger,
+                                "Failed to get distance between device %d and %d",
+                                dev_i.index,
+                                dev_j.index,
+                            )
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
@@ -767,30 +799,6 @@ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
     raise AttributeError(msg)
-def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
-    """
-    Compute the memory size of a MIG compute instance in GiB.
-    Args:
-        dev_mem:
-            The total memory info of the parent GPU device.
-        dev_gi_prf:
-            The profile info of the GPU instance.
-    Returns:
-        The memory size in GiB.
-    """
-    mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte
-    gib = round(
-        math.ceil(mem / dev_mem.total * 8)
-        / 8
-        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
-    )
-    return gib
 def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
     """
     Get the number of slice for a given Compute Instance Profile ID.

gpustack_runtime/detector/pyacl/__init__.py CHANGED Viewed

@@ -403,20 +403,21 @@ def _LoadAclLibrary():
                 locs = [
                     "libascendcl.so",
                 ]
-                ascend_path = Path(
-                    os.getenv(
-                        "ASCEND_HOME_PATH",
-                        "/usr/local/Ascend/ascend-toolkit/latest",
-                    ),
-                )
-                if ascend_path.exists():
-                    locs.extend(
-                        [
-                            str(ascend_path / "runtime/lib64/libascendcl.so"),
-                            str(ascend_path / "aarch64-linux/lib64/libascendcl.so"),
-                            str(ascend_path / "x86_64-linux/lib64/libascendcl.so"),
-                        ]
+                for default_path in [
+                    "/usr/local/Ascend/ascend-toolkit/latest",
+                    "/usr/local/Ascend/cann",
+                ]:
+                    ascend_path = Path(
+                        os.getenv("ASCEND_HOME_PATH", default_path),
                     )
+                    if ascend_path.exists():
+                        locs.extend(
+                            [
+                                str(ascend_path / "runtime/lib64/libascendcl.so"),
+                                str(ascend_path / "aarch64-linux/lib64/libascendcl.so"),
+                                str(ascend_path / "x86_64-linux/lib64/libascendcl.so"),
+                            ]
+                        )
                 for loc in locs:
                     try:
                         aclLib = CDLL(loc)
@@ -439,7 +440,8 @@ def aclrtGetSocName():
         fn = _aclGetFunctionPointer("aclrtGetSocName")
         fn.restype = c_char_p
         c_version = fn()
-        return c_version.decode()
+        if c_version is not None:
+            return c_version.decode()
     return None
@@ -456,3 +458,21 @@ def aclsysGetCANNVersion(package_name=ACL_PKG_NAME_CANN):
         return f"{c_version.version}".lower()
     return None
+def aclsysGetVersion():
+    cann_version = aclsysGetCANNVersion()
+    if cann_version is not None:
+        return cann_version
+    with contextlib.suppress(ACLError):
+        _LoadAclLibrary()
+        c_version = create_string_buffer(ACL_PKG_VERSION_MAX_SIZE)
+        package_name = b"runtime"
+        fn = _aclGetFunctionPointer("aclsysGetVersionStr")
+        ret = fn(package_name, c_version)
+        _aclCheckReturn(ret)
+        return c_version.value.decode().lower()
+    return None

gpustack_runtime/detector/pydcmi/__init__.py CHANGED Viewed

@@ -135,8 +135,10 @@ DCMI_TOPO_TYPE_PHB = 2
 DCMI_TOPO_TYPE_HCCS = 3
 DCMI_TOPO_TYPE_PXB = 4
 DCMI_TOPO_TYPE_PIX = 5
-DCMI_TOPO_TYPE_BUTT = 6  # Unknown
-DCMI_TOPO_TYOE_MAX = 7
+DCMI_TOPO_TYPE_SIO = 6
+DCMI_TOPO_TYPE_HCCS_SW = 7
+DCMI_TOPO_TYPE_BUTT = 8  # Unknown
+DCMI_TOPO_TYOE_MAX = 9
 ## Error Codes ##

gpustack_runtime/detector/pyixml/__init__.py CHANGED Viewed

@@ -960,6 +960,14 @@ NVML_HOST_VGPU_MODE_SRIOV = 1
 # GSP firmware
 NVML_GSP_FIRMWARE_VERSION_BUF_SIZE = 0x40
+# Health
+IXML_HEALTH_SYSHUB_ERROR = 0x0000000000000001
+IXML_HEALTH_MC_ERROR = 0x0000000000000002
+IXML_HEALTH_ECC_ERROR = 0x0000000000000010
+IXML_HEALTH_MEMORY_ERROR = 0x0000000000000020
+IXML_HEALTH_PCIE_ERROR = 0x0000000000000040
+IXML_HEALTH_OK = 0x0000000000000000
 ## Error Checking ##
 class NVMLError(Exception):
@@ -5267,3 +5275,11 @@ def nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo):
     ret = fn(device, gpuFabricInfo)
     _nvmlCheckReturn(ret)
     return ret
+def ixmlDeviceGetHealth(device):
+    c_health = c_longlong()
+    fn = _nvmlGetFunctionPointer("ixmlDeviceGetHealth")
+    ret = fn(device, byref(c_health))
+    _nvmlCheckReturn(ret)
+    return c_health.value

gpustack_runtime/detector/pyrocmsmi/__init__.py CHANGED Viewed

@@ -393,3 +393,17 @@ def rsmi_is_p2p_accessible(device_a=0, device_b=0):
     )
     _rocmsmiCheckReturn(ret)
     return c_accessible.value
+def rsmi_dev_ecc_count_get(device=0, gpu_block=None):
+    if gpu_block is None:
+        gpu_block = rsmi_gpu_block_t.RSMI_GPU_BLOCK_UMC
+    c_error_count = rsmi_error_count_t()
+    fn = _rocmsmiGetFunctionPointer("rsmi_dev_ecc_count_get")
+    ret = fn(
+        device,
+        gpu_block,
+        byref(c_error_count),
+    )
+    _rocmsmiCheckReturn(ret)
+    return c_error_count

gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl

gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl