gpustack-runtime 0.1.41.post2__py3-none-any.whl → 0.1.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +3 -1
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  6. gpustack_runtime/deployer/docker.py +109 -148
  7. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +21 -3
  8. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
  9. gpustack_runtime/deployer/kuberentes.py +91 -126
  10. gpustack_runtime/deployer/podman.py +89 -122
  11. gpustack_runtime/detector/__init__.py +2 -0
  12. gpustack_runtime/detector/__types__.py +26 -0
  13. gpustack_runtime/detector/amd.py +28 -8
  14. gpustack_runtime/detector/ascend.py +49 -4
  15. gpustack_runtime/detector/cambricon.py +3 -0
  16. gpustack_runtime/detector/hygon.py +16 -1
  17. gpustack_runtime/detector/iluvatar.py +6 -0
  18. gpustack_runtime/detector/metax.py +8 -0
  19. gpustack_runtime/detector/mthreads.py +11 -0
  20. gpustack_runtime/detector/nvidia.py +139 -134
  21. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  22. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  23. gpustack_runtime/detector/thead.py +135 -127
  24. gpustack_runtime/envs.py +7 -6
  25. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
  26. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
  27. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
  28. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
  29. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/deployer/kuberentes.py

@@ -3,13 +3,11 @@ from __future__ import annotations as __future_annotations__
  import contextlib
  import json
  import logging
- import operator
  import os
  from dataclasses import dataclass, field
  from enum import Enum
- from functools import lru_cache, reduce
  from pathlib import Path
- from typing import TYPE_CHECKING, Literal
+ from typing import TYPE_CHECKING

  import kubernetes
  import kubernetes.stream.ws_client
@@ -43,7 +41,7 @@ from .__utils__ import (
  sensitive_env_var,
  validate_rfc1123_domain_name,
  )
- from .k8s.deviceplugin import cdi_kind_to_kdp_resource, is_kubelet_socket_accessible
+ from .k8s.deviceplugin import get_resource_injection_policy

  if TYPE_CHECKING:
  from collections.abc import Callable, Generator
@@ -88,17 +86,6 @@ class KubernetesWorkloadPlan(WorkloadPlan):
  Domain suffix for the cluster. Default is "cluster.local".
  service_type (KubernetesWorkloadServiceTypeEnum):
  Service type for the workload. Default is CLUSTER_IP.
- resource_key_runtime_env_mapping: (dict[str, str]):
- Mapping from resource names to environment variable names for device allocation,
- which is used to tell the Container Runtime which GPUs to mount into the container.
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
- With privileged mode, the container can access all GPUs even if specified.
- resource_key_backend_env_mapping: (dict[str, list[str]]):
- Mapping from resource names to environment variable names for device runtime,
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
  namespace (str | None):
  Namespace of the workload.
  name (str):
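
Note: both deployers drop these per-plan mapping fields in this release; the hunks further down read the resource-key mapping from envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES instead. A minimal sketch of the mapping shape, inferred only from how it is indexed in this diff; the example pairing comes from the removed docstring, and nothing below is package code:

    # Hypothetical sketch: the resource-key -> runtime-env mapping that replaces the
    # removed per-plan fields, modeled as a plain dict because the diff indexes it as
    # envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[r_k].
    resource_key_map: dict[str, str] = {
        "nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES",  # pairing taken from the removed docstring
    }

    requested = {"nvidia.com/gpu": "0,1"}
    for r_k, r_v in requested.items():
        runtime_env = resource_key_map.get(r_k)
        if runtime_env:
            print(f"{runtime_env}={r_v}")  # -> NVIDIA_VISIBLE_DEVICES=0,1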
@@ -381,22 +368,6 @@ class KubernetesDeployer(EndoscopicDeployer):

  return wrapper

- @staticmethod
- @lru_cache
- def _get_resource_injection_policy() -> Literal["env", "kdp"]:
- """
- Get the resource injection policy (in lowercase) for the deployer.
-
- Returns:
- The resource injection policy.
-
- """
- policy = envs.GPUSTACK_RUNTIME_KUBERNETES_RESOURCE_INJECTION_POLICY.lower()
- if policy != "auto":
- return policy
-
- return "kdp" if is_kubelet_socket_accessible() else "env"
-
  _create_ephemeral_configmaps(
  self,
  workload: KubernetesWorkloadPlan,
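
Note: the static method removed above is replaced by get_resource_injection_policy imported from .k8s.deviceplugin (see the import change earlier in this diff). A standalone sketch of that helper, assuming it keeps the removed method's logic; the socket path, the "auto" default, and the local stand-in for is_kubelet_socket_accessible are illustrative, not taken from the package:

    import os
    from functools import lru_cache
    from typing import Literal

    def _kubelet_socket_accessible(
        path: str = "/var/lib/kubelet/device-plugins/kubelet.sock",  # assumed kubelet device-plugin socket
    ) -> bool:
        # Stand-in for the package's is_kubelet_socket_accessible() check.
        return os.path.exists(path)

    @lru_cache
    def get_resource_injection_policy() -> Literal["env", "kdp"]:
        # Mirrors the removed static method: an explicit setting wins, "auto" probes the kubelet socket.
        policy = os.getenv(
            "GPUSTACK_RUNTIME_KUBERNETES_RESOURCE_INJECTION_POLICY", "auto"
        ).lower()
        if policy != "auto":
            return policy
        return "kdp" if _kubelet_socket_accessible() else "env"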
@@ -1008,114 +979,104 @@ class KubernetesDeployer(EndoscopicDeployer):

  # Parameterize resources
  if c.resources:
- kdp = self._get_resource_injection_policy() == "kdp"
+ kdp = get_resource_injection_policy() == "kdp"
+ fmt = "kdp" if kdp else "plain"

  resources: dict[str, str] = {}
- r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
- r_k_backend_env = workload.resource_key_backend_env_mapping or {}
- _, vd_env, vd_cdis, vd_values = self.get_visible_devices_materials()
  for r_k, r_v in c.resources.items():
  if r_k in ("cpu", "memory"):
  resources[r_k] = str(r_v)
+ continue
+
+ if (
+ r_k
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+ ):
+ # Set env if resource key is mapped.
+ runtime_envs = [
+ envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+ r_k
+ ],
+ ]
+ elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+ # Set env if auto-mapping key is matched.
+ runtime_envs = self.get_runtime_envs()
  else:
- if r_k in r_k_runtime_env:
- # Set env if resource key is mapped.
- runtime_env = [r_k_runtime_env[r_k]]
- elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
- # Set env if auto-mapping key is matched.
- runtime_env = list(vd_env.keys())
- else:
- resources[r_k] = str(r_v)
- continue
-
- if r_k in r_k_backend_env:
- # Set env if resource key is mapped.
- backend_env = r_k_backend_env[r_k]
- else:
- # Otherwise, use the default backend env names.
- backend_env = reduce(operator.add, list(vd_env.values()))
-
- privileged = (
+ resources[r_k] = str(r_v)
+ continue
+
+ privileged = (
+ container.security_context
+ and container.security_context.privileged
+ )
+ resource_values = [x.strip() for x in r_v.split(",")]
+
+ # Request devices.
+ if r_v == "all":
+ # Configure privileged.
+ container.security_context = (
  container.security_context
- and container.security_context.privileged
+ or kubernetes.client.V1SecurityContext()
  )
-
- # Configure device access environment variable.
- if r_v == "all" and backend_env:
- # Configure privileged if requested all devices.
- container.security_context = (
- container.security_context
- or kubernetes.client.V1SecurityContext()
+ container.security_context.privileged = True
+ # Request all devices.
+ for ren in runtime_envs:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
+ # Request device via KDP.
+ if kdp:
+ resources.update(
+ dict.fromkeys(r_vs, "1"),
+ )
+ continue
+ # Request device via visible devices env.
+ container.env.append(
+ kubernetes.client.V1EnvVar(
+ name=ren,
+ value=",".join(r_vs),
+ ),
  )
- container.security_context.privileged = True
- # Then, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via KDP.
- if kdp:
- for v in vd_values.get(re) or []:
- kdp_resource = cdi_kind_to_kdp_resource(
- cdi_kind=vd_cdis[re],
- device_index=v,
- )
- resources[kdp_resource] = "1"
- continue
- # Request device via visible devices env.
- rv = ",".join(vd_values.get(re) or ["all"])
- container.env.append(
- kubernetes.client.V1EnvVar(
- name=re,
- value=rv,
- ),
+ else:
+ # Request specific devices.
+ for ren in runtime_envs:
+ # Request all devices if privileged,
+ # otherwise, normalize requested devices.
+ if privileged:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
+ else:
+ r_vs = self.map_runtime_visible_devices(
+ ren,
+ resource_values,
+ fmt,
  )
- else:
- # Set env to the allocated device IDs if no privileged,
- # otherwise, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via KDP.
- if kdp:
- if not privileged:
- for v in str(r_v).split(","):
- kdp_resource = cdi_kind_to_kdp_resource(
- cdi_kind=vd_cdis[re],
- device_index=int(v.strip()),
- )
- resources[kdp_resource] = "1"
- else:
- for v in vd_values.get(re) or []:
- kdp_resource = cdi_kind_to_kdp_resource(
- cdi_kind=vd_cdis[re],
- device_index=v,
- )
- resources[kdp_resource] = "1"
- continue
- # Request device via visible devices env.
- if not privileged:
- rv = str(r_v)
- else:
- rv = ",".join(vd_values.get(re) or ["all"])
- container.env.append(
- kubernetes.client.V1EnvVar(
- name=re,
- value=rv,
- ),
+ # Request device via KDP.
+ if kdp:
+ resources.update(
+ dict.fromkeys(r_vs, "1"),
  )
+ continue
+ # Request device via visible devices env.
+ container.env.append(
+ kubernetes.client.V1EnvVar(
+ name=ren,
+ value=",".join(r_vs),
+ ),
+ )

- # Configure runtime device access environment variables.
- if r_v != "all" and privileged:
- for be in backend_env:
- container.env.append(
- kubernetes.client.V1EnvVar(
- name=be,
- value=self.align_backend_visible_devices_env_values(
- be,
- str(r_v),
- ),
- ),
+ # Configure runtime device access environment variables.
+ if r_v != "all" and privileged:
+ b_vs = self.map_backend_visible_devices(
+ runtime_envs,
+ resource_values,
+ )
+ container.env.extend(
+ [
+ kubernetes.client.V1EnvVar(
+ name=be,
+ value=be_v,
  )
+ for be, be_v in b_vs.items()
+ ],
+ )

  container.resources = kubernetes.client.V1ResourceRequirements(
  limits=(resources if resources else None),
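
Note: under the rewritten branch a device request ends up in one of two places on the container spec, depending on the injection policy. A rough illustration only; the env name, resource names, and device IDs below are made up, and the real per-device resource naming comes from the device plugin, which this diff does not show:

    import kubernetes

    # "env" policy: the runtime visible-devices variable carries the allocation.
    env_container = kubernetes.client.V1Container(
        name="worker",
        env=[kubernetes.client.V1EnvVar(name="NVIDIA_VISIBLE_DEVICES", value="0,1")],
    )

    # "kdp" policy: each allocated device becomes its own extended-resource limit set to "1",
    # matching resources.update(dict.fromkeys(r_vs, "1")) above.
    kdp_container = kubernetes.client.V1Container(
        name="worker",
        resources=kubernetes.client.V1ResourceRequirements(
            limits={"example.com/device-0": "1", "example.com/device-1": "1"},
        ),
    )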
@@ -1245,6 +1206,10 @@ class KubernetesDeployer(EndoscopicDeployer):
  self._client = self._get_client()
  self._node_name = envs.GPUSTACK_RUNTIME_KUBERNETES_NODE_NAME

+ @property
+ def allowed_uuid_values(self) -> bool:
+ return get_resource_injection_policy() != "kdp"
+
  def _prepare_mirrored_deployment(self):
  """
  Prepare for mirrored deployment.
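
Note: the new allowed_uuid_values property gates UUID-style device values on the injection policy, presumably because the KDP path requests devices as kubelet resources rather than passing identifiers through an env var. A trivial sketch of the relationship, not package code:

    def allowed_uuid_values(policy: str) -> bool:
        # True for "env" (and any non-"kdp" policy), False for "kdp".
        return policy != "kdp"

    print(allowed_uuid_values("env"), allowed_uuid_values("kdp"))  # True False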
gpustack_runtime/deployer/podman.py

@@ -4,13 +4,11 @@ import contextlib
  import io
  import json
  import logging
- import operator
  import os
  import socket
  import sys
  import tarfile
  from dataclasses import dataclass, field
- from functools import reduce
  from math import ceil
  from pathlib import Path
  from typing import TYPE_CHECKING, Any
@@ -84,17 +82,6 @@ class PodmanWorkloadPlan(WorkloadPlan):
  Image used for the pause container.
  unhealthy_restart_image (str):
  Image used for unhealthy restart container.
- resource_key_runtime_env_mapping: (dict[str, str]):
- Mapping from resource names to environment variable names for device allocation,
- which is used to tell the Container Runtime which GPUs to mount into the container.
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
- With privileged mode, the container can access all GPUs even if specified.
- resource_key_backend_env_mapping: (dict[str, list[str]]):
- Mapping from resource names to environment variable names for device runtime,
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
  namespace (str | None):
  Namespace of the workload.
  name (str):
@@ -952,120 +939,100 @@ class PodmanDeployer(EndoscopicDeployer):

  # Parameterize resources.
  if c.resources:
- r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
- r_k_backend_env = workload.resource_key_backend_env_mapping or {}
- vd_manus, vd_env, vd_cdis, vd_values = (
- self.get_visible_devices_materials()
- )
+ fmt = "cdi"
+
  for r_k, r_v in c.resources.items():
- match r_k:
- case "cpu":
- if isinstance(r_v, int | float):
- create_options["cpu_shares"] = ceil(r_v * 1024)
- elif isinstance(r_v, str) and r_v.isdigit():
- create_options["cpu_shares"] = ceil(float(r_v) * 1024)
- case "memory":
- if isinstance(r_v, int):
- create_options["mem_limit"] = r_v
- create_options["mem_reservation"] = r_v
- create_options["memswap_limit"] = r_v
- elif isinstance(r_v, str):
- v = r_v.lower().removesuffix("i")
- create_options["mem_limit"] = v
- create_options["mem_reservation"] = v
- create_options["memswap_limit"] = v
- case _:
- if r_k in r_k_runtime_env:
- # Set env if resource key is mapped.
- runtime_env = [r_k_runtime_env[r_k]]
- elif (
- r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
- ):
- # Set env if auto-mapping key is matched.
- runtime_env = list(vd_env.keys())
- else:
- continue
+ if r_k == "cpu":
+ if isinstance(r_v, int | float):
+ create_options["cpu_shares"] = ceil(r_v * 1024)
+ elif isinstance(r_v, str) and r_v.isdigit():
+ create_options["cpu_shares"] = ceil(float(r_v) * 1024)
+ continue
+ if r_k == "memory":
+ if isinstance(r_v, int):
+ create_options["mem_limit"] = r_v
+ create_options["mem_reservation"] = r_v
+ create_options["memswap_limit"] = r_v
+ elif isinstance(r_v, str):
+ v = r_v.lower().removesuffix("i")
+ create_options["mem_limit"] = v
+ create_options["mem_reservation"] = v
+ create_options["memswap_limit"] = v
+ continue

- if r_k in r_k_backend_env:
- # Set env if resource key is mapped.
- backend_env = r_k_backend_env[r_k]
- else:
- # Otherwise, use the default backend env names.
- backend_env = reduce(
- operator.add,
- list(vd_env.values()),
- )
+ if (
+ r_k
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+ ):
+ # Set env if resource key is mapped.
+ runtime_envs = [
+ envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+ r_k
+ ],
+ ]
+ elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+ # Set env if auto-mapping key is matched.
+ runtime_envs = self.get_runtime_envs()
+ else:
+ continue

- privileged = create_options.get("privileged", False)
-
- # Generate CDI config if not yet.
- if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
- for re in runtime_env:
- cdi_dump_config(
- manufacturer=vd_manus[re],
- output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
- )
-
- # Configure device access environment variable.
- if r_v == "all" and backend_env:
- # Configure privileged if requested all devices.
- create_options["privileged"] = True
- # Then, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via CDI.
- rv = [
- f"{vd_cdis[re]}={v}"
- for v in (vd_values.get(re) or ["all"])
- ]
- if "devices" not in create_options:
- create_options["devices"] = []
- create_options["devices"].extend(rv)
+ privileged = create_options.get("privileged", False)
+ resource_values = [x.strip() for x in r_v.split(",")]
+
+ # Generate CDI config if not yet.
+ if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
+ for ren in runtime_envs:
+ r_m = self.get_manufacturer(ren)
+ cdi_dump_config(
+ manufacturer=r_m,
+ output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+ )
+
+ # Configure device access environment variable.
+ if r_v == "all":
+ # Configure privileged.
+ create_options["privileged"] = True
+ # Request all devices.
+ for ren in runtime_envs:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
+ # Request device via CDI.
+ if "devices" not in create_options:
+ create_options["devices"] = []
+ create_options["devices"].extend(r_vs)
+ else:
+ # Request specific devices.
+ for ren in runtime_envs:
+ # Request all devices if privileged,
+ # otherwise, normalize requested devices.
+ if privileged:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
  else:
- # Set env to the allocated device IDs if no privileged,
- # otherwise, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via CDI.
- if not privileged:
- rv = [
- f"{vd_cdis[re]}={v.strip()}"
- for v in r_v.split(",")
- ]
- else:
- rv = [
- f"{vd_cdis[re]}={v}"
- for v in (vd_values.get(re) or ["all"])
- ]
- if "devices" not in create_options:
- create_options["devices"] = []
- create_options["devices"].extend(rv)
-
- # Configure runtime device access environment variables.
- if r_v != "all" and privileged:
- for be in backend_env:
- create_options["environment"][be] = (
- self.align_backend_visible_devices_env_values(
- be,
- str(r_v),
- )
- )
-
- # Configure affinity if applicable.
- if (
- envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
- or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
- ):
- cpus, numas = self.get_visible_devices_affinities(
- runtime_env,
- r_v,
+ r_vs = self.map_runtime_visible_devices(
+ ren,
+ resource_values,
+ fmt,
  )
- if cpus:
- create_options["cpuset_cpus"] = cpus
- if numas and envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
- create_options["cpuset_mems"] = numas
+ # Request device via CDI.
+ if "devices" not in create_options:
+ create_options["devices"] = []
+ create_options["devices"].extend(r_vs)
+
+ # If not requesting all devices but privileged,
+ # must configure visible devices.
+ if r_v != "all" and privileged:
+ b_vs = self.map_backend_visible_devices(
+ runtime_envs,
+ resource_values,
+ )
+ create_options["environment"].update(b_vs)
+
+ # Configure affinity if applicable.
+ create_options.update(
+ self.map_visible_devices_affinities(
+ runtime_envs,
+ resource_values,
+ ),
+ )

  # Parameterize mounts.
  self._append_container_mounts(
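
Note: with fmt = "cdi" the Podman path now asks the deployer helpers for fully qualified CDI device references and appends them to create_options["devices"], where the old code built them by hand as f"{cdi_kind}={index}". An illustration of that reference form only; the kind and indices below are made up:

    create_options: dict = {"environment": {}}
    # CDI device references take the form "<kind>=<device>", i.e. a CDI kind plus a device index or name.
    cdi_refs = [f"nvidia.com/gpu={i}" for i in (0, 1)]
    create_options.setdefault("devices", []).extend(cdi_refs)
    print(create_options["devices"])  # ['nvidia.com/gpu=0', 'nvidia.com/gpu=1']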
gpustack_runtime/detector/__init__.py

@@ -7,6 +7,7 @@ from ..logging import debug_log_exception
  from .__types__ import (
  Detector,
  Device,
+ DeviceMemoryStatusEnum,
  Devices,
  ManufacturerEnum,
  Topology,
@@ -292,6 +293,7 @@ def filter_devices_by_manufacturer(

  __all__ = [
  "Device",
+ "DeviceMemoryStatusEnum",
  "Devices",
  "ManufacturerEnum",
  "Topology",
gpustack_runtime/detector/__types__.py

@@ -122,6 +122,28 @@ def backend_to_manufacturer(backend: str) -> ManufacturerEnum:
  return ManufacturerEnum.UNKNOWN


+ class DeviceMemoryStatusEnum(str, Enum):
+ """
+ Enum for Device Memory Status.
+ """
+
+ HEALTHY = "healthy"
+ """
+ Device is healthy.
+ """
+ UNHEALTHY = "unhealthy"
+ """
+ Device is unhealthy.
+ """
+ UNKNOWN = "unknown"
+ """
+ Device status is unknown.
+ """
+
+ def __str__(self):
+ return self.value
+
+
  @dataclass_json
  @dataclass
  class Device:
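
Note: the new enum is also re-exported from gpustack_runtime.detector (see the __init__ and __all__ hunks above), so callers can check the new memory_status field directly. A minimal usage sketch:

    from gpustack_runtime.detector import DeviceMemoryStatusEnum

    status = DeviceMemoryStatusEnum.UNHEALTHY
    if status is DeviceMemoryStatusEnum.UNHEALTHY:
        print("memory status:", str(status))  # __str__ returns the value: "unhealthy"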
@@ -185,6 +207,10 @@ class Device:
  """
  Memory utilization of the device in percentage.
  """
+ memory_status: DeviceMemoryStatusEnum = DeviceMemoryStatusEnum.UNKNOWN
+ """
+ Status of the device.
+ """
  temperature: int | float | None = None
  """
  Temperature of the device in Celsius.
gpustack_runtime/detector/amd.py

@@ -8,7 +8,14 @@ from pathlib import Path
  from .. import envs
  from ..logging import debug_log_exception, debug_log_warning
  from . import Topology, pyamdgpu, pyamdsmi, pyhsa, pyrocmcore, pyrocmsmi
- from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
+ from .__types__ import (
+ Detector,
+ Device,
+ DeviceMemoryStatusEnum,
+ Devices,
+ ManufacturerEnum,
+ TopologyDistanceEnum,
+ )
  from .__utils__ import (
  PCIDevice,
  byte_to_mebibyte,
@@ -165,20 +172,32 @@ class AMDDetector(Detector):
  )
  dev_cores_util = 0

- dev_mem = None
- dev_mem_used = None
+ dev_mem = 0
+ dev_mem_used = 0
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
  try:
  dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
  dev_mem = dev_gpu_vram_usage.get("vram_total")
  dev_mem_used = dev_gpu_vram_usage.get("vram_used")
+ dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count(
+ dev,
+ pyamdsmi.AmdSmiGpuBlock.UMC,
+ )
+ if dev_ecc_count.get("uncorrectable_count", 0) > 0:
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
  except pyamdsmi.AmdSmiException:
+ dev_mem = byte_to_mebibyte( # byte to MiB
+ pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
+ )
+ dev_mem_used = byte_to_mebibyte( # byte to MiB
+ pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
+ )
  with contextlib.suppress(pyrocmsmi.ROCMSMIError):
- dev_mem = byte_to_mebibyte( # byte to MiB
- pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
- )
- dev_mem_used = byte_to_mebibyte( # byte to MiB
- pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
+ dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
+ dev_idx,
  )
+ if dev_ecc_count.uncorrectable_err > 0:
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY

  dev_power = None
  dev_power_used = None
@@ -232,6 +251,7 @@ class AMDDetector(Detector):
  memory=dev_mem,
  memory_used=dev_mem_used,
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
+ memory_status=dev_mem_status,
  temperature=dev_temp,
  power=dev_power,
  power_used=dev_power_used,
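
Note: the AMD detector now derives memory_status from uncorrectable ECC counters, via amdsmi_get_gpu_ecc_count on the UMC block when amdsmi is available and via rsmi_dev_ecc_count_get as the rocm-smi fallback. A standalone sketch of the same decision rule with the SMI calls stubbed out, since only the uncorrectable counters are consulted in this diff:

    from gpustack_runtime.detector import DeviceMemoryStatusEnum

    def memory_status_from_ecc(uncorrectable_count: int) -> DeviceMemoryStatusEnum:
        # Any uncorrectable ECC error reported for the memory block marks the device memory unhealthy.
        if uncorrectable_count > 0:
            return DeviceMemoryStatusEnum.UNHEALTHY
        return DeviceMemoryStatusEnum.HEALTHY

    print(memory_status_from_ecc(0), memory_status_from_ecc(3))  # healthy unhealthy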