gpustack-runtime 0.1.41.post3-py3-none-any.whl → 0.1.42-py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in the public registry.
Files changed (29)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +3 -1
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  6. gpustack_runtime/deployer/docker.py +109 -148
  7. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +1 -1
  8. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
  9. gpustack_runtime/deployer/kuberentes.py +89 -108
  10. gpustack_runtime/deployer/podman.py +89 -122
  11. gpustack_runtime/detector/__init__.py +2 -0
  12. gpustack_runtime/detector/__types__.py +26 -0
  13. gpustack_runtime/detector/amd.py +28 -8
  14. gpustack_runtime/detector/ascend.py +49 -4
  15. gpustack_runtime/detector/cambricon.py +3 -0
  16. gpustack_runtime/detector/hygon.py +16 -1
  17. gpustack_runtime/detector/iluvatar.py +6 -0
  18. gpustack_runtime/detector/metax.py +8 -0
  19. gpustack_runtime/detector/mthreads.py +11 -0
  20. gpustack_runtime/detector/nvidia.py +139 -134
  21. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  22. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  23. gpustack_runtime/detector/thead.py +135 -127
  24. gpustack_runtime/envs.py +7 -6
  25. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
  26. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
  27. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
  28. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
  29. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
@@ -3,11 +3,9 @@ from __future__ import annotations as __future_annotations__
  import contextlib
  import json
  import logging
- import operator
  import os
  from dataclasses import dataclass, field
  from enum import Enum
- from functools import reduce
  from pathlib import Path
  from typing import TYPE_CHECKING

@@ -43,7 +41,7 @@ from .__utils__ import (
  sensitive_env_var,
  validate_rfc1123_domain_name,
  )
- from .k8s.deviceplugin import cdi_kind_to_kdp_resource, get_resource_injection_policy
+ from .k8s.deviceplugin import get_resource_injection_policy

  if TYPE_CHECKING:
  from collections.abc import Callable, Generator
@@ -88,17 +86,6 @@ class KubernetesWorkloadPlan(WorkloadPlan):
  Domain suffix for the cluster. Default is "cluster.local".
  service_type (KubernetesWorkloadServiceTypeEnum):
  Service type for the workload. Default is CLUSTER_IP.
- resource_key_runtime_env_mapping: (dict[str, str]):
- Mapping from resource names to environment variable names for device allocation,
- which is used to tell the Container Runtime which GPUs to mount into the container.
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
- With privileged mode, the container can access all GPUs even if specified.
- resource_key_backend_env_mapping: (dict[str, list[str]]):
- Mapping from resource names to environment variable names for device runtime,
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
  namespace (str | None):
  Namespace of the workload.
  name (str):
@@ -993,113 +980,103 @@ class KubernetesDeployer(EndoscopicDeployer):
  # Parameterize resources
  if c.resources:
  kdp = get_resource_injection_policy() == "kdp"
+ fmt = "kdp" if kdp else "plain"

  resources: dict[str, str] = {}
- r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
- r_k_backend_env = workload.resource_key_backend_env_mapping or {}
- _, vd_env, vd_cdis, vd_values = self.get_visible_devices_materials()
  for r_k, r_v in c.resources.items():
  if r_k in ("cpu", "memory"):
  resources[r_k] = str(r_v)
+ continue
+
+ if (
+ r_k
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+ ):
+ # Set env if resource key is mapped.
+ runtime_envs = [
+ envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+ r_k
+ ],
+ ]
+ elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+ # Set env if auto-mapping key is matched.
+ runtime_envs = self.get_runtime_envs()
  else:
- if r_k in r_k_runtime_env:
- # Set env if resource key is mapped.
- runtime_env = [r_k_runtime_env[r_k]]
- elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
- # Set env if auto-mapping key is matched.
- runtime_env = list(vd_env.keys())
- else:
- resources[r_k] = str(r_v)
- continue
-
- if r_k in r_k_backend_env:
- # Set env if resource key is mapped.
- backend_env = r_k_backend_env[r_k]
- else:
- # Otherwise, use the default backend env names.
- backend_env = reduce(operator.add, list(vd_env.values()))
-
- privileged = (
+ resources[r_k] = str(r_v)
+ continue
+
+ privileged = (
+ container.security_context
+ and container.security_context.privileged
+ )
+ resource_values = [x.strip() for x in r_v.split(",")]
+
+ # Request devices.
+ if r_v == "all":
+ # Configure privileged.
+ container.security_context = (
  container.security_context
- and container.security_context.privileged
+ or kubernetes.client.V1SecurityContext()
  )
-
- # Configure device access environment variable.
- if r_v == "all" and backend_env:
- # Configure privileged if requested all devices.
- container.security_context = (
- container.security_context
- or kubernetes.client.V1SecurityContext()
+ container.security_context.privileged = True
+ # Request all devices.
+ for ren in runtime_envs:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
+ # Request device via KDP.
+ if kdp:
+ resources.update(
+ dict.fromkeys(r_vs, "1"),
+ )
+ continue
+ # Request device via visible devices env.
+ container.env.append(
+ kubernetes.client.V1EnvVar(
+ name=ren,
+ value=",".join(r_vs),
+ ),
  )
- container.security_context.privileged = True
- # Then, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via KDP.
- if kdp:
- for v in vd_values.get(re) or []:
- kdp_resource = cdi_kind_to_kdp_resource(
- cdi_kind=vd_cdis[re],
- device_index=v,
- )
- resources[kdp_resource] = "1"
- continue
- # Request device via visible devices env.
- rv = ",".join(vd_values.get(re) or ["all"])
- container.env.append(
- kubernetes.client.V1EnvVar(
- name=re,
- value=rv,
- ),
+ else:
+ # Request specific devices.
+ for ren in runtime_envs:
+ # Request all devices if privileged,
+ # otherwise, normalize requested devices.
+ if privileged:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
+ else:
+ r_vs = self.map_runtime_visible_devices(
+ ren,
+ resource_values,
+ fmt,
  )
- else:
- # Set env to the allocated device IDs if no privileged,
- # otherwise, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via KDP.
- if kdp:
- if not privileged:
- for v in str(r_v).split(","):
- kdp_resource = cdi_kind_to_kdp_resource(
- cdi_kind=vd_cdis[re],
- device_index=int(v.strip()),
- )
- resources[kdp_resource] = "1"
- else:
- for v in vd_values.get(re) or []:
- kdp_resource = cdi_kind_to_kdp_resource(
- cdi_kind=vd_cdis[re],
- device_index=v,
- )
- resources[kdp_resource] = "1"
- continue
- # Request device via visible devices env.
- if not privileged:
- rv = str(r_v)
- else:
- rv = ",".join(vd_values.get(re) or ["all"])
- container.env.append(
- kubernetes.client.V1EnvVar(
- name=re,
- value=rv,
- ),
+ # Request device via KDP.
+ if kdp:
+ resources.update(
+ dict.fromkeys(r_vs, "1"),
  )
+ continue
+ # Request device via visible devices env.
+ container.env.append(
+ kubernetes.client.V1EnvVar(
+ name=ren,
+ value=",".join(r_vs),
+ ),
+ )

- # Configure runtime device access environment variables.
- if r_v != "all" and privileged:
- for be in backend_env:
- container.env.append(
- kubernetes.client.V1EnvVar(
- name=be,
- value=self.align_backend_visible_devices_env_values(
- be,
- str(r_v),
- ),
- ),
+ # Configure runtime device access environment variables.
+ if r_v != "all" and privileged:
+ b_vs = self.map_backend_visible_devices(
+ runtime_envs,
+ resource_values,
+ )
+ container.env.extend(
+ [
+ kubernetes.client.V1EnvVar(
+ name=be,
+ value=be_v,
  )
+ for be, be_v in b_vs.items()
+ ],
+ )

  container.resources = kubernetes.client.V1ResourceRequirements(
  limits=(resources if resources else None),
@@ -1229,6 +1206,10 @@ class KubernetesDeployer(EndoscopicDeployer):
  self._client = self._get_client()
  self._node_name = envs.GPUSTACK_RUNTIME_KUBERNETES_NODE_NAME

+ @property
+ def allowed_uuid_values(self) -> bool:
+ return get_resource_injection_policy() != "kdp"
+
  def _prepare_mirrored_deployment(self):
  """
  Prepare for mirrored deployment.
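The net effect of the Kubernetes hunk above is easier to see outside the diff: a container resource key now resolves to runtime visible-devices environment variables through envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES (or, for the auto-map key, through the deployer's get_runtime_envs()), and the resolved devices are injected either as KDP resource requests or as an env var value. The following standalone sketch illustrates that decision; the mapping values, the auto-map key, and the helper function are assumptions for illustration, not the package's API.

# Minimal sketch of the resource-key resolution; the values below are hypothetical.
RESOURCE_KEY_MAP = {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"}  # stand-in for the env-driven map
AUTOMAP_KEY = "gpustack.dev/auto"  # stand-in for GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
DEFAULT_RUNTIME_ENVS = ["NVIDIA_VISIBLE_DEVICES"]  # stand-in for self.get_runtime_envs()
KDP = False  # mirrors get_resource_injection_policy() == "kdp"


def inject_devices(resource_key: str, resource_value: str) -> dict:
    """Return the KDP resource requests or env assignment for one resource entry."""
    if resource_key in RESOURCE_KEY_MAP:
        runtime_envs = [RESOURCE_KEY_MAP[resource_key]]
    elif resource_key == AUTOMAP_KEY:
        runtime_envs = list(DEFAULT_RUNTIME_ENVS)
    else:
        # Unknown keys stay as plain Kubernetes resource limits.
        return {"resources": {resource_key: resource_value}}

    devices = [v.strip() for v in resource_value.split(",")]
    if KDP:
        # With KDP, each device becomes a per-device resource request of quantity "1"
        # (the real code derives the resource names via get/map_runtime_visible_devices).
        return {"resources": dict.fromkeys(devices, "1")}
    # Otherwise, the devices are joined into the visible-devices env var.
    return {"env": {env: ",".join(devices) for env in runtime_envs}}


print(inject_devices("nvidia.com/gpu", "0,1"))
# -> {'env': {'NVIDIA_VISIBLE_DEVICES': '0,1'}}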
@@ -4,13 +4,11 @@ import contextlib
  import io
  import json
  import logging
- import operator
  import os
  import socket
  import sys
  import tarfile
  from dataclasses import dataclass, field
- from functools import reduce
  from math import ceil
  from pathlib import Path
  from typing import TYPE_CHECKING, Any
@@ -84,17 +82,6 @@ class PodmanWorkloadPlan(WorkloadPlan):
  Image used for the pause container.
  unhealthy_restart_image (str):
  Image used for unhealthy restart container.
- resource_key_runtime_env_mapping: (dict[str, str]):
- Mapping from resource names to environment variable names for device allocation,
- which is used to tell the Container Runtime which GPUs to mount into the container.
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
- With privileged mode, the container can access all GPUs even if specified.
- resource_key_backend_env_mapping: (dict[str, list[str]]):
- Mapping from resource names to environment variable names for device runtime,
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
  namespace (str | None):
  Namespace of the workload.
  name (str):
@@ -952,120 +939,100 @@ class PodmanDeployer(EndoscopicDeployer):

  # Parameterize resources.
  if c.resources:
- r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
- r_k_backend_env = workload.resource_key_backend_env_mapping or {}
- vd_manus, vd_env, vd_cdis, vd_values = (
- self.get_visible_devices_materials()
- )
+ fmt = "cdi"
+
  for r_k, r_v in c.resources.items():
- match r_k:
- case "cpu":
- if isinstance(r_v, int | float):
- create_options["cpu_shares"] = ceil(r_v * 1024)
- elif isinstance(r_v, str) and r_v.isdigit():
- create_options["cpu_shares"] = ceil(float(r_v) * 1024)
- case "memory":
- if isinstance(r_v, int):
- create_options["mem_limit"] = r_v
- create_options["mem_reservation"] = r_v
- create_options["memswap_limit"] = r_v
- elif isinstance(r_v, str):
- v = r_v.lower().removesuffix("i")
- create_options["mem_limit"] = v
- create_options["mem_reservation"] = v
- create_options["memswap_limit"] = v
- case _:
- if r_k in r_k_runtime_env:
- # Set env if resource key is mapped.
- runtime_env = [r_k_runtime_env[r_k]]
- elif (
- r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
- ):
- # Set env if auto-mapping key is matched.
- runtime_env = list(vd_env.keys())
- else:
- continue
+ if r_k == "cpu":
+ if isinstance(r_v, int | float):
+ create_options["cpu_shares"] = ceil(r_v * 1024)
+ elif isinstance(r_v, str) and r_v.isdigit():
+ create_options["cpu_shares"] = ceil(float(r_v) * 1024)
+ continue
+ if r_k == "memory":
+ if isinstance(r_v, int):
+ create_options["mem_limit"] = r_v
+ create_options["mem_reservation"] = r_v
+ create_options["memswap_limit"] = r_v
+ elif isinstance(r_v, str):
+ v = r_v.lower().removesuffix("i")
+ create_options["mem_limit"] = v
+ create_options["mem_reservation"] = v
+ create_options["memswap_limit"] = v
+ continue

- if r_k in r_k_backend_env:
- # Set env if resource key is mapped.
- backend_env = r_k_backend_env[r_k]
- else:
- # Otherwise, use the default backend env names.
- backend_env = reduce(
- operator.add,
- list(vd_env.values()),
- )
+ if (
+ r_k
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+ ):
+ # Set env if resource key is mapped.
+ runtime_envs = [
+ envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+ r_k
+ ],
+ ]
+ elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+ # Set env if auto-mapping key is matched.
+ runtime_envs = self.get_runtime_envs()
+ else:
+ continue

- privileged = create_options.get("privileged", False)
-
- # Generate CDI config if not yet.
- if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
- for re in runtime_env:
- cdi_dump_config(
- manufacturer=vd_manus[re],
- output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
- )
-
- # Configure device access environment variable.
- if r_v == "all" and backend_env:
- # Configure privileged if requested all devices.
- create_options["privileged"] = True
- # Then, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via CDI.
- rv = [
- f"{vd_cdis[re]}={v}"
- for v in (vd_values.get(re) or ["all"])
- ]
- if "devices" not in create_options:
- create_options["devices"] = []
- create_options["devices"].extend(rv)
+ privileged = create_options.get("privileged", False)
+ resource_values = [x.strip() for x in r_v.split(",")]
+
+ # Generate CDI config if not yet.
+ if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
+ for ren in runtime_envs:
+ r_m = self.get_manufacturer(ren)
+ cdi_dump_config(
+ manufacturer=r_m,
+ output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+ )
+
+ # Configure device access environment variable.
+ if r_v == "all":
+ # Configure privileged.
+ create_options["privileged"] = True
+ # Request all devices.
+ for ren in runtime_envs:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
+ # Request device via CDI.
+ if "devices" not in create_options:
+ create_options["devices"] = []
+ create_options["devices"].extend(r_vs)
+ else:
+ # Request specific devices.
+ for ren in runtime_envs:
+ # Request all devices if privileged,
+ # otherwise, normalize requested devices.
+ if privileged:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
  else:
- # Set env to the allocated device IDs if no privileged,
- # otherwise, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via CDI.
- if not privileged:
- rv = [
- f"{vd_cdis[re]}={v.strip()}"
- for v in r_v.split(",")
- ]
- else:
- rv = [
- f"{vd_cdis[re]}={v}"
- for v in (vd_values.get(re) or ["all"])
- ]
- if "devices" not in create_options:
- create_options["devices"] = []
- create_options["devices"].extend(rv)
-
- # Configure runtime device access environment variables.
- if r_v != "all" and privileged:
- for be in backend_env:
- create_options["environment"][be] = (
- self.align_backend_visible_devices_env_values(
- be,
- str(r_v),
- )
- )
-
- # Configure affinity if applicable.
- if (
- envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
- or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
- ):
- cpus, numas = self.get_visible_devices_affinities(
- runtime_env,
- r_v,
+ r_vs = self.map_runtime_visible_devices(
+ ren,
+ resource_values,
+ fmt,
  )
- if cpus:
- create_options["cpuset_cpus"] = cpus
- if numas and envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
- create_options["cpuset_mems"] = numas
+ # Request device via CDI.
+ if "devices" not in create_options:
+ create_options["devices"] = []
+ create_options["devices"].extend(r_vs)
+
+ # If not requesting all devices but privileged,
+ # must configure visible devices.
+ if r_v != "all" and privileged:
+ b_vs = self.map_backend_visible_devices(
+ runtime_envs,
+ resource_values,
+ )
+ create_options["environment"].update(b_vs)
+
+ # Configure affinity if applicable.
+ create_options.update(
+ self.map_visible_devices_affinities(
+ runtime_envs,
+ resource_values,
+ ),
+ )

  # Parameterize mounts.
  self._append_container_mounts(
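On the Podman side, devices are now always requested through CDI (fmt = "cdi"), so each entry appended to create_options["devices"] is a fully qualified CDI device name of the form kind=device. A quick sketch with an assumed CDI kind; the real kind and device names come from the generated CDI specs (see cdi_dump_config above), not from these hardcoded values.

# Sketch only; "nvidia.com/gpu" is a hypothetical CDI kind for illustration.
cdi_kind = "nvidia.com/gpu"
requested = ["0", "1"]  # parsed from a resource value such as "0,1"

create_options = {"devices": []}
# Build fully qualified CDI device names, e.g. "nvidia.com/gpu=0".
create_options["devices"].extend(f"{cdi_kind}={idx}" for idx in requested)

print(create_options)  # {'devices': ['nvidia.com/gpu=0', 'nvidia.com/gpu=1']}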
@@ -7,6 +7,7 @@ from ..logging import debug_log_exception
  from .__types__ import (
  Detector,
  Device,
+ DeviceMemoryStatusEnum,
  Devices,
  ManufacturerEnum,
  Topology,
@@ -292,6 +293,7 @@ def filter_devices_by_manufacturer(

  __all__ = [
  "Device",
+ "DeviceMemoryStatusEnum",
  "Devices",
  "ManufacturerEnum",
  "Topology",
@@ -122,6 +122,28 @@ def backend_to_manufacturer(backend: str) -> ManufacturerEnum:
  return ManufacturerEnum.UNKNOWN


+ class DeviceMemoryStatusEnum(str, Enum):
+ """
+ Enum for Device Memory Status.
+ """
+
+ HEALTHY = "healthy"
+ """
+ Device is healthy.
+ """
+ UNHEALTHY = "unhealthy"
+ """
+ Device is unhealthy.
+ """
+ UNKNOWN = "unknown"
+ """
+ Device status is unknown.
+ """
+
+ def __str__(self):
+ return self.value
+
+
  @dataclass_json
  @dataclass
  class Device:
@@ -185,6 +207,10 @@ class Device:
  """
  Memory utilization of the device in percentage.
  """
+ memory_status: DeviceMemoryStatusEnum = DeviceMemoryStatusEnum.UNKNOWN
+ """
+ Status of the device.
+ """
  temperature: int | float | None = None
  """
  Temperature of the device in Celsius.
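The new memory_status field is additive for API consumers: Device instances default to DeviceMemoryStatusEnum.UNKNOWN, and because the enum subclasses str, it compares and serializes like a plain string. A small consumer-side sketch; the filter helper here is illustrative, not part of the package.

from gpustack_runtime.detector import DeviceMemoryStatusEnum


def unhealthy_memory(devices) -> list:
    """Return devices whose memory reported problems.

    `devices` is expected to be an iterable of gpustack_runtime Device objects;
    anything exposing a `memory_status` attribute works for this sketch.
    """
    return [d for d in devices if d.memory_status == DeviceMemoryStatusEnum.UNHEALTHY]


# Since the enum derives from str and __str__ returns the value,
# it also compares and prints as a plain string:
assert DeviceMemoryStatusEnum.UNHEALTHY == "unhealthy"
assert str(DeviceMemoryStatusEnum.UNKNOWN) == "unknown"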
@@ -8,7 +8,14 @@ from pathlib import Path
  from .. import envs
  from ..logging import debug_log_exception, debug_log_warning
  from . import Topology, pyamdgpu, pyamdsmi, pyhsa, pyrocmcore, pyrocmsmi
- from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
+ from .__types__ import (
+ Detector,
+ Device,
+ DeviceMemoryStatusEnum,
+ Devices,
+ ManufacturerEnum,
+ TopologyDistanceEnum,
+ )
  from .__utils__ import (
  PCIDevice,
  byte_to_mebibyte,
@@ -165,20 +172,32 @@ class AMDDetector(Detector):
  )
  dev_cores_util = 0

- dev_mem = None
- dev_mem_used = None
+ dev_mem = 0
+ dev_mem_used = 0
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
  try:
  dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
  dev_mem = dev_gpu_vram_usage.get("vram_total")
  dev_mem_used = dev_gpu_vram_usage.get("vram_used")
+ dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count(
+ dev,
+ pyamdsmi.AmdSmiGpuBlock.UMC,
+ )
+ if dev_ecc_count.get("uncorrectable_count", 0) > 0:
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
  except pyamdsmi.AmdSmiException:
+ dev_mem = byte_to_mebibyte(  # byte to MiB
+ pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
+ )
+ dev_mem_used = byte_to_mebibyte(  # byte to MiB
+ pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
+ )
  with contextlib.suppress(pyrocmsmi.ROCMSMIError):
- dev_mem = byte_to_mebibyte(  # byte to MiB
- pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
- )
- dev_mem_used = byte_to_mebibyte(  # byte to MiB
- pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
+ dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
+ dev_idx,
  )
+ if dev_ecc_count.uncorrectable_err > 0:
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY

  dev_power = None
  dev_power_used = None
@@ -232,6 +251,7 @@ class AMDDetector(Detector):
  memory=dev_mem,
  memory_used=dev_mem_used,
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
+ memory_status=dev_mem_status,
  temperature=dev_temp,
  power=dev_power,
  power_used=dev_power_used,
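The AMD detector now reads ECC counters for the memory (UMC) block, first via amdsmi and, if that path fails, via the rocm-smi bindings, and marks device memory unhealthy when any uncorrectable errors are reported; otherwise the status stays healthy. The decision reduces to a threshold check, sketched below as a standalone helper that is not part of the package.

from gpustack_runtime.detector import DeviceMemoryStatusEnum


def memory_status_from_ecc(uncorrectable_errors: int) -> DeviceMemoryStatusEnum:
    """Healthy unless the memory block reports uncorrectable ECC errors.

    Mirrors the check in the diff: amdsmi exposes the count as
    ecc_count["uncorrectable_count"], rocm-smi as ecc_count.uncorrectable_err.
    """
    if uncorrectable_errors > 0:
        return DeviceMemoryStatusEnum.UNHEALTHY
    return DeviceMemoryStatusEnum.HEALTHY


assert memory_status_from_ecc(0) is DeviceMemoryStatusEnum.HEALTHY
assert memory_status_from_ecc(2) is DeviceMemoryStatusEnum.UNHEALTHY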