gpustack-runtime 0.1.41.post2__py3-none-any.whl → 0.1.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +3 -1
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  6. gpustack_runtime/deployer/docker.py +109 -148
  7. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +21 -3
  8. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
  9. gpustack_runtime/deployer/kuberentes.py +91 -126
  10. gpustack_runtime/deployer/podman.py +89 -122
  11. gpustack_runtime/detector/__init__.py +2 -0
  12. gpustack_runtime/detector/__types__.py +26 -0
  13. gpustack_runtime/detector/amd.py +28 -8
  14. gpustack_runtime/detector/ascend.py +49 -4
  15. gpustack_runtime/detector/cambricon.py +3 -0
  16. gpustack_runtime/detector/hygon.py +16 -1
  17. gpustack_runtime/detector/iluvatar.py +6 -0
  18. gpustack_runtime/detector/metax.py +8 -0
  19. gpustack_runtime/detector/mthreads.py +11 -0
  20. gpustack_runtime/detector/nvidia.py +139 -134
  21. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  22. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  23. gpustack_runtime/detector/thead.py +135 -127
  24. gpustack_runtime/envs.py +7 -6
  25. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
  26. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
  27. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
  28. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
  29. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/deployer/kuberentes.py

@@ -3,13 +3,11 @@ from __future__ import annotations as __future_annotations__
  import contextlib
  import json
  import logging
- import operator
  import os
  from dataclasses import dataclass, field
  from enum import Enum
- from functools import lru_cache, reduce
  from pathlib import Path
- from typing import TYPE_CHECKING, Literal
+ from typing import TYPE_CHECKING

  import kubernetes
  import kubernetes.stream.ws_client
@@ -43,7 +41,7 @@ from .__utils__ import (
  sensitive_env_var,
  validate_rfc1123_domain_name,
  )
- from .k8s.deviceplugin import cdi_kind_to_kdp_resource, is_kubelet_socket_accessible
+ from .k8s.deviceplugin import get_resource_injection_policy

  if TYPE_CHECKING:
  from collections.abc import Callable, Generator
@@ -88,17 +86,6 @@ class KubernetesWorkloadPlan(WorkloadPlan):
  Domain suffix for the cluster. Default is "cluster.local".
  service_type (KubernetesWorkloadServiceTypeEnum):
  Service type for the workload. Default is CLUSTER_IP.
- resource_key_runtime_env_mapping: (dict[str, str]):
- Mapping from resource names to environment variable names for device allocation,
- which is used to tell the Container Runtime which GPUs to mount into the container.
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
- With privileged mode, the container can access all GPUs even if specified.
- resource_key_backend_env_mapping: (dict[str, list[str]]):
- Mapping from resource names to environment variable names for device runtime,
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
  namespace (str | None):
  Namespace of the workload.
  name (str):
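
Note: both deployers drop these per-plan mapping fields in this release; the hunks further down read the resource-key mapping from envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES instead. A minimal sketch of the mapping shape, inferred only from how it is indexed in this diff; the example pairing comes from the removed docstring, and nothing below is package code:

    # Hypothetical sketch: the resource-key -> runtime-env mapping that replaces the
    # removed per-plan fields, modeled as a plain dict because the diff indexes it as
    # envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[r_k].
    resource_key_map: dict[str, str] = {
        "nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES",  # pairing taken from the removed docstring
    }

    requested = {"nvidia.com/gpu": "0,1"}
    for r_k, r_v in requested.items():
        runtime_env = resource_key_map.get(r_k)
        if runtime_env:
            print(f"{runtime_env}={r_v}")  # -> NVIDIA_VISIBLE_DEVICES=0,1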
@@ -381,22 +368,6 @@ class KubernetesDeployer(EndoscopicDeployer):

  return wrapper

- @staticmethod
- @lru_cache
- def _get_resource_injection_policy() -> Literal["env", "kdp"]:
- """
- Get the resource injection policy (in lowercase) for the deployer.
-
- Returns:
- The resource injection policy.
-
- """
- policy = envs.GPUSTACK_RUNTIME_KUBERNETES_RESOURCE_INJECTION_POLICY.lower()
- if policy != "auto":
- return policy
-
- return "kdp" if is_kubelet_socket_accessible() else "env"
-
  _create_ephemeral_configmaps(
  self,
  workload: KubernetesWorkloadPlan,
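
Note: the static method removed above is replaced by get_resource_injection_policy imported from .k8s.deviceplugin (see the import change earlier in this diff). A standalone sketch of that helper, assuming it keeps the removed method's logic; the socket path, the "auto" default, and the local stand-in for is_kubelet_socket_accessible are illustrative, not taken from the package:

    import os
    from functools import lru_cache
    from typing import Literal

    def _kubelet_socket_accessible(
        path: str = "/var/lib/kubelet/device-plugins/kubelet.sock",  # assumed kubelet device-plugin socket
    ) -> bool:
        # Stand-in for the package's is_kubelet_socket_accessible() check.
        return os.path.exists(path)

    @lru_cache
    def get_resource_injection_policy() -> Literal["env", "kdp"]:
        # Mirrors the removed static method: an explicit setting wins, "auto" probes the kubelet socket.
        policy = os.getenv(
            "GPUSTACK_RUNTIME_KUBERNETES_RESOURCE_INJECTION_POLICY", "auto"
        ).lower()
        if policy != "auto":
            return policy
        return "kdp" if _kubelet_socket_accessible() else "env"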
@@ -1008,114 +979,104 @@ class KubernetesDeployer(EndoscopicDeployer):

  # Parameterize resources
  if c.resources:
- kdp = self._get_resource_injection_policy() == "kdp"
+ kdp = get_resource_injection_policy() == "kdp"
+ fmt = "kdp" if kdp else "plain"

  resources: dict[str, str] = {}
- r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
- r_k_backend_env = workload.resource_key_backend_env_mapping or {}
- _, vd_env, vd_cdis, vd_values = self.get_visible_devices_materials()
  for r_k, r_v in c.resources.items():
  if r_k in ("cpu", "memory"):
  resources[r_k] = str(r_v)
+ continue
+
+ if (
+ r_k
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+ ):
+ # Set env if resource key is mapped.
+ runtime_envs = [
+ envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+ r_k
+ ],
+ ]
+ elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+ # Set env if auto-mapping key is matched.
+ runtime_envs = self.get_runtime_envs()
  else:
- if r_k in r_k_runtime_env:
- # Set env if resource key is mapped.
- runtime_env = [r_k_runtime_env[r_k]]
- elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
- # Set env if auto-mapping key is matched.
- runtime_env = list(vd_env.keys())
- else:
- resources[r_k] = str(r_v)
- continue
-
- if r_k in r_k_backend_env:
- # Set env if resource key is mapped.
- backend_env = r_k_backend_env[r_k]
- else:
- # Otherwise, use the default backend env names.
- backend_env = reduce(operator.add, list(vd_env.values()))
-
- privileged = (
+ resources[r_k] = str(r_v)
+ continue
+
+ privileged = (
+ container.security_context
+ and container.security_context.privileged
+ )
+ resource_values = [x.strip() for x in r_v.split(",")]
+
+ # Request devices.
+ if r_v == "all":
+ # Configure privileged.
+ container.security_context = (
  container.security_context
- and container.security_context.privileged
+ or kubernetes.client.V1SecurityContext()
  )
-
- # Configure device access environment variable.
- if r_v == "all" and backend_env:
- # Configure privileged if requested all devices.
- container.security_context = (
- container.security_context
- or kubernetes.client.V1SecurityContext()
+ container.security_context.privileged = True
+ # Request all devices.
+ for ren in runtime_envs:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
+ # Request device via KDP.
+ if kdp:
+ resources.update(
+ dict.fromkeys(r_vs, "1"),
+ )
+ continue
+ # Request device via visible devices env.
+ container.env.append(
+ kubernetes.client.V1EnvVar(
+ name=ren,
+ value=",".join(r_vs),
+ ),
  )
- container.security_context.privileged = True
- # Then, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via KDP.
- if kdp:
- for v in vd_values.get(re) or []:
- kdp_resource = cdi_kind_to_kdp_resource(
- cdi_kind=vd_cdis[re],
- device_index=v,
- )
- resources[kdp_resource] = "1"
- continue
- # Request device via visible devices env.
- rv = ",".join(vd_values.get(re) or ["all"])
- container.env.append(
- kubernetes.client.V1EnvVar(
- name=re,
- value=rv,
- ),
+ else:
+ # Request specific devices.
+ for ren in runtime_envs:
+ # Request all devices if privileged,
+ # otherwise, normalize requested devices.
+ if privileged:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
+ else:
+ r_vs = self.map_runtime_visible_devices(
+ ren,
+ resource_values,
+ fmt,
  )
- else:
- # Set env to the allocated device IDs if no privileged,
- # otherwise, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via KDP.
- if kdp:
- if not privileged:
- for v in str(r_v).split(","):
- kdp_resource = cdi_kind_to_kdp_resource(
- cdi_kind=vd_cdis[re],
- device_index=int(v.strip()),
- )
- resources[kdp_resource] = "1"
- else:
- for v in vd_values.get(re) or []:
- kdp_resource = cdi_kind_to_kdp_resource(
- cdi_kind=vd_cdis[re],
- device_index=v,
- )
- resources[kdp_resource] = "1"
- continue
- # Request device via visible devices env.
- if not privileged:
- rv = str(r_v)
- else:
- rv = ",".join(vd_values.get(re) or ["all"])
- container.env.append(
- kubernetes.client.V1EnvVar(
- name=re,
- value=rv,
- ),
+ # Request device via KDP.
+ if kdp:
+ resources.update(
+ dict.fromkeys(r_vs, "1"),
  )
+ continue
+ # Request device via visible devices env.
+ container.env.append(
+ kubernetes.client.V1EnvVar(
+ name=ren,
+ value=",".join(r_vs),
+ ),
+ )

- # Configure runtime device access environment variables.
- if r_v != "all" and privileged:
- for be in backend_env:
- container.env.append(
- kubernetes.client.V1EnvVar(
- name=be,
- value=self.align_backend_visible_devices_env_values(
- be,
- str(r_v),
- ),
- ),
+ # Configure runtime device access environment variables.
+ if r_v != "all" and privileged:
+ b_vs = self.map_backend_visible_devices(
+ runtime_envs,
+ resource_values,
+ )
+ container.env.extend(
+ [
+ kubernetes.client.V1EnvVar(
+ name=be,
+ value=be_v,
  )
+ for be, be_v in b_vs.items()
+ ],
+ )

  container.resources = kubernetes.client.V1ResourceRequirements(
  limits=(resources if resources else None),
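
Note: under the rewritten branch a device request ends up in one of two places on the container spec, depending on the injection policy. A rough illustration only; the env name, resource names, and device IDs below are made up, and the real per-device resource naming comes from the device plugin, which this diff does not show:

    import kubernetes

    # "env" policy: the runtime visible-devices variable carries the allocation.
    env_container = kubernetes.client.V1Container(
        name="worker",
        env=[kubernetes.client.V1EnvVar(name="NVIDIA_VISIBLE_DEVICES", value="0,1")],
    )

    # "kdp" policy: each allocated device becomes its own extended-resource limit set to "1",
    # matching resources.update(dict.fromkeys(r_vs, "1")) above.
    kdp_container = kubernetes.client.V1Container(
        name="worker",
        resources=kubernetes.client.V1ResourceRequirements(
            limits={"example.com/device-0": "1", "example.com/device-1": "1"},
        ),
    )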
@@ -1245,6 +1206,10 @@ class KubernetesDeployer(EndoscopicDeployer):
  self._client = self._get_client()
  self._node_name = envs.GPUSTACK_RUNTIME_KUBERNETES_NODE_NAME

+ @property
+ def allowed_uuid_values(self) -> bool:
+ return get_resource_injection_policy() != "kdp"
+
  def _prepare_mirrored_deployment(self):
  """
  Prepare for mirrored deployment.
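
Note: the new allowed_uuid_values property gates UUID-style device values on the injection policy, presumably because the KDP path requests devices as kubelet resources rather than passing identifiers through an env var. A trivial sketch of the relationship, not package code:

    def allowed_uuid_values(policy: str) -> bool:
        # True for "env" (and any non-"kdp" policy), False for "kdp".
        return policy != "kdp"

    print(allowed_uuid_values("env"), allowed_uuid_values("kdp"))  # True False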
gpustack_runtime/deployer/podman.py

@@ -4,13 +4,11 @@ import contextlib
  import io
  import json
  import logging
- import operator
  import os
  import socket
  import sys
  import tarfile
  from dataclasses import dataclass, field
- from functools import reduce
  from math import ceil
  from pathlib import Path
  from typing import TYPE_CHECKING, Any
@@ -84,17 +82,6 @@ class PodmanWorkloadPlan(WorkloadPlan):
  Image used for the pause container.
  unhealthy_restart_image (str):
  Image used for unhealthy restart container.
- resource_key_runtime_env_mapping: (dict[str, str]):
- Mapping from resource names to environment variable names for device allocation,
- which is used to tell the Container Runtime which GPUs to mount into the container.
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
- With privileged mode, the container can access all GPUs even if specified.
- resource_key_backend_env_mapping: (dict[str, list[str]]):
- Mapping from resource names to environment variable names for device runtime,
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
  namespace (str | None):
  Namespace of the workload.
  name (str):
@@ -952,120 +939,100 @@ class PodmanDeployer(EndoscopicDeployer):

  # Parameterize resources.
  if c.resources:
- r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
- r_k_backend_env = workload.resource_key_backend_env_mapping or {}
- vd_manus, vd_env, vd_cdis, vd_values = (
- self.get_visible_devices_materials()
- )
+ fmt = "cdi"
+
  for r_k, r_v in c.resources.items():
- match r_k:
- case "cpu":
- if isinstance(r_v, int | float):
- create_options["cpu_shares"] = ceil(r_v * 1024)
- elif isinstance(r_v, str) and r_v.isdigit():
- create_options["cpu_shares"] = ceil(float(r_v) * 1024)
- case "memory":
- if isinstance(r_v, int):
- create_options["mem_limit"] = r_v
- create_options["mem_reservation"] = r_v
- create_options["memswap_limit"] = r_v
- elif isinstance(r_v, str):
- v = r_v.lower().removesuffix("i")
- create_options["mem_limit"] = v
- create_options["mem_reservation"] = v
- create_options["memswap_limit"] = v
- case _:
- if r_k in r_k_runtime_env:
- # Set env if resource key is mapped.
- runtime_env = [r_k_runtime_env[r_k]]
- elif (
- r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
- ):
- # Set env if auto-mapping key is matched.
- runtime_env = list(vd_env.keys())
- else:
- continue
+ if r_k == "cpu":
+ if isinstance(r_v, int | float):
+ create_options["cpu_shares"] = ceil(r_v * 1024)
+ elif isinstance(r_v, str) and r_v.isdigit():
+ create_options["cpu_shares"] = ceil(float(r_v) * 1024)
+ continue
+ if r_k == "memory":
+ if isinstance(r_v, int):
+ create_options["mem_limit"] = r_v
+ create_options["mem_reservation"] = r_v
+ create_options["memswap_limit"] = r_v
+ elif isinstance(r_v, str):
+ v = r_v.lower().removesuffix("i")
+ create_options["mem_limit"] = v
+ create_options["mem_reservation"] = v
+ create_options["memswap_limit"] = v
+ continue

- if r_k in r_k_backend_env:
- # Set env if resource key is mapped.
- backend_env = r_k_backend_env[r_k]
- else:
- # Otherwise, use the default backend env names.
- backend_env = reduce(
- operator.add,
- list(vd_env.values()),
- )
+ if (
+ r_k
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+ ):
+ # Set env if resource key is mapped.
+ runtime_envs = [
+ envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+ r_k
+ ],
+ ]
+ elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+ # Set env if auto-mapping key is matched.
+ runtime_envs = self.get_runtime_envs()
+ else:
+ continue

- privileged = create_options.get("privileged", False)
-
- # Generate CDI config if not yet.
- if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
- for re in runtime_env:
- cdi_dump_config(
- manufacturer=vd_manus[re],
- output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
- )
-
- # Configure device access environment variable.
- if r_v == "all" and backend_env:
- # Configure privileged if requested all devices.
- create_options["privileged"] = True
- # Then, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via CDI.
- rv = [
- f"{vd_cdis[re]}={v}"
- for v in (vd_values.get(re) or ["all"])
- ]
- if "devices" not in create_options:
- create_options["devices"] = []
- create_options["devices"].extend(rv)
+ privileged = create_options.get("privileged", False)
+ resource_values = [x.strip() for x in r_v.split(",")]
+
+ # Generate CDI config if not yet.
+ if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
+ for ren in runtime_envs:
+ r_m = self.get_manufacturer(ren)
+ cdi_dump_config(
+ manufacturer=r_m,
+ output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+ )
+
+ # Configure device access environment variable.
+ if r_v == "all":
+ # Configure privileged.
+ create_options["privileged"] = True
+ # Request all devices.
+ for ren in runtime_envs:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
+ # Request device via CDI.
+ if "devices" not in create_options:
+ create_options["devices"] = []
+ create_options["devices"].extend(r_vs)
+ else:
+ # Request specific devices.
+ for ren in runtime_envs:
+ # Request all devices if privileged,
+ # otherwise, normalize requested devices.
+ if privileged:
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
  else:
- # Set env to the allocated device IDs if no privileged,
- # otherwise, set container backend visible devices env to all devices,
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
- # and mount corresponding libs if needed.
- for re in runtime_env:
- # Request device via CDI.
- if not privileged:
- rv = [
- f"{vd_cdis[re]}={v.strip()}"
- for v in r_v.split(",")
- ]
- else:
- rv = [
- f"{vd_cdis[re]}={v}"
- for v in (vd_values.get(re) or ["all"])
- ]
- if "devices" not in create_options:
- create_options["devices"] = []
- create_options["devices"].extend(rv)
-
- # Configure runtime device access environment variables.
- if r_v != "all" and privileged:
- for be in backend_env:
- create_options["environment"][be] = (
- self.align_backend_visible_devices_env_values(
- be,
- str(r_v),
- )
- )
-
- # Configure affinity if applicable.
- if (
- envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
- or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
- ):
- cpus, numas = self.get_visible_devices_affinities(
- runtime_env,
- r_v,
+ r_vs = self.map_runtime_visible_devices(
+ ren,
+ resource_values,
+ fmt,
  )
- if cpus:
- create_options["cpuset_cpus"] = cpus
- if numas and envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
- create_options["cpuset_mems"] = numas
+ # Request device via CDI.
+ if "devices" not in create_options:
+ create_options["devices"] = []
+ create_options["devices"].extend(r_vs)
+
+ # If not requesting all devices but privileged,
+ # must configure visible devices.
+ if r_v != "all" and privileged:
+ b_vs = self.map_backend_visible_devices(
+ runtime_envs,
+ resource_values,
+ )
+ create_options["environment"].update(b_vs)
+
+ # Configure affinity if applicable.
+ create_options.update(
+ self.map_visible_devices_affinities(
+ runtime_envs,
+ resource_values,
+ ),
+ )

  # Parameterize mounts.
  self._append_container_mounts(
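
Note: with fmt = "cdi" the Podman path now asks the deployer helpers for fully qualified CDI device references and appends them to create_options["devices"], where the old code built them by hand as f"{cdi_kind}={index}". An illustration of that reference form only; the kind and indices below are made up:

    create_options: dict = {"environment": {}}
    # CDI device references take the form "<kind>=<device>", i.e. a CDI kind plus a device index or name.
    cdi_refs = [f"nvidia.com/gpu={i}" for i in (0, 1)]
    create_options.setdefault("devices", []).extend(cdi_refs)
    print(create_options["devices"])  # ['nvidia.com/gpu=0', 'nvidia.com/gpu=1']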
gpustack_runtime/detector/__init__.py

@@ -7,6 +7,7 @@ from ..logging import debug_log_exception
  from .__types__ import (
  Detector,
  Device,
+ DeviceMemoryStatusEnum,
  Devices,
  ManufacturerEnum,
  Topology,
@@ -292,6 +293,7 @@ def filter_devices_by_manufacturer(

  __all__ = [
  "Device",
+ "DeviceMemoryStatusEnum",
  "Devices",
  "ManufacturerEnum",
  "Topology",
gpustack_runtime/detector/__types__.py

@@ -122,6 +122,28 @@ def backend_to_manufacturer(backend: str) -> ManufacturerEnum:
  return ManufacturerEnum.UNKNOWN


+ class DeviceMemoryStatusEnum(str, Enum):
+ """
+ Enum for Device Memory Status.
+ """
+
+ HEALTHY = "healthy"
+ """
+ Device is healthy.
+ """
+ UNHEALTHY = "unhealthy"
+ """
+ Device is unhealthy.
+ """
+ UNKNOWN = "unknown"
+ """
+ Device status is unknown.
+ """
+
+ def __str__(self):
+ return self.value
+
+
  @dataclass_json
  @dataclass
  class Device:
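
Note: the new enum is also re-exported from gpustack_runtime.detector (see the __init__ and __all__ hunks above), so callers can check the new memory_status field directly. A minimal usage sketch:

    from gpustack_runtime.detector import DeviceMemoryStatusEnum

    status = DeviceMemoryStatusEnum.UNHEALTHY
    if status is DeviceMemoryStatusEnum.UNHEALTHY:
        print("memory status:", str(status))  # __str__ returns the value: "unhealthy"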
@@ -185,6 +207,10 @@ class Device:
  """
  Memory utilization of the device in percentage.
  """
+ memory_status: DeviceMemoryStatusEnum = DeviceMemoryStatusEnum.UNKNOWN
+ """
+ Status of the device.
+ """
  temperature: int | float | None = None
  """
  Temperature of the device in Celsius.
gpustack_runtime/detector/amd.py

@@ -8,7 +8,14 @@ from pathlib import Path
  from .. import envs
  from ..logging import debug_log_exception, debug_log_warning
  from . import Topology, pyamdgpu, pyamdsmi, pyhsa, pyrocmcore, pyrocmsmi
- from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
+ from .__types__ import (
+ Detector,
+ Device,
+ DeviceMemoryStatusEnum,
+ Devices,
+ ManufacturerEnum,
+ TopologyDistanceEnum,
+ )
  from .__utils__ import (
  PCIDevice,
  byte_to_mebibyte,
@@ -165,20 +172,32 @@ class AMDDetector(Detector):
  )
  dev_cores_util = 0

- dev_mem = None
- dev_mem_used = None
+ dev_mem = 0
+ dev_mem_used = 0
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
  try:
  dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
  dev_mem = dev_gpu_vram_usage.get("vram_total")
  dev_mem_used = dev_gpu_vram_usage.get("vram_used")
+ dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count(
+ dev,
+ pyamdsmi.AmdSmiGpuBlock.UMC,
+ )
+ if dev_ecc_count.get("uncorrectable_count", 0) > 0:
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
  except pyamdsmi.AmdSmiException:
+ dev_mem = byte_to_mebibyte( # byte to MiB
+ pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
+ )
+ dev_mem_used = byte_to_mebibyte( # byte to MiB
+ pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
+ )
  with contextlib.suppress(pyrocmsmi.ROCMSMIError):
- dev_mem = byte_to_mebibyte( # byte to MiB
- pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
- )
- dev_mem_used = byte_to_mebibyte( # byte to MiB
- pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
+ dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
+ dev_idx,
  )
+ if dev_ecc_count.uncorrectable_err > 0:
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY

  dev_power = None
  dev_power_used = None
@@ -232,6 +251,7 @@ class AMDDetector(Detector):
  memory=dev_mem,
  memory_used=dev_mem_used,
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
+ memory_status=dev_mem_status,
  temperature=dev_temp,
  power=dev_power,
  power_used=dev_power_used,
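
Note: the AMD detector now derives memory_status from uncorrectable ECC counters, via amdsmi_get_gpu_ecc_count on the UMC block when amdsmi is available and via rsmi_dev_ecc_count_get as the rocm-smi fallback. A standalone sketch of the same decision rule with the SMI calls stubbed out, since only the uncorrectable counters are consulted in this diff:

    from gpustack_runtime.detector import DeviceMemoryStatusEnum

    def memory_status_from_ecc(uncorrectable_count: int) -> DeviceMemoryStatusEnum:
        # Any uncorrectable ECC error reported for the memory block marks the device memory unhealthy.
        if uncorrectable_count > 0:
            return DeviceMemoryStatusEnum.UNHEALTHY
        return DeviceMemoryStatusEnum.HEALTHY

    print(memory_status_from_ecc(0), memory_status_from_ecc(3))  # healthy unhealthy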