gpustack-runtime 0.1.41.post2__py3-none-any.whl → 0.1.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +3 -1
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/docker.py +109 -148
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +21 -3
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
- gpustack_runtime/deployer/kuberentes.py +91 -126
- gpustack_runtime/deployer/podman.py +89 -122
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/amd.py +28 -8
- gpustack_runtime/detector/ascend.py +49 -4
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +16 -1
- gpustack_runtime/detector/iluvatar.py +6 -0
- gpustack_runtime/detector/metax.py +8 -0
- gpustack_runtime/detector/mthreads.py +11 -0
- gpustack_runtime/detector/nvidia.py +139 -134
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +135 -127
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/deployer/kuberentes.py
CHANGED

@@ -3,13 +3,11 @@ from __future__ import annotations as __future_annotations__
 import contextlib
 import json
 import logging
-import operator
 import os
 from dataclasses import dataclass, field
 from enum import Enum
-from functools import lru_cache, reduce
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING

 import kubernetes
 import kubernetes.stream.ws_client

@@ -43,7 +41,7 @@ from .__utils__ import (
     sensitive_env_var,
     validate_rfc1123_domain_name,
 )
-from .k8s.deviceplugin import …
+from .k8s.deviceplugin import get_resource_injection_policy

 if TYPE_CHECKING:
     from collections.abc import Callable, Generator

@@ -88,17 +86,6 @@ class KubernetesWorkloadPlan(WorkloadPlan):
             Domain suffix for the cluster. Default is "cluster.local".
         service_type (KubernetesWorkloadServiceTypeEnum):
             Service type for the workload. Default is CLUSTER_IP.
-        resource_key_runtime_env_mapping: (dict[str, str]):
-            Mapping from resource names to environment variable names for device allocation,
-            which is used to tell the Container Runtime which GPUs to mount into the container.
-            For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
-            which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
-            With privileged mode, the container can access all GPUs even if specified.
-        resource_key_backend_env_mapping: (dict[str, list[str]]):
-            Mapping from resource names to environment variable names for device runtime,
-            which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
-            For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
-            which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
         namespace (str | None):
             Namespace of the workload.
         name (str):

@@ -381,22 +368,6 @@ class KubernetesDeployer(EndoscopicDeployer):

         return wrapper

-    @staticmethod
-    @lru_cache
-    def _get_resource_injection_policy() -> Literal["env", "kdp"]:
-        """
-        Get the resource injection policy (in lowercase) for the deployer.
-
-        Returns:
-            The resource injection policy.
-
-        """
-        policy = envs.GPUSTACK_RUNTIME_KUBERNETES_RESOURCE_INJECTION_POLICY.lower()
-        if policy != "auto":
-            return policy
-
-        return "kdp" if is_kubelet_socket_accessible() else "env"
-
     def _create_ephemeral_configmaps(
         self,
         workload: KubernetesWorkloadPlan,
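The deployer-local `_get_resource_injection_policy` helper removed above is replaced by `get_resource_injection_policy` imported from `.k8s.deviceplugin`. A minimal sketch of the resolution logic visible in the removed code, using a plain `os.getenv` lookup and a hypothetical `kubelet_socket_accessible` flag in place of the package's `envs` module and `is_kubelet_socket_accessible()` probe:

```python
import os


def resolve_injection_policy(kubelet_socket_accessible: bool) -> str:
    """Return "env" or "kdp", mirroring the removed method's behaviour."""
    # The package reads this through its envs module; "auto" is assumed as the default here.
    policy = os.getenv(
        "GPUSTACK_RUNTIME_KUBERNETES_RESOURCE_INJECTION_POLICY", "auto"
    ).lower()
    if policy != "auto":
        return policy  # an explicit "env" or "kdp" setting wins
    # In "auto" mode, prefer the Kubelet Device Plugin (KDP) path when the socket is reachable.
    return "kdp" if kubelet_socket_accessible else "env"


print(resolve_injection_policy(kubelet_socket_accessible=False))  # env
```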
@@ -1008,114 +979,104 @@ class KubernetesDeployer(EndoscopicDeployer):

             # Parameterize resources
             if c.resources:
-                kdp = …
+                kdp = get_resource_injection_policy() == "kdp"
+                fmt = "kdp" if kdp else "plain"

                 resources: dict[str, str] = {}
-                r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
-                r_k_backend_env = workload.resource_key_backend_env_mapping or {}
-                _, vd_env, vd_cdis, vd_values = self.get_visible_devices_materials()
                 for r_k, r_v in c.resources.items():
                     if r_k in ("cpu", "memory"):
                         resources[r_k] = str(r_v)
+                        continue
+
+                    if (
+                        r_k
+                        in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+                    ):
+                        # Set env if resource key is mapped.
+                        runtime_envs = [
+                            envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+                                r_k
+                            ],
+                        ]
+                    elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+                        # Set env if auto-mapping key is matched.
+                        runtime_envs = self.get_runtime_envs()
                     else:
- …
-                        else:
-                            # Otherwise, use the default backend env names.
-                            backend_env = reduce(operator.add, list(vd_env.values()))
-
-                    privileged = (
+                        resources[r_k] = str(r_v)
+                        continue
+
+                    privileged = (
+                        container.security_context
+                        and container.security_context.privileged
+                    )
+                    resource_values = [x.strip() for x in r_v.split(",")]
+
+                    # Request devices.
+                    if r_v == "all":
+                        # Configure privileged.
+                        container.security_context = (
                             container.security_context
- …
+                            or kubernetes.client.V1SecurityContext()
                         )
- …
+                        container.security_context.privileged = True
+                        # Request all devices.
+                        for ren in runtime_envs:
+                            r_vs = self.get_runtime_visible_devices(ren, fmt)
+                            # Request device via KDP.
+                            if kdp:
+                                resources.update(
+                                    dict.fromkeys(r_vs, "1"),
+                                )
+                                continue
+                            # Request device via visible devices env.
+                            container.env.append(
+                                kubernetes.client.V1EnvVar(
+                                    name=ren,
+                                    value=",".join(r_vs),
+                                ),
                             )
- …
-                            resources[kdp_resource] = "1"
-                            continue
-                        # Request device via visible devices env.
-                        rv = ",".join(vd_values.get(re) or ["all"])
-                        container.env.append(
-                            kubernetes.client.V1EnvVar(
-                                name=re,
-                                value=rv,
-                            ),
+                    else:
+                        # Request specific devices.
+                        for ren in runtime_envs:
+                            # Request all devices if privileged,
+                            # otherwise, normalize requested devices.
+                            if privileged:
+                                r_vs = self.get_runtime_visible_devices(ren, fmt)
+                            else:
+                                r_vs = self.map_runtime_visible_devices(
+                                    ren,
+                                    resource_values,
+                                    fmt,
                                 )
- …
-                        # and mount corresponding libs if needed.
-                        for re in runtime_env:
-                            # Request device via KDP.
-                            if kdp:
-                                if not privileged:
-                                    for v in str(r_v).split(","):
-                                        kdp_resource = cdi_kind_to_kdp_resource(
-                                            cdi_kind=vd_cdis[re],
-                                            device_index=int(v.strip()),
-                                        )
-                                        resources[kdp_resource] = "1"
-                                else:
-                                    for v in vd_values.get(re) or []:
-                                        kdp_resource = cdi_kind_to_kdp_resource(
-                                            cdi_kind=vd_cdis[re],
-                                            device_index=v,
-                                        )
-                                        resources[kdp_resource] = "1"
-                                continue
-                            # Request device via visible devices env.
-                            if not privileged:
-                                rv = str(r_v)
-                            else:
-                                rv = ",".join(vd_values.get(re) or ["all"])
-                            container.env.append(
-                                kubernetes.client.V1EnvVar(
-                                    name=re,
-                                    value=rv,
-                                ),
+                            # Request device via KDP.
+                            if kdp:
+                                resources.update(
+                                    dict.fromkeys(r_vs, "1"),
                                 )
+                                continue
+                            # Request device via visible devices env.
+                            container.env.append(
+                                kubernetes.client.V1EnvVar(
+                                    name=ren,
+                                    value=",".join(r_vs),
+                                ),
+                            )

- …
+                    # Configure runtime device access environment variables.
+                    if r_v != "all" and privileged:
+                        b_vs = self.map_backend_visible_devices(
+                            runtime_envs,
+                            resource_values,
+                        )
+                        container.env.extend(
+                            [
+                                kubernetes.client.V1EnvVar(
+                                    name=be,
+                                    value=be_v,
+                                )
+                                for be, be_v in b_vs.items()
+                            ],
+                        )

                 container.resources = kubernetes.client.V1ResourceRequirements(
                     limits=(resources if resources else None),

@@ -1245,6 +1206,10 @@ class KubernetesDeployer(EndoscopicDeployer):
         self._client = self._get_client()
         self._node_name = envs.GPUSTACK_RUNTIME_KUBERNETES_NODE_NAME

+    @property
+    def allowed_uuid_values(self) -> bool:
+        return get_resource_injection_policy() != "kdp"
+
     def _prepare_mirrored_deployment(self):
         """
         Prepare for mirrored deployment.
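The rewritten resource loop branches on the injection policy: under "kdp" each visible device becomes a device-plugin resource limit with quantity "1" (the `dict.fromkeys(r_vs, "1")` calls), while under "env" the device IDs are joined into a single visible-devices environment variable. A sketch of the two outcomes with hypothetical resource names and device IDs; the real values come from the deployer's `get_runtime_visible_devices()` / `map_runtime_visible_devices()` helpers, and the KDP resource names below are illustrative only:

```python
import kubernetes

# Hypothetical outputs of the deployer's device-mapping helpers.
kdp_resource_names = ["nvidia.com/gpu-0", "nvidia.com/gpu-1"]  # "kdp" format (illustrative)
plain_device_ids = ["0", "1"]                                   # "plain" format (illustrative)

# "kdp" policy: one resource limit per device, quantity "1".
limits: dict[str, str] = {}
limits.update(dict.fromkeys(kdp_resource_names, "1"))
assert limits == {"nvidia.com/gpu-0": "1", "nvidia.com/gpu-1": "1"}

# "env" policy: a single visible-devices environment variable on the container.
env_var = kubernetes.client.V1EnvVar(
    name="NVIDIA_VISIBLE_DEVICES",
    value=",".join(plain_device_ids),
)
print(env_var.name, env_var.value)  # NVIDIA_VISIBLE_DEVICES 0,1
```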
gpustack_runtime/deployer/podman.py
CHANGED

@@ -4,13 +4,11 @@ import contextlib
 import io
 import json
 import logging
-import operator
 import os
 import socket
 import sys
 import tarfile
 from dataclasses import dataclass, field
-from functools import reduce
 from math import ceil
 from pathlib import Path
 from typing import TYPE_CHECKING, Any

@@ -84,17 +82,6 @@ class PodmanWorkloadPlan(WorkloadPlan):
             Image used for the pause container.
         unhealthy_restart_image (str):
             Image used for unhealthy restart container.
-        resource_key_runtime_env_mapping: (dict[str, str]):
-            Mapping from resource names to environment variable names for device allocation,
-            which is used to tell the Container Runtime which GPUs to mount into the container.
-            For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
-            which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
-            With privileged mode, the container can access all GPUs even if specified.
-        resource_key_backend_env_mapping: (dict[str, list[str]]):
-            Mapping from resource names to environment variable names for device runtime,
-            which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
-            For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
-            which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
         namespace (str | None):
             Namespace of the workload.
         name (str):

@@ -952,120 +939,100 @@ class PodmanDeployer(EndoscopicDeployer):

             # Parameterize resources.
             if c.resources:
- …
-                vd_manus, vd_env, vd_cdis, vd_values = (
-                    self.get_visible_devices_materials()
-                )
+                fmt = "cdi"
+
                 for r_k, r_v in c.resources.items():
- …
-                    if r_k in r_k_runtime_env:
-                        # Set env if resource key is mapped.
-                        runtime_env = [r_k_runtime_env[r_k]]
-                    elif (
-                        r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
-                    ):
-                        # Set env if auto-mapping key is matched.
-                        runtime_env = list(vd_env.keys())
-                    else:
-                        continue
+                    if r_k == "cpu":
+                        if isinstance(r_v, int | float):
+                            create_options["cpu_shares"] = ceil(r_v * 1024)
+                        elif isinstance(r_v, str) and r_v.isdigit():
+                            create_options["cpu_shares"] = ceil(float(r_v) * 1024)
+                        continue
+                    if r_k == "memory":
+                        if isinstance(r_v, int):
+                            create_options["mem_limit"] = r_v
+                            create_options["mem_reservation"] = r_v
+                            create_options["memswap_limit"] = r_v
+                        elif isinstance(r_v, str):
+                            v = r_v.lower().removesuffix("i")
+                            create_options["mem_limit"] = v
+                            create_options["mem_reservation"] = v
+                            create_options["memswap_limit"] = v
+                        continue

- …
+                    if (
+                        r_k
+                        in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+                    ):
+                        # Set env if resource key is mapped.
+                        runtime_envs = [
+                            envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+                                r_k
+                            ],
+                        ]
+                    elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+                        # Set env if auto-mapping key is matched.
+                        runtime_envs = self.get_runtime_envs()
+                    else:
+                        continue

- …
+                    privileged = create_options.get("privileged", False)
+                    resource_values = [x.strip() for x in r_v.split(",")]
+
+                    # Generate CDI config if not yet.
+                    if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
+                        for ren in runtime_envs:
+                            r_m = self.get_manufacturer(ren)
+                            cdi_dump_config(
+                                manufacturer=r_m,
+                                output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+                            )
+
+                    # Configure device access environment variable.
+                    if r_v == "all":
+                        # Configure privileged.
+                        create_options["privileged"] = True
+                        # Request all devices.
+                        for ren in runtime_envs:
+                            r_vs = self.get_runtime_visible_devices(ren, fmt)
+                            # Request device via CDI.
+                            if "devices" not in create_options:
+                                create_options["devices"] = []
+                            create_options["devices"].extend(r_vs)
+                    else:
+                        # Request specific devices.
+                        for ren in runtime_envs:
+                            # Request all devices if privileged,
+                            # otherwise, normalize requested devices.
+                            if privileged:
+                                r_vs = self.get_runtime_visible_devices(ren, fmt)
                             else:
- …
-                    for re in runtime_env:
-                        # Request device via CDI.
-                        if not privileged:
-                            rv = [
-                                f"{vd_cdis[re]}={v.strip()}"
-                                for v in r_v.split(",")
-                            ]
-                        else:
-                            rv = [
-                                f"{vd_cdis[re]}={v}"
-                                for v in (vd_values.get(re) or ["all"])
-                            ]
-                        if "devices" not in create_options:
-                            create_options["devices"] = []
-                        create_options["devices"].extend(rv)
-
-                    # Configure runtime device access environment variables.
-                    if r_v != "all" and privileged:
-                        for be in backend_env:
-                            create_options["environment"][be] = (
-                                self.align_backend_visible_devices_env_values(
-                                    be,
-                                    str(r_v),
-                                )
-                            )
-
-                    # Configure affinity if applicable.
-                    if (
-                        envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
-                        or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
-                    ):
-                        cpus, numas = self.get_visible_devices_affinities(
-                            runtime_env,
-                            r_v,
+                                r_vs = self.map_runtime_visible_devices(
+                                    ren,
+                                    resource_values,
+                                    fmt,
                                 )
- …
+                            # Request device via CDI.
+                            if "devices" not in create_options:
+                                create_options["devices"] = []
+                            create_options["devices"].extend(r_vs)
+
+                    # If not requesting all devices but privileged,
+                    # must configure visible devices.
+                    if r_v != "all" and privileged:
+                        b_vs = self.map_backend_visible_devices(
+                            runtime_envs,
+                            resource_values,
+                        )
+                        create_options["environment"].update(b_vs)
+
+                    # Configure affinity if applicable.
+                    create_options.update(
+                        self.map_visible_devices_affinities(
+                            runtime_envs,
+                            resource_values,
+                        ),
+                    )

             # Parameterize mounts.
             self._append_container_mounts(
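Unlike the previous version, the Podman deployer now handles `cpu` and `memory` resource entries itself, translating them into container create options. A standalone restatement of that conversion with a hypothetical resources mapping (the option names match the new code above):

```python
from math import ceil

# Hypothetical container resource mapping, as it might appear in a workload plan.
resources = {"cpu": 2, "memory": "512Mi"}
create_options: dict = {}

for r_k, r_v in resources.items():
    if r_k == "cpu":
        if isinstance(r_v, (int, float)):
            create_options["cpu_shares"] = ceil(r_v * 1024)  # 2 CPUs -> 2048 shares
        elif isinstance(r_v, str) and r_v.isdigit():
            create_options["cpu_shares"] = ceil(float(r_v) * 1024)
    elif r_k == "memory":
        if isinstance(r_v, int):
            create_options["mem_limit"] = r_v  # plain byte count
        elif isinstance(r_v, str):
            v = r_v.lower().removesuffix("i")  # "512Mi" -> "512m"
            create_options["mem_limit"] = v
            create_options["mem_reservation"] = v
            create_options["memswap_limit"] = v

print(create_options)
# {'cpu_shares': 2048, 'mem_limit': '512m', 'mem_reservation': '512m', 'memswap_limit': '512m'}
```

The `removesuffix("i")` step turns Kubernetes-style quantities such as `512Mi` into the lowercase short form (`512m`) that the container engine's Python client accepts.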
gpustack_runtime/detector/__init__.py
CHANGED

@@ -7,6 +7,7 @@ from ..logging import debug_log_exception
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,

@@ -292,6 +293,7 @@ def filter_devices_by_manufacturer(

 __all__ = [
     "Device",
+    "DeviceMemoryStatusEnum",
     "Devices",
     "ManufacturerEnum",
     "Topology",
gpustack_runtime/detector/__types__.py
CHANGED

@@ -122,6 +122,28 @@ def backend_to_manufacturer(backend: str) -> ManufacturerEnum:
     return ManufacturerEnum.UNKNOWN


+class DeviceMemoryStatusEnum(str, Enum):
+    """
+    Enum for Device Memory Status.
+    """
+
+    HEALTHY = "healthy"
+    """
+    Device is healthy.
+    """
+    UNHEALTHY = "unhealthy"
+    """
+    Device is unhealthy.
+    """
+    UNKNOWN = "unknown"
+    """
+    Device status is unknown.
+    """
+
+    def __str__(self):
+        return self.value
+
+
 @dataclass_json
 @dataclass
 class Device:

@@ -185,6 +207,10 @@ class Device:
     """
     Memory utilization of the device in percentage.
     """
+    memory_status: DeviceMemoryStatusEnum = DeviceMemoryStatusEnum.UNKNOWN
+    """
+    Status of the device.
+    """
     temperature: int | float | None = None
     """
     Temperature of the device in Celsius.
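`DeviceMemoryStatusEnum` subclasses `str`, so its members compare equal to their plain string values and serialize cleanly, and it is re-exported from `gpustack_runtime.detector` (see the `__init__.py` hunk above). A short usage sketch, assuming gpustack-runtime 0.1.42 is installed:

```python
from gpustack_runtime.detector import DeviceMemoryStatusEnum

status = DeviceMemoryStatusEnum.UNHEALTHY
print(str(status))            # unhealthy (the overridden __str__ returns the value)
print(status == "unhealthy")  # True, because the enum also subclasses str
print(DeviceMemoryStatusEnum("healthy") is DeviceMemoryStatusEnum.HEALTHY)  # True
```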
gpustack_runtime/detector/amd.py
CHANGED

@@ -8,7 +8,14 @@ from pathlib import Path
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
 from . import Topology, pyamdgpu, pyamdsmi, pyhsa, pyrocmcore, pyrocmsmi
-from .__types__ import …
+from .__types__ import (
+    Detector,
+    Device,
+    DeviceMemoryStatusEnum,
+    Devices,
+    ManufacturerEnum,
+    TopologyDistanceEnum,
+)
 from .__utils__ import (
     PCIDevice,
     byte_to_mebibyte,

@@ -165,20 +172,32 @@ class AMDDetector(Detector):
            )
            dev_cores_util = 0

-            dev_mem = …
-            dev_mem_used = …
+            dev_mem = 0
+            dev_mem_used = 0
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
             try:
                 dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
                 dev_mem = dev_gpu_vram_usage.get("vram_total")
                 dev_mem_used = dev_gpu_vram_usage.get("vram_used")
+                dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count(
+                    dev,
+                    pyamdsmi.AmdSmiGpuBlock.UMC,
+                )
+                if dev_ecc_count.get("uncorrectable_count", 0) > 0:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
             except pyamdsmi.AmdSmiException:
+                dev_mem = byte_to_mebibyte(  # byte to MiB
+                    pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
+                )
+                dev_mem_used = byte_to_mebibyte(  # byte to MiB
+                    pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
+                )
                 with contextlib.suppress(pyrocmsmi.ROCMSMIError):
- …
-                    )
-                    dev_mem_used = byte_to_mebibyte(  # byte to MiB
-                        pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
+                    dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
+                        dev_idx,
                     )
+                    if dev_ecc_count.uncorrectable_err > 0:
+                        dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY

             dev_power = None
             dev_power_used = None

@@ -232,6 +251,7 @@ class AMDDetector(Detector):
                 memory=dev_mem,
                 memory_used=dev_mem_used,
                 memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                memory_status=dev_mem_status,
                 temperature=dev_temp,
                 power=dev_power,
                 power_used=dev_power_used,