gpustack_runtime-0.1.41.post3-py3-none-any.whl → gpustack_runtime-0.1.42.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +4 -2
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__init__.py +1 -1
- gpustack_runtime/deployer/cdi/__types__.py +2 -2
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/cdi/amd.py +6 -8
- gpustack_runtime/deployer/cdi/ascend.py +7 -9
- gpustack_runtime/deployer/cdi/hygon.py +6 -8
- gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
- gpustack_runtime/deployer/cdi/metax.py +6 -8
- gpustack_runtime/deployer/cdi/thead.py +6 -8
- gpustack_runtime/deployer/docker.py +133 -146
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
- gpustack_runtime/deployer/kuberentes.py +89 -108
- gpustack_runtime/deployer/podman.py +113 -120
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/__utils__.py +3 -0
- gpustack_runtime/detector/amd.py +32 -10
- gpustack_runtime/detector/ascend.py +67 -13
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +22 -3
- gpustack_runtime/detector/iluvatar.py +15 -7
- gpustack_runtime/detector/metax.py +16 -6
- gpustack_runtime/detector/mthreads.py +22 -8
- gpustack_runtime/detector/nvidia.py +148 -140
- gpustack_runtime/detector/pyacl/__init__.py +34 -14
- gpustack_runtime/detector/pydcmi/__init__.py +4 -2
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +145 -134
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
- gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
- gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/deployer/kuberentes.py

@@ -3,11 +3,9 @@ from __future__ import annotations as __future_annotations__
 import contextlib
 import json
 import logging
-import operator
 import os
 from dataclasses import dataclass, field
 from enum import Enum
-from functools import reduce
 from pathlib import Path
 from typing import TYPE_CHECKING

@@ -43,7 +41,7 @@ from .__utils__ import (
     sensitive_env_var,
     validate_rfc1123_domain_name,
 )
-from .k8s.deviceplugin import
+from .k8s.deviceplugin import get_resource_injection_policy

 if TYPE_CHECKING:
     from collections.abc import Callable, Generator
@@ -88,17 +86,6 @@ class KubernetesWorkloadPlan(WorkloadPlan):
             Domain suffix for the cluster. Default is "cluster.local".
         service_type (KubernetesWorkloadServiceTypeEnum):
             Service type for the workload. Default is CLUSTER_IP.
-        resource_key_runtime_env_mapping: (dict[str, str]):
-            Mapping from resource names to environment variable names for device allocation,
-            which is used to tell the Container Runtime which GPUs to mount into the container.
-            For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
-            which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
-            With privileged mode, the container can access all GPUs even if specified.
-        resource_key_backend_env_mapping: (dict[str, list[str]]):
-            Mapping from resource names to environment variable names for device runtime,
-            which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
-            For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
-            which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
         namespace (str | None):
             Namespace of the workload.
         name (str):
@@ -993,113 +980,103 @@ class KubernetesDeployer(EndoscopicDeployer):
             # Parameterize resources
             if c.resources:
                 kdp = get_resource_injection_policy() == "kdp"
+                fmt = "kdp" if kdp else "plain"

                 resources: dict[str, str] = {}
-                r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
-                r_k_backend_env = workload.resource_key_backend_env_mapping or {}
-                _, vd_env, vd_cdis, vd_values = self.get_visible_devices_materials()
                 for r_k, r_v in c.resources.items():
                     if r_k in ("cpu", "memory"):
                         resources[r_k] = str(r_v)
+                        continue
+
+                    if (
+                        r_k
+                        in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+                    ):
+                        # Set env if resource key is mapped.
+                        runtime_envs = [
+                            envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+                                r_k
+                            ],
+                        ]
+                    elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+                        # Set env if auto-mapping key is matched.
+                        runtime_envs = self.get_runtime_envs()
                     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        else:
-                            # Otherwise, use the default backend env names.
-                            backend_env = reduce(operator.add, list(vd_env.values()))
-
-                        privileged = (
+                        resources[r_k] = str(r_v)
+                        continue
+
+                    privileged = (
+                        container.security_context
+                        and container.security_context.privileged
+                    )
+                    resource_values = [x.strip() for x in r_v.split(",")]
+
+                    # Request devices.
+                    if r_v == "all":
+                        # Configure privileged.
+                        container.security_context = (
                             container.security_context
-
+                            or kubernetes.client.V1SecurityContext()
                         )
-
-                        #
-
-
-
-
-
+                        container.security_context.privileged = True
+                        # Request all devices.
+                        for ren in runtime_envs:
+                            r_vs = self.get_runtime_visible_devices(ren, fmt)
+                            # Request device via KDP.
+                            if kdp:
+                                resources.update(
+                                    dict.fromkeys(r_vs, "1"),
+                                )
+                                continue
+                            # Request device via visible devices env.
+                            container.env.append(
+                                kubernetes.client.V1EnvVar(
+                                    name=ren,
+                                    value=",".join(r_vs),
+                                ),
                             )
-
-
-
-                            #
-
-
-
-
-
-
-
-
-                                    resources[kdp_resource] = "1"
-                                    continue
-                                # Request device via visible devices env.
-                                rv = ",".join(vd_values.get(re) or ["all"])
-                                container.env.append(
-                                    kubernetes.client.V1EnvVar(
-                                        name=re,
-                                        value=rv,
-                                    ),
+                    else:
+                        # Request specific devices.
+                        for ren in runtime_envs:
+                            # Request all devices if privileged,
+                            # otherwise, normalize requested devices.
+                            if privileged:
+                                r_vs = self.get_runtime_visible_devices(ren, fmt)
+                            else:
+                                r_vs = self.map_runtime_visible_devices(
+                                    ren,
+                                    resource_values,
+                                    fmt,
                                 )
-
-
-
-
-                            # and mount corresponding libs if needed.
-                            for re in runtime_env:
-                                # Request device via KDP.
-                                if kdp:
-                                    if not privileged:
-                                        for v in str(r_v).split(","):
-                                            kdp_resource = cdi_kind_to_kdp_resource(
-                                                cdi_kind=vd_cdis[re],
-                                                device_index=int(v.strip()),
-                                            )
-                                            resources[kdp_resource] = "1"
-                                    else:
-                                        for v in vd_values.get(re) or []:
-                                            kdp_resource = cdi_kind_to_kdp_resource(
-                                                cdi_kind=vd_cdis[re],
-                                                device_index=v,
-                                            )
-                                            resources[kdp_resource] = "1"
-                                    continue
-                                # Request device via visible devices env.
-                                if not privileged:
-                                    rv = str(r_v)
-                                else:
-                                    rv = ",".join(vd_values.get(re) or ["all"])
-                                container.env.append(
-                                    kubernetes.client.V1EnvVar(
-                                        name=re,
-                                        value=rv,
-                                    ),
+                            # Request device via KDP.
+                            if kdp:
+                                resources.update(
+                                    dict.fromkeys(r_vs, "1"),
                                 )
+                                continue
+                            # Request device via visible devices env.
+                            container.env.append(
+                                kubernetes.client.V1EnvVar(
+                                    name=ren,
+                                    value=",".join(r_vs),
+                                ),
+                            )

-
-
-
-
-
-
-
-
-
-
-
+                    # Configure runtime device access environment variables.
+                    if r_v != "all" and privileged:
+                        b_vs = self.map_backend_visible_devices(
+                            runtime_envs,
+                            resource_values,
+                        )
+                        container.env.extend(
+                            [
+                                kubernetes.client.V1EnvVar(
+                                    name=be,
+                                    value=be_v,
                                 )
+                                for be, be_v in b_vs.items()
+                            ],
+                        )

                 container.resources = kubernetes.client.V1ResourceRequirements(
                     limits=(resources if resources else None),
@@ -1229,6 +1206,10 @@ class KubernetesDeployer(EndoscopicDeployer):
         self._client = self._get_client()
         self._node_name = envs.GPUSTACK_RUNTIME_KUBERNETES_NODE_NAME

+    @property
+    def allowed_uuid_values(self) -> bool:
+        return get_resource_injection_policy() != "kdp"
+
     def _prepare_mirrored_deployment(self):
         """
         Prepare for mirrored deployment.
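Taken together, the kuberentes.py hunks replace the per-workload resource_key_runtime_env_mapping / resource_key_backend_env_mapping fields with the environment-driven map envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES plus a resource-injection policy ("kdp" or plain env injection). The standalone sketch below only illustrates that branching; the mapping contents, the KDP resource-name shape, and the inject helper are hypothetical and are not gpustack-runtime APIs.

# Minimal standalone sketch of the new injection branching (hypothetical names/values).
RESOURCE_KEY_TO_RUNTIME_ENV = {
    "nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES",  # assumed example mapping
}


def inject(resource_key: str, resource_value: str, kdp: bool):
    """Return (resource limits, container env) for one device resource request."""
    limits: dict[str, str] = {}
    env: dict[str, str] = {}

    requested = [v.strip() for v in resource_value.split(",")]

    if kdp:
        # KDP-style policy: request one unit of a per-device extended resource
        # per requested device (the resource-name shape here is hypothetical).
        for idx in requested:
            limits[f"{resource_key}-{idx}"] = "1"
    else:
        # Plain policy: hand the indices to the container runtime via the env var.
        env[RESOURCE_KEY_TO_RUNTIME_ENV[resource_key]] = ",".join(requested)
    return limits, env


print(inject("nvidia.com/gpu", "0,1", kdp=False))
# -> ({}, {'NVIDIA_VISIBLE_DEVICES': '0,1'})
print(inject("nvidia.com/gpu", "0,1", kdp=True))
# -> ({'nvidia.com/gpu-0': '1', 'nvidia.com/gpu-1': '1'}, {})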
gpustack_runtime/deployer/podman.py

@@ -4,13 +4,11 @@ import contextlib
 import io
 import json
 import logging
-import operator
 import os
 import socket
 import sys
 import tarfile
 from dataclasses import dataclass, field
-from functools import reduce
 from math import ceil
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
@@ -84,17 +82,6 @@ class PodmanWorkloadPlan(WorkloadPlan):
             Image used for the pause container.
         unhealthy_restart_image (str):
             Image used for unhealthy restart container.
-        resource_key_runtime_env_mapping: (dict[str, str]):
-            Mapping from resource names to environment variable names for device allocation,
-            which is used to tell the Container Runtime which GPUs to mount into the container.
-            For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
-            which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
-            With privileged mode, the container can access all GPUs even if specified.
-        resource_key_backend_env_mapping: (dict[str, list[str]]):
-            Mapping from resource names to environment variable names for device runtime,
-            which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
-            For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
-            which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
         namespace (str | None):
             Namespace of the workload.
         name (str):
@@ -952,120 +939,126 @@ class PodmanDeployer(EndoscopicDeployer):

             # Parameterize resources.
             if c.resources:
-
-                r_k_backend_env = workload.resource_key_backend_env_mapping or {}
-                vd_manus, vd_env, vd_cdis, vd_values = (
-                    self.get_visible_devices_materials()
-                )
-                for r_k, r_v in c.resources.items():
-                    match r_k:
-                        case "cpu":
-                            if isinstance(r_v, int | float):
-                                create_options["cpu_shares"] = ceil(r_v * 1024)
-                            elif isinstance(r_v, str) and r_v.isdigit():
-                                create_options["cpu_shares"] = ceil(float(r_v) * 1024)
-                        case "memory":
-                            if isinstance(r_v, int):
-                                create_options["mem_limit"] = r_v
-                                create_options["mem_reservation"] = r_v
-                                create_options["memswap_limit"] = r_v
-                            elif isinstance(r_v, str):
-                                v = r_v.lower().removesuffix("i")
-                                create_options["mem_limit"] = v
-                                create_options["mem_reservation"] = v
-                                create_options["memswap_limit"] = v
-                        case _:
-                            if r_k in r_k_runtime_env:
-                                # Set env if resource key is mapped.
-                                runtime_env = [r_k_runtime_env[r_k]]
-                            elif (
-                                r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
-                            ):
-                                # Set env if auto-mapping key is matched.
-                                runtime_env = list(vd_env.keys())
-                            else:
-                                continue
+                fmt = "cdi"

-
-
-
-
-
-
-
-
-
+                for r_k, r_v in c.resources.items():
+                    if r_k == "cpu":
+                        if isinstance(r_v, int | float):
+                            create_options["cpu_shares"] = ceil(r_v * 1024)
+                        elif isinstance(r_v, str) and r_v.isdigit():
+                            create_options["cpu_shares"] = ceil(float(r_v) * 1024)
+                        continue
+                    if r_k == "memory":
+                        if isinstance(r_v, int):
+                            create_options["mem_limit"] = r_v
+                            create_options["mem_reservation"] = r_v
+                            create_options["memswap_limit"] = r_v
+                        elif isinstance(r_v, str):
+                            v = r_v.lower().removesuffix("i")
+                            create_options["mem_limit"] = v
+                            create_options["mem_reservation"] = v
+                            create_options["memswap_limit"] = v
+                        continue

-
+                    if (
+                        r_k
+                        in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+                    ):
+                        # Set env if resource key is mapped.
+                        runtime_envs = [
+                            envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+                                r_k
+                            ],
+                        ]
+                    elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+                        # Set env if auto-mapping key is matched.
+                        runtime_envs = self.get_runtime_envs()
+                    else:
+                        continue

-
-
-
-
-
-
+                    privileged = create_options.get("privileged", False)
+                    resource_values = [x.strip() for x in r_v.split(",")]
+
+                    # Generate CDI config if not yet.
+                    if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
+                        for ren in runtime_envs:
+                            manu = self.get_manufacturer(ren)
+                            cdi_config, cdi_config_path = cdi_dump_config(
+                                manufacturer=manu,
+                                output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+                            )
+                            if cdi_config and cdi_config_path:
+                                if logger.isEnabledFor(logging.DEBUG):
+                                    logger.debug(
+                                        "Generated CDI configuration for '%s' at '%s':\n%s",
+                                        manu,
+                                        cdi_config_path,
+                                        cdi_config,
                                    )
-
-
-
-
-
-                                # Then, set container backend visible devices env to all devices,
-                                # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                                # and mount corresponding libs if needed.
-                                for re in runtime_env:
-                                    # Request device via CDI.
-                                    rv = [
-                                        f"{vd_cdis[re]}={v}"
-                                        for v in (vd_values.get(re) or ["all"])
-                                    ]
-                                    if "devices" not in create_options:
-                                        create_options["devices"] = []
-                                    create_options["devices"].extend(rv)
-                            else:
-                                # Set env to the allocated device IDs if no privileged,
-                                # otherwise, set container backend visible devices env to all devices,
-                                # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                                # and mount corresponding libs if needed.
-                                for re in runtime_env:
-                                    # Request device via CDI.
-                                    if not privileged:
-                                        rv = [
-                                            f"{vd_cdis[re]}={v.strip()}"
-                                            for v in r_v.split(",")
-                                        ]
-                                    else:
-                                        rv = [
-                                            f"{vd_cdis[re]}={v}"
-                                            for v in (vd_values.get(re) or ["all"])
-                                        ]
-                                    if "devices" not in create_options:
-                                        create_options["devices"] = []
-                                    create_options["devices"].extend(rv)
-
-                                # Configure runtime device access environment variables.
-                                if r_v != "all" and privileged:
-                                    for be in backend_env:
-                                        create_options["environment"][be] = (
-                                            self.align_backend_visible_devices_env_values(
-                                                be,
-                                                str(r_v),
-                                            )
+                                else:
+                                    logger.info(
+                                        "Generated CDI configuration for '%s' at '%s'",
+                                        manu,
+                                        cdi_config_path,
                                    )
+                            elif cdi_config:
+                                logger.info(
+                                    "Reuse generated CDI configuration for '%s'",
+                                    manu,
+                                )
+                            else:
+                                logger.warning(
+                                    "Delegated CDI configuration by other tools for '%s', "
+                                    "e.g. for NVIDIA devices, please follow NVIDIA Container Toolkit Manual CDI Specification Generation to generate the CDI configuration, "
+                                    "see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html#manual-cdi-specification-generation",
+                                    manu,
+                                )

-
-
-
-
-
-
-
-
+                    # Configure device access environment variable.
+                    if r_v == "all":
+                        # Configure privileged.
+                        create_options["privileged"] = True
+                        # Request all devices.
+                        for ren in runtime_envs:
+                            r_vs = self.get_runtime_visible_devices(ren, fmt)
+                            # Request device via CDI.
+                            if "devices" not in create_options:
+                                create_options["devices"] = []
+                            create_options["devices"].extend(r_vs)
+                    else:
+                        # Request specific devices.
+                        for ren in runtime_envs:
+                            # Request all devices if privileged,
+                            # otherwise, normalize requested devices.
+                            if privileged:
+                                r_vs = self.get_runtime_visible_devices(ren, fmt)
+                            else:
+                                r_vs = self.map_runtime_visible_devices(
+                                    ren,
+                                    resource_values,
+                                    fmt,
                                )
-
-
-
-
+                            # Request device via CDI.
+                            if "devices" not in create_options:
+                                create_options["devices"] = []
+                            create_options["devices"].extend(r_vs)
+
+                    # If not requesting all devices but privileged,
+                    # must configure visible devices.
+                    if r_v != "all" and privileged:
+                        b_vs = self.map_backend_visible_devices(
+                            runtime_envs,
+                            resource_values,
+                        )
+                        create_options["environment"].update(b_vs)
+
+                    # Configure affinity if applicable.
+                    create_options.update(
+                        self.map_visible_devices_affinities(
+                            runtime_envs,
+                            resource_values,
+                        ),
+                    )

             # Parameterize mounts.
             self._append_container_mounts(
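The new Podman path hands devices to the engine as fully qualified CDI device references rather than raw index strings. The helper below is a standalone sketch of that formatting only; the CDI kind is a placeholder, and in the diff the actual values come from get_runtime_visible_devices / map_runtime_visible_devices with fmt = "cdi".

def cdi_device_refs(cdi_kind: str, requested: list[str] | None) -> list[str]:
    # A fully qualified CDI device name has the form "<vendor>/<class>=<device>".
    # No explicit request expands to the catch-all "all" device, mirroring the
    # old code's `vd_values.get(re) or ["all"]` fallback.
    values = requested or ["all"]
    return [f"{cdi_kind}={v}" for v in values]


print(cdi_device_refs("nvidia.com/gpu", ["0", "1"]))  # ['nvidia.com/gpu=0', 'nvidia.com/gpu=1']
print(cdi_device_refs("nvidia.com/gpu", None))        # ['nvidia.com/gpu=all']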
gpustack_runtime/detector/__init__.py

@@ -7,6 +7,7 @@ from ..logging import debug_log_exception
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,
@@ -292,6 +293,7 @@ def filter_devices_by_manufacturer(

 __all__ = [
     "Device",
+    "DeviceMemoryStatusEnum",
     "Devices",
     "ManufacturerEnum",
     "Topology",
gpustack_runtime/detector/__types__.py

@@ -122,6 +122,28 @@ def backend_to_manufacturer(backend: str) -> ManufacturerEnum:
     return ManufacturerEnum.UNKNOWN


+class DeviceMemoryStatusEnum(str, Enum):
+    """
+    Enum for Device Memory Status.
+    """
+
+    HEALTHY = "healthy"
+    """
+    Device is healthy.
+    """
+    UNHEALTHY = "unhealthy"
+    """
+    Device is unhealthy.
+    """
+    UNKNOWN = "unknown"
+    """
+    Device status is unknown.
+    """
+
+    def __str__(self):
+        return self.value
+
+
 @dataclass_json
 @dataclass
 class Device:
@@ -185,6 +207,10 @@ class Device:
     """
     Memory utilization of the device in percentage.
     """
+    memory_status: DeviceMemoryStatusEnum = DeviceMemoryStatusEnum.UNKNOWN
+    """
+    Status of the device.
+    """
     temperature: int | float | None = None
     """
     Temperature of the device in Celsius.
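The added DeviceMemoryStatusEnum subclasses both str and Enum, so its members compare and serialize as plain strings, which keeps the new Device.memory_status field easy to emit in its string form. Below is a minimal standalone re-creation of the pattern for illustration, not an import from the package.

from enum import Enum


# Standalone re-creation of the str-based enum added in detector/__types__.py.
class DeviceMemoryStatusEnum(str, Enum):
    HEALTHY = "healthy"
    UNHEALTHY = "unhealthy"
    UNKNOWN = "unknown"

    def __str__(self):
        return self.value


status = DeviceMemoryStatusEnum("unhealthy")        # constructible from the raw string
print(status is DeviceMemoryStatusEnum.UNHEALTHY)   # True
print(str(status))                                  # "unhealthy", via the __str__ override
print(status == "unhealthy")                        # True: a str subclass compares to plain strings
print(DeviceMemoryStatusEnum.UNKNOWN.value)         # "unknown", the Device.memory_status default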