gpustack-runtime 0.1.41.post3-py3-none-any.whl → 0.1.42.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +4 -2
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__init__.py +1 -1
- gpustack_runtime/deployer/cdi/__types__.py +2 -2
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/cdi/amd.py +6 -8
- gpustack_runtime/deployer/cdi/ascend.py +7 -9
- gpustack_runtime/deployer/cdi/hygon.py +6 -8
- gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
- gpustack_runtime/deployer/cdi/metax.py +6 -8
- gpustack_runtime/deployer/cdi/thead.py +6 -8
- gpustack_runtime/deployer/docker.py +133 -146
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
- gpustack_runtime/deployer/kuberentes.py +89 -108
- gpustack_runtime/deployer/podman.py +113 -120
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/__utils__.py +3 -0
- gpustack_runtime/detector/amd.py +32 -10
- gpustack_runtime/detector/ascend.py +67 -13
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +22 -3
- gpustack_runtime/detector/iluvatar.py +15 -7
- gpustack_runtime/detector/metax.py +16 -6
- gpustack_runtime/detector/mthreads.py +22 -8
- gpustack_runtime/detector/nvidia.py +148 -140
- gpustack_runtime/detector/pyacl/__init__.py +34 -14
- gpustack_runtime/detector/pydcmi/__init__.py +4 -2
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +145 -134
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
- gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
- gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0
@@ -82,7 +82,7 @@ def dump_config(
     if cdi_path.exists():
         actual = cdi_path.read_text(encoding="utf-8")
         if actual == expected:
-            return expected,
+            return expected, None

     cdi_path.write_text(expected, encoding="utf-8")
     return expected, str(cdi_path)

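The changed return contract is small but load-bearing: callers later in this release branch on whether the second element is a path or `None`. A standalone sketch of that contract (hypothetical `dump_config_sketch`, reduced to the lines shown above):

```python
from pathlib import Path


def dump_config_sketch(expected: str, cdi_path: Path) -> tuple[str, str | None]:
    # Second element is None when the on-disk spec already matches,
    # otherwise the path that was (re)written.
    if cdi_path.exists():
        actual = cdi_path.read_text(encoding="utf-8")
        if actual == expected:
            return expected, None
    cdi_path.write_text(expected, encoding="utf-8")
    return expected, str(cdi_path)


spec = '{"cdiVersion": "0.6.0"}'
target = Path("/tmp/example-cdi.json")
print(dump_config_sketch(spec, target))  # first call writes: (spec, '/tmp/example-cdi.json')
print(dump_config_sketch(spec, target))  # second call is a no-op: (spec, None)
```
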
@@ -527,7 +527,7 @@ class Config(dict):
         self,
         kind: str,
         devices: list[ConfigDevice],
-        container_edits:
+        container_edits: ConfigContainerEdits | None = None,
         cdi_version: str = _DEFAULT_CDI_VERSION,
         annotations: dict[str, str] | None = None,
     ):

@@ -581,7 +581,7 @@ class Config(dict):
         return self["kind"]

     @property
-    def container_edits(self) ->
+    def container_edits(self) -> ConfigContainerEdits | None:
         """
         Return the list of container edits in the CDI configuration.

@@ -147,6 +147,7 @@ def path_to_cdi_mount(
     path: str,
     container_path: str | None = None,
     options: list[str] | None = None,
+    ignore_notfound: bool = False,
 ) -> ConfigMount | None:
     """
     Convert a file/directory path to a ConfigMount.

@@ -158,13 +159,15 @@ def path_to_cdi_mount(
             Path to the file or directory inside the container.
         options:
             Mount options.
+        ignore_notfound:
+            Whether to ignore if the path does not exist.

     Returns:
         The ConfigMount object.
         None if the path does not exist.

     """
-    if not Path(path).exists():
+    if not Path(path).exists() and not ignore_notfound:
         return None

     if container_path is None:

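A minimal standalone sketch of the new guard, assuming the same semantics as above; `path_to_mount_sketch` is hypothetical and returns a plain dict in place of `ConfigMount`:

```python
from pathlib import Path


def path_to_mount_sketch(
    path: str,
    container_path: str | None = None,
    ignore_notfound: bool = False,
) -> dict | None:
    # A missing host path only short-circuits when ignore_notfound is False.
    if not Path(path).exists() and not ignore_notfound:
        return None
    # Default the container path to the host path.
    if container_path is None:
        container_path = path
    return {"hostPath": path, "containerPath": container_path}


# A non-existent path now still yields a mount entry when explicitly requested.
print(path_to_mount_sketch("/dev/does-not-exist"))                        # None
print(path_to_mount_sketch("/dev/does-not-exist", ignore_notfound=True))  # dict
```
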
@@ -135,12 +135,10 @@ class AMDGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=
-
-
-
-
-
-            ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )

@@ -152,13 +152,11 @@ class AscendGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=
-
-
-
-
-
-
-            ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+                mounts=common_mounts,
+            ),
         )

@@ -136,12 +136,10 @@ class HygonGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=
-
-
-
-
-
-            ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )

@@ -125,12 +125,10 @@ class IluvatarGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=
-
-
-
-
-
-            ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )

@@ -137,12 +137,10 @@ class MetaXGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=
-
-
-
-
-
-            ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )

@@ -126,12 +126,10 @@ class THeadGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=
-
-
-
-
-
-            ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )

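All six generators now build their global container edits through `ConfigContainerEdits` instead of hand-assembled structures. For orientation, the CDI document such edits serialize to has roughly this shape (field names follow the CDI specification; the kind, env name, and device-node paths below are illustrative, not taken from the package):

```python
# Illustrative CDI spec shape; values are examples, not the package's exact output.
cdi_spec = {
    "cdiVersion": "0.6.0",
    "kind": "amd.com/gpu",  # illustrative kind
    "devices": [
        {
            "name": "0",
            "containerEdits": {"deviceNodes": [{"path": "/dev/dri/renderD128"}]},
        },
    ],
    # Global edits applied to every container that requests a device of this kind,
    # mirroring ConfigContainerEdits(env=[...], device_nodes=...) above.
    "containerEdits": {
        "env": ["EXAMPLE_VISIBLE_DEVICES=void"],  # mirrors f"{runtime_env}=void"
        "deviceNodes": [{"path": "/dev/kfd"}],
    },
}
```
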
@@ -4,13 +4,11 @@ import contextlib
 import io
 import json
 import logging
-import operator
 import os
 import socket
 import sys
 import tarfile
 from dataclasses import dataclass, field
-from functools import reduce
 from math import ceil
 from pathlib import Path
 from typing import TYPE_CHECKING, Any

@@ -81,17 +79,6 @@ class DockerWorkloadPlan(WorkloadPlan):
             Image used for the pause container.
         unhealthy_restart_image (str):
             Image used for unhealthy restart container.
-        resource_key_runtime_env_mapping: (dict[str, str]):
-            Mapping from resource names to environment variable names for device allocation,
-            which is used to tell the Container Runtime which GPUs to mount into the container.
-            For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
-            which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
-            With privileged mode, the container can access all GPUs even if specified.
-        resource_key_backend_env_mapping: (dict[str, list[str]]):
-            Mapping from resource names to environment variable names for device runtime,
-            which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
-            For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
-            which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
         namespace (str | None):
             Namespace of the workload.
         name (str):

@@ -845,7 +832,7 @@ class DockerDeployer(EndoscopicDeployer):
             msg = f"Failed to upload ephemeral files to container {container.name}"
             raise OperationError(msg)

-    def _create_containers(
+    def _create_containers(
         self,
         workload: DockerWorkloadPlan,
         ephemeral_volume_name_mapping: dict[str, str],

@@ -955,146 +942,146 @@ class DockerDeployer(EndoscopicDeployer):
                 envs.GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY.lower()
                 == "cdi"
             )
+            fmt = "plain" if not cdi else "cdi"

-            r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
-            r_k_backend_env = workload.resource_key_backend_env_mapping or {}
-            vd_manus, vd_env, vd_cdis, vd_values = (
-                self.get_visible_devices_materials()
-            )
             for r_k, r_v in c.resources.items():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                if r_k in r_k_runtime_env:
-                    # Set env if resource key is mapped.
-                    runtime_env = [r_k_runtime_env[r_k]]
-                elif (
-                    r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
-                ):
-                    # Set env if auto-mapping key is matched.
-                    runtime_env = list(vd_env.keys())
-                else:
-                    continue
-
-                if r_k in r_k_backend_env:
-                    # Set env if resource key is mapped.
-                    backend_env = r_k_backend_env[r_k]
-                else:
-                    # Otherwise, use the default backend env names.
-                    backend_env = reduce(
-                        operator.add,
-                        list(vd_env.values()),
-                    )
+                if r_k == "cpu":
+                    if isinstance(r_v, int | float):
+                        create_options["cpu_shares"] = ceil(r_v * 1024)
+                    elif isinstance(r_v, str) and r_v.isdigit():
+                        create_options["cpu_shares"] = ceil(float(r_v) * 1024)
+                    continue
+                if r_k == "memory":
+                    if isinstance(r_v, int):
+                        create_options["mem_limit"] = r_v
+                        create_options["mem_reservation"] = r_v
+                        create_options["memswap_limit"] = r_v
+                    elif isinstance(r_v, str):
+                        v = r_v.lower().removesuffix("i")
+                        create_options["mem_limit"] = v
+                        create_options["mem_reservation"] = v
+                        create_options["memswap_limit"] = v
+                    continue

-
+                if (
+                    r_k
+                    in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+                ):
+                    # Set env if resource key is mapped.
+                    runtime_envs = [
+                        envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+                            r_k
+                        ],
+                    ]
+                elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+                    # Set env if auto-mapping key is matched.
+                    runtime_envs = self.get_runtime_envs()
+                else:
+                    continue

-
-
-
-
-
-
+                privileged = create_options.get("privileged", False)
+                resource_values = [x.strip() for x in r_v.split(",")]
+
+                # Generate CDI config if not yet.
+                if cdi and envs.GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE:
+                    for ren in runtime_envs:
+                        manu = self.get_manufacturer(ren)
+                        cdi_config, cdi_config_path = cdi_dump_config(
+                            manufacturer=manu,
+                            output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+                        )
+                        if cdi_config and cdi_config_path:
+                            if logger.isEnabledFor(logging.DEBUG):
+                                logger.debug(
+                                    "Generated CDI configuration for '%s' at '%s':\n%s",
+                                    manu,
+                                    cdi_config_path,
+                                    cdi_config,
                                 )
-
-
-
-
-
-                # Then, set container backend visible devices env to all devices,
-                # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                # and mount corresponding libs if needed.
-                for re in runtime_env:
-                    # Request device via CDI.
-                    if cdi:
-                        rv = [
-                            f"{vd_cdis[re]}={v}"
-                            for v in (vd_values.get(re) or ["all"])
-                        ]
-                        if "device_requests" not in create_options:
-                            create_options["device_requests"] = []
-                        create_options["device_requests"].append(
-                            docker.types.DeviceRequest(
-                                driver="cdi",
-                                count=0,
-                                device_ids=rv,
-                            ),
-                        )
-                        continue
-                    # Request device via visible devices env.
-                    rv = ",".join(vd_values.get(re) or ["all"])
-                    create_options["environment"][re] = rv
-                else:
-                    # Set env to the allocated device IDs if no privileged,
-                    # otherwise, set container backend visible devices env to all devices,
-                    # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                    # and mount corresponding libs if needed.
-                    for re in runtime_env:
-                        # Request device via CDI.
-                        if cdi:
-                            if not privileged:
-                                rv = [
-                                    f"{vd_cdis[re]}={v.strip()}"
-                                    for v in r_v.split(",")
-                                ]
-                            else:
-                                rv = [
-                                    f"{vd_cdis[re]}={v}"
-                                    for v in (vd_values.get(re) or ["all"])
-                                ]
-                            if "device_requests" not in create_options:
-                                create_options["device_requests"] = []
-                            create_options["device_requests"].append(
-                                docker.types.DeviceRequest(
-                                    driver="cdi",
-                                    count=0,
-                                    device_ids=rv,
-                                ),
-                            )
-                            continue
-                        # Request device via visible devices env.
-                        if not privileged:
-                            rv = str(r_v)
-                        else:
-                            rv = ",".join(vd_values.get(re) or ["all"])
-                        create_options["environment"][re] = rv
-
-                # Configure runtime device access environment variables.
-                if r_v != "all" and privileged:
-                    for be in backend_env:
-                        create_options["environment"][be] = (
-                            self.align_backend_visible_devices_env_values(
-                                be,
-                                str(r_v),
-                            )
+                            else:
+                                logger.info(
+                                    "Generated CDI configuration for '%s' at '%s'",
+                                    manu,
+                                    cdi_config_path,
                                 )
+                        elif cdi_config:
+                            logger.info(
+                                "Reuse generated CDI configuration for '%s'",
+                                manu,
+                            )
+                        else:
+                            logger.warning(
+                                "Delegated CDI configuration by other tools for '%s', "
+                                "e.g. for NVIDIA devices, please follow NVIDIA Container Toolkit Manual CDI Specification Generation to generate the CDI configuration, "
+                                "see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html#manual-cdi-specification-generation",
+                                manu,
+                            )

-
-
-
-
-
-
-
-
+                # Request devices.
+                if r_v == "all":
+                    # Configure privileged.
+                    create_options["privileged"] = True
+                    # Request all devices.
+                    for ren in runtime_envs:
+                        r_vs = self.get_runtime_visible_devices(ren, fmt)
+                        # Request device via CDI.
+                        if cdi:
+                            if "device_requests" not in create_options:
+                                create_options["device_requests"] = []
+                            create_options["device_requests"].append(
+                                docker.types.DeviceRequest(
+                                    driver="cdi",
+                                    count=0,
+                                    device_ids=r_vs,
+                                ),
+                            )
+                            continue
+                        # Request device via visible devices env.
+                        create_options["environment"][ren] = ",".join(r_vs)
+                else:
+                    # Request specific devices.
+                    for ren in runtime_envs:
+                        # Request all devices if privileged,
+                        # otherwise, normalize requested devices.
+                        if privileged:
+                            r_vs = self.get_runtime_visible_devices(ren, fmt)
+                        else:
+                            r_vs = self.map_runtime_visible_devices(
+                                ren,
+                                resource_values,
+                                fmt,
                             )
-
-
-                        if
-                        create_options["
+                        # Request device via CDI.
+                        if cdi:
+                            if "device_requests" not in create_options:
+                                create_options["device_requests"] = []
+                            create_options["device_requests"].append(
+                                docker.types.DeviceRequest(
+                                    driver="cdi",
+                                    count=0,
+                                    device_ids=r_vs,
+                                ),
+                            )
+                            continue
+                        # Request device via visible devices env.
+                        create_options["environment"][ren] = ",".join(r_vs)
+
+                # If not requesting all devices but privileged,
+                # must configure visible devices.
+                if r_v != "all" and privileged:
+                    b_vs = self.map_backend_visible_devices(
+                        runtime_envs,
+                        resource_values,
+                    )
+                    create_options["environment"].update(b_vs)
+
+                # Configure affinity if applicable.
+                create_options.update(
+                    self.map_visible_devices_affinities(
+                        runtime_envs,
+                        resource_values,
+                    ),
+                )

             # Parameterize mounts.
             self._append_container_mounts(

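The rewritten resource loop converts `cpu`/`memory` into Docker resource options and everything else into either a CDI `DeviceRequest` or a visible-devices environment variable. A minimal sketch of the same calls through the Docker SDK for Python, assuming a daemon with CDI support enabled; the image and device IDs are illustrative:

```python
import docker

client = docker.from_env()

# cpu/memory resources map onto create options; device resources become a CDI
# DeviceRequest when the injection policy is "cdi" (with the "plain" policy the
# deployer sets the runtime's visible-devices env var instead).
container = client.containers.create(
    image="ubuntu:24.04",
    command="sleep infinity",
    cpu_shares=2048,       # ceil(2 CPUs * 1024)
    mem_limit="8g",
    mem_reservation="8g",
    device_requests=[
        docker.types.DeviceRequest(
            driver="cdi",
            count=0,
            device_ids=["nvidia.com/gpu=0", "nvidia.com/gpu=1"],  # illustrative IDs
        ),
    ],
)
print(container.id)
```
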
@@ -78,28 +78,33 @@ async def serve_async(
         allocation_policy == "cdi"
         and envs.GPUSTACK_RUNTIME_KUBERNETES_KDP_CDI_SPECS_GENERATE
     ):
-
+        cdi_config, cdi_config_path = cdi_dump_config(
             manufacturer=manu,
             output=cdi_generation_output,
         )
-        if
+        if cdi_config and cdi_config_path:
             if logger.isEnabledFor(logging.DEBUG):
                 logger.debug(
                     "Generated CDI configuration for '%s' at '%s':\n%s",
                     manu,
-
-
+                    cdi_config_path,
+                    cdi_config,
                 )
             else:
                 logger.info(
                     "Generated CDI configuration for '%s' at '%s'",
                     manu,
-
+                    cdi_config_path,
                 )
+        elif cdi_config:
+            logger.info(
+                "Reuse generated CDI configuration for '%s'",
+                manu,
+            )
         else:
             logger.warning(
-                "Delegated CDI configuration by other tools for
-                "e.g. NVIDIA Container Toolkit Manual CDI Specification Generation, "
+                "Delegated CDI configuration by other tools for '%s', "
+                "e.g. for NVIDIA devices, please follow NVIDIA Container Toolkit Manual CDI Specification Generation to generate the CDI configuration, "
                 "see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html#manual-cdi-specification-generation",
                 manu,
             )

@@ -323,7 +328,7 @@ def get_device_allocation_policy(

     if manufacturer in [
         ManufacturerEnum.AMD,
-
+        ManufacturerEnum.ASCEND,
         ManufacturerEnum.HYGON,
         ManufacturerEnum.ILUVATAR,
         ManufacturerEnum.METAX,

@@ -11,7 +11,8 @@ import grpc
 from grpc_interceptor import AsyncServerInterceptor
 from grpc_interceptor.exceptions import GrpcException

-from ....detector import Device, str_range_to_list
+from ....detector import Device, DeviceMemoryStatusEnum, str_range_to_list
+from ....detector.__utils__ import get_numa_node_size
 from ...cdi import (
     generate_config,
     manufacturer_to_cdi_kind,

@@ -40,6 +41,7 @@ from ..types.kubelet.deviceplugin.v1beta1 import (
     RegisterRequest,
     RegistrationStub,
     TopologyInfo,
+    Unhealthy,
     Version,
     add_DevicePluginServicer_to_server,
 )

@@ -159,7 +161,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         self._runtime_env = manufacturer_to_runtime_env(device.manufacturer)
         self._kdp_resource = cdi_kind_to_kdp_resource(
             cdi_kind=self._cdi_kind,
-            device_index=device.index,
+            device_index=str(device.index),
         )

         super().__init__(self._kdp_resource)

@@ -334,25 +336,31 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
             The response containing the list of devices.

         """
-        device_id = (
-            self._device.uuid if self._id_by == "uuid" else str(self._device.index)
-        )
-
         dp_devices: list[DevicePluginDevice] = []
-        dp_device_health =
+        dp_device_health = (
+            Healthy
+            if self._device.memory_status == DeviceMemoryStatusEnum.HEALTHY
+            else Unhealthy
+        )
         dp_device_topo = TopologyInfo(
             nodes=[
                 NUMANode(
                     ID=node_id,
                 )
-                for node_id in
-
+                for node_id in (
+                    str_range_to_list(
+                        self._device.appendix.get("numa", ""),
+                    )
+                    or list(range(get_numa_node_size()))
                 )
             ],
         )

         for device_replica in range(1, self._max_allocations + 1):
-            dp_device_id = _to_device_plugin_device_id(
+            dp_device_id = _to_device_plugin_device_id(
+                str(self._device.index),
+                device_replica,
+            )
             dp_devices.append(
                 DevicePluginDevice(
                     ID=dp_device_id,

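The NUMA topology now falls back to every node when the device reports no `numa` range. A hypothetical re-implementation of a `str_range_to_list`-style helper to make that fallback concrete (the real helper lives in `gpustack_runtime.detector`; its exact behaviour is assumed here):

```python
def str_range_to_list_sketch(value: str) -> list[int]:
    # Assumed semantics: "0-1,3" -> [0, 1, 3]; empty input -> [].
    result: list[int] = []
    for part in value.split(","):
        part = part.strip()
        if not part:
            continue
        if "-" in part:
            lo, hi = part.split("-", 1)
            result.extend(range(int(lo), int(hi) + 1))
        else:
            result.append(int(part))
    return result


numa_node_count = 2  # stand-in for get_numa_node_size()
# Mirrors the fallback above: an empty "numa" appendix maps to every node.
print(str_range_to_list_sketch("") or list(range(numa_node_count)))  # [0, 1]
print(str_range_to_list_sketch("0-1,3"))                             # [0, 1, 3]
```
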
@@ -419,28 +427,25 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         req: ContainerAllocateRequest,
     ) -> ContainerAllocateResponse:
         policy = self._allocation_policy
-
+        device_id = self._device.uuid
+        if self._id_by == "index":
+            device_id = str(self._device.index)

         # CDI device allocation.
         if policy == "cdi":
-
-
-            device_id, _ = _from_device_plugin_device_id(dp_device_id)
-            cdi_devices.append(
+            return ContainerAllocateResponse(
+                cdi_devices=[
                     CDIDevice(
                         name=f"{self._cdi_kind}={device_id}",
                     ),
-
-
-            return ContainerAllocateResponse(
-                cdi_devices=cdi_devices,
+                ],
             )

         # Environment variable device allocation.
         if policy == "env":
             return ContainerAllocateResponse(
                 envs={
-                    self._runtime_env:
+                    self._runtime_env: device_id,
                 },
             )

@@ -509,7 +514,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
 @lru_cache
 def cdi_kind_to_kdp_resource(
     cdi_kind: str,
-    device_index:
+    device_index: str,
 ) -> str:
     """
     Map CDI kind and device index to a Kubernetes Device Plugin resource name.