gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +3 -1
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/docker.py +109 -148
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +1 -1
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
- gpustack_runtime/deployer/kuberentes.py +89 -108
- gpustack_runtime/deployer/podman.py +89 -122
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/amd.py +28 -8
- gpustack_runtime/detector/ascend.py +49 -4
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +16 -1
- gpustack_runtime/detector/iluvatar.py +6 -0
- gpustack_runtime/detector/metax.py +8 -0
- gpustack_runtime/detector/mthreads.py +11 -0
- gpustack_runtime/detector/nvidia.py +139 -134
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +135 -127
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
|
@@ -147,6 +147,7 @@ def path_to_cdi_mount(
|
|
|
147
147
|
path: str,
|
|
148
148
|
container_path: str | None = None,
|
|
149
149
|
options: list[str] | None = None,
|
|
150
|
+
ignore_notfound: bool = False,
|
|
150
151
|
) -> ConfigMount | None:
|
|
151
152
|
"""
|
|
152
153
|
Convert a file/directory path to a ConfigMount.
|
|
@@ -158,13 +159,15 @@ def path_to_cdi_mount(
|
|
|
158
159
|
Path to the file or directory inside the container.
|
|
159
160
|
options:
|
|
160
161
|
Mount options.
|
|
162
|
+
ignore_notfound:
|
|
163
|
+
Whether to ignore if the path does not exist.
|
|
161
164
|
|
|
162
165
|
Returns:
|
|
163
166
|
The ConfigMount object.
|
|
164
167
|
None if the path does not exist.
|
|
165
168
|
|
|
166
169
|
"""
|
|
167
|
-
if not Path(path).exists():
|
|
170
|
+
if not Path(path).exists() and not ignore_notfound:
|
|
168
171
|
return None
|
|
169
172
|
|
|
170
173
|
if container_path is None:
|
|
@@ -4,13 +4,11 @@ import contextlib
|
|
|
4
4
|
import io
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
|
-
import operator
|
|
8
7
|
import os
|
|
9
8
|
import socket
|
|
10
9
|
import sys
|
|
11
10
|
import tarfile
|
|
12
11
|
from dataclasses import dataclass, field
|
|
13
|
-
from functools import reduce
|
|
14
12
|
from math import ceil
|
|
15
13
|
from pathlib import Path
|
|
16
14
|
from typing import TYPE_CHECKING, Any
|
|
@@ -81,17 +79,6 @@ class DockerWorkloadPlan(WorkloadPlan):
|
|
|
81
79
|
Image used for the pause container.
|
|
82
80
|
unhealthy_restart_image (str):
|
|
83
81
|
Image used for unhealthy restart container.
|
|
84
|
-
resource_key_runtime_env_mapping: (dict[str, str]):
|
|
85
|
-
Mapping from resource names to environment variable names for device allocation,
|
|
86
|
-
which is used to tell the Container Runtime which GPUs to mount into the container.
|
|
87
|
-
For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
|
|
88
|
-
which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
|
|
89
|
-
With privileged mode, the container can access all GPUs even if specified.
|
|
90
|
-
resource_key_backend_env_mapping: (dict[str, list[str]]):
|
|
91
|
-
Mapping from resource names to environment variable names for device runtime,
|
|
92
|
-
which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
|
|
93
|
-
For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
|
|
94
|
-
which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
|
|
95
82
|
namespace (str | None):
|
|
96
83
|
Namespace of the workload.
|
|
97
84
|
name (str):
|
|
@@ -845,7 +832,7 @@ class DockerDeployer(EndoscopicDeployer):
|
|
|
845
832
|
msg = f"Failed to upload ephemeral files to container {container.name}"
|
|
846
833
|
raise OperationError(msg)
|
|
847
834
|
|
|
848
|
-
def _create_containers(
|
|
835
|
+
def _create_containers(
|
|
849
836
|
self,
|
|
850
837
|
workload: DockerWorkloadPlan,
|
|
851
838
|
ephemeral_volume_name_mapping: dict[str, str],
|
|
@@ -955,146 +942,120 @@ class DockerDeployer(EndoscopicDeployer):
|
|
|
955
942
|
envs.GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY.lower()
|
|
956
943
|
== "cdi"
|
|
957
944
|
)
|
|
945
|
+
fmt = "plain" if not cdi else "cdi"
|
|
958
946
|
|
|
959
|
-
r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
|
|
960
|
-
r_k_backend_env = workload.resource_key_backend_env_mapping or {}
|
|
961
|
-
vd_manus, vd_env, vd_cdis, vd_values = (
|
|
962
|
-
self.get_visible_devices_materials()
|
|
963
|
-
)
|
|
964
947
|
for r_k, r_v in c.resources.items():
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
if r_k in r_k_runtime_env:
|
|
983
|
-
# Set env if resource key is mapped.
|
|
984
|
-
runtime_env = [r_k_runtime_env[r_k]]
|
|
985
|
-
elif (
|
|
986
|
-
r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
|
|
987
|
-
):
|
|
988
|
-
# Set env if auto-mapping key is matched.
|
|
989
|
-
runtime_env = list(vd_env.keys())
|
|
990
|
-
else:
|
|
991
|
-
continue
|
|
948
|
+
if r_k == "cpu":
|
|
949
|
+
if isinstance(r_v, int | float):
|
|
950
|
+
create_options["cpu_shares"] = ceil(r_v * 1024)
|
|
951
|
+
elif isinstance(r_v, str) and r_v.isdigit():
|
|
952
|
+
create_options["cpu_shares"] = ceil(float(r_v) * 1024)
|
|
953
|
+
continue
|
|
954
|
+
if r_k == "memory":
|
|
955
|
+
if isinstance(r_v, int):
|
|
956
|
+
create_options["mem_limit"] = r_v
|
|
957
|
+
create_options["mem_reservation"] = r_v
|
|
958
|
+
create_options["memswap_limit"] = r_v
|
|
959
|
+
elif isinstance(r_v, str):
|
|
960
|
+
v = r_v.lower().removesuffix("i")
|
|
961
|
+
create_options["mem_limit"] = v
|
|
962
|
+
create_options["mem_reservation"] = v
|
|
963
|
+
create_options["memswap_limit"] = v
|
|
964
|
+
continue
|
|
992
965
|
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
966
|
+
if (
|
|
967
|
+
r_k
|
|
968
|
+
in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
|
|
969
|
+
):
|
|
970
|
+
# Set env if resource key is mapped.
|
|
971
|
+
runtime_envs = [
|
|
972
|
+
envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
|
|
973
|
+
r_k
|
|
974
|
+
],
|
|
975
|
+
]
|
|
976
|
+
elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
|
|
977
|
+
# Set env if auto-mapping key is matched.
|
|
978
|
+
runtime_envs = self.get_runtime_envs()
|
|
979
|
+
else:
|
|
980
|
+
continue
|
|
1002
981
|
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
982
|
+
privileged = create_options.get("privileged", False)
|
|
983
|
+
resource_values = [x.strip() for x in r_v.split(",")]
|
|
984
|
+
|
|
985
|
+
# Generate CDI config if not yet.
|
|
986
|
+
if cdi and envs.GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE:
|
|
987
|
+
for ren in runtime_envs:
|
|
988
|
+
r_m = self.get_manufacturer(ren)
|
|
989
|
+
cdi_dump_config(
|
|
990
|
+
manufacturer=r_m,
|
|
991
|
+
output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
|
|
992
|
+
)
|
|
993
|
+
|
|
994
|
+
# Request devices.
|
|
995
|
+
if r_v == "all":
|
|
996
|
+
# Configure privileged.
|
|
997
|
+
create_options["privileged"] = True
|
|
998
|
+
# Request all devices.
|
|
999
|
+
for ren in runtime_envs:
|
|
1000
|
+
r_vs = self.get_runtime_visible_devices(ren, fmt)
|
|
1001
|
+
# Request device via CDI.
|
|
1002
|
+
if cdi:
|
|
1003
|
+
if "device_requests" not in create_options:
|
|
1004
|
+
create_options["device_requests"] = []
|
|
1005
|
+
create_options["device_requests"].append(
|
|
1006
|
+
docker.types.DeviceRequest(
|
|
1007
|
+
driver="cdi",
|
|
1008
|
+
count=0,
|
|
1009
|
+
device_ids=r_vs,
|
|
1010
|
+
),
|
|
1011
|
+
)
|
|
1012
|
+
continue
|
|
1013
|
+
# Request device via visible devices env.
|
|
1014
|
+
create_options["environment"][ren] = ",".join(r_vs)
|
|
1015
|
+
else:
|
|
1016
|
+
# Request specific devices.
|
|
1017
|
+
for ren in runtime_envs:
|
|
1018
|
+
# Request all devices if privileged,
|
|
1019
|
+
# otherwise, normalize requested devices.
|
|
1020
|
+
if privileged:
|
|
1021
|
+
r_vs = self.get_runtime_visible_devices(ren, fmt)
|
|
1040
1022
|
else:
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
for re in runtime_env:
|
|
1046
|
-
# Request device via CDI.
|
|
1047
|
-
if cdi:
|
|
1048
|
-
if not privileged:
|
|
1049
|
-
rv = [
|
|
1050
|
-
f"{vd_cdis[re]}={v.strip()}"
|
|
1051
|
-
for v in r_v.split(",")
|
|
1052
|
-
]
|
|
1053
|
-
else:
|
|
1054
|
-
rv = [
|
|
1055
|
-
f"{vd_cdis[re]}={v}"
|
|
1056
|
-
for v in (vd_values.get(re) or ["all"])
|
|
1057
|
-
]
|
|
1058
|
-
if "device_requests" not in create_options:
|
|
1059
|
-
create_options["device_requests"] = []
|
|
1060
|
-
create_options["device_requests"].append(
|
|
1061
|
-
docker.types.DeviceRequest(
|
|
1062
|
-
driver="cdi",
|
|
1063
|
-
count=0,
|
|
1064
|
-
device_ids=rv,
|
|
1065
|
-
),
|
|
1066
|
-
)
|
|
1067
|
-
continue
|
|
1068
|
-
# Request device via visible devices env.
|
|
1069
|
-
if not privileged:
|
|
1070
|
-
rv = str(r_v)
|
|
1071
|
-
else:
|
|
1072
|
-
rv = ",".join(vd_values.get(re) or ["all"])
|
|
1073
|
-
create_options["environment"][re] = rv
|
|
1074
|
-
|
|
1075
|
-
# Configure runtime device access environment variables.
|
|
1076
|
-
if r_v != "all" and privileged:
|
|
1077
|
-
for be in backend_env:
|
|
1078
|
-
create_options["environment"][be] = (
|
|
1079
|
-
self.align_backend_visible_devices_env_values(
|
|
1080
|
-
be,
|
|
1081
|
-
str(r_v),
|
|
1082
|
-
)
|
|
1083
|
-
)
|
|
1084
|
-
|
|
1085
|
-
# Configure affinity if applicable.
|
|
1086
|
-
if (
|
|
1087
|
-
envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
|
|
1088
|
-
or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
|
|
1089
|
-
):
|
|
1090
|
-
cpus, numas = self.get_visible_devices_affinities(
|
|
1091
|
-
runtime_env,
|
|
1092
|
-
r_v,
|
|
1023
|
+
r_vs = self.map_runtime_visible_devices(
|
|
1024
|
+
ren,
|
|
1025
|
+
resource_values,
|
|
1026
|
+
fmt,
|
|
1093
1027
|
)
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
if
|
|
1097
|
-
create_options["
|
|
1028
|
+
# Request device via CDI.
|
|
1029
|
+
if cdi:
|
|
1030
|
+
if "device_requests" not in create_options:
|
|
1031
|
+
create_options["device_requests"] = []
|
|
1032
|
+
create_options["device_requests"].append(
|
|
1033
|
+
docker.types.DeviceRequest(
|
|
1034
|
+
driver="cdi",
|
|
1035
|
+
count=0,
|
|
1036
|
+
device_ids=r_vs,
|
|
1037
|
+
),
|
|
1038
|
+
)
|
|
1039
|
+
continue
|
|
1040
|
+
# Request device via visible devices env.
|
|
1041
|
+
create_options["environment"][ren] = ",".join(r_vs)
|
|
1042
|
+
|
|
1043
|
+
# If not requesting all devices but privileged,
|
|
1044
|
+
# must configure visible devices.
|
|
1045
|
+
if r_v != "all" and privileged:
|
|
1046
|
+
b_vs = self.map_backend_visible_devices(
|
|
1047
|
+
runtime_envs,
|
|
1048
|
+
resource_values,
|
|
1049
|
+
)
|
|
1050
|
+
create_options["environment"].update(b_vs)
|
|
1051
|
+
|
|
1052
|
+
# Configure affinity if applicable.
|
|
1053
|
+
create_options.update(
|
|
1054
|
+
self.map_visible_devices_affinities(
|
|
1055
|
+
runtime_envs,
|
|
1056
|
+
resource_values,
|
|
1057
|
+
),
|
|
1058
|
+
)
|
|
1098
1059
|
|
|
1099
1060
|
# Parameterize mounts.
|
|
1100
1061
|
self._append_container_mounts(
|
|
@@ -323,7 +323,7 @@ def get_device_allocation_policy(
|
|
|
323
323
|
|
|
324
324
|
if manufacturer in [
|
|
325
325
|
ManufacturerEnum.AMD,
|
|
326
|
-
|
|
326
|
+
ManufacturerEnum.ASCEND,
|
|
327
327
|
ManufacturerEnum.HYGON,
|
|
328
328
|
ManufacturerEnum.ILUVATAR,
|
|
329
329
|
ManufacturerEnum.METAX,
|
|
@@ -11,7 +11,7 @@ import grpc
|
|
|
11
11
|
from grpc_interceptor import AsyncServerInterceptor
|
|
12
12
|
from grpc_interceptor.exceptions import GrpcException
|
|
13
13
|
|
|
14
|
-
from ....detector import Device, str_range_to_list
|
|
14
|
+
from ....detector import Device, DeviceMemoryStatusEnum, str_range_to_list
|
|
15
15
|
from ...cdi import (
|
|
16
16
|
generate_config,
|
|
17
17
|
manufacturer_to_cdi_kind,
|
|
@@ -40,6 +40,7 @@ from ..types.kubelet.deviceplugin.v1beta1 import (
|
|
|
40
40
|
RegisterRequest,
|
|
41
41
|
RegistrationStub,
|
|
42
42
|
TopologyInfo,
|
|
43
|
+
Unhealthy,
|
|
43
44
|
Version,
|
|
44
45
|
add_DevicePluginServicer_to_server,
|
|
45
46
|
)
|
|
@@ -159,7 +160,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
|
|
|
159
160
|
self._runtime_env = manufacturer_to_runtime_env(device.manufacturer)
|
|
160
161
|
self._kdp_resource = cdi_kind_to_kdp_resource(
|
|
161
162
|
cdi_kind=self._cdi_kind,
|
|
162
|
-
device_index=device.index,
|
|
163
|
+
device_index=str(device.index),
|
|
163
164
|
)
|
|
164
165
|
|
|
165
166
|
super().__init__(self._kdp_resource)
|
|
@@ -334,12 +335,12 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
|
|
|
334
335
|
The response containing the list of devices.
|
|
335
336
|
|
|
336
337
|
"""
|
|
337
|
-
device_id = (
|
|
338
|
-
self._device.uuid if self._id_by == "uuid" else str(self._device.index)
|
|
339
|
-
)
|
|
340
|
-
|
|
341
338
|
dp_devices: list[DevicePluginDevice] = []
|
|
342
|
-
dp_device_health =
|
|
339
|
+
dp_device_health = (
|
|
340
|
+
Healthy
|
|
341
|
+
if self._device.memory_status == DeviceMemoryStatusEnum.HEALTHY
|
|
342
|
+
else Unhealthy
|
|
343
|
+
)
|
|
343
344
|
dp_device_topo = TopologyInfo(
|
|
344
345
|
nodes=[
|
|
345
346
|
NUMANode(
|
|
@@ -352,7 +353,10 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
|
|
|
352
353
|
)
|
|
353
354
|
|
|
354
355
|
for device_replica in range(1, self._max_allocations + 1):
|
|
355
|
-
dp_device_id = _to_device_plugin_device_id(
|
|
356
|
+
dp_device_id = _to_device_plugin_device_id(
|
|
357
|
+
str(self._device.index),
|
|
358
|
+
device_replica,
|
|
359
|
+
)
|
|
356
360
|
dp_devices.append(
|
|
357
361
|
DevicePluginDevice(
|
|
358
362
|
ID=dp_device_id,
|
|
@@ -419,28 +423,25 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
|
|
|
419
423
|
req: ContainerAllocateRequest,
|
|
420
424
|
) -> ContainerAllocateResponse:
|
|
421
425
|
policy = self._allocation_policy
|
|
422
|
-
|
|
426
|
+
device_id = self._device.uuid
|
|
427
|
+
if self._id_by == "index":
|
|
428
|
+
device_id = str(self._device.index)
|
|
423
429
|
|
|
424
430
|
# CDI device allocation.
|
|
425
431
|
if policy == "cdi":
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
device_id, _ = _from_device_plugin_device_id(dp_device_id)
|
|
429
|
-
cdi_devices.append(
|
|
432
|
+
return ContainerAllocateResponse(
|
|
433
|
+
cdi_devices=[
|
|
430
434
|
CDIDevice(
|
|
431
435
|
name=f"{self._cdi_kind}={device_id}",
|
|
432
436
|
),
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
return ContainerAllocateResponse(
|
|
436
|
-
cdi_devices=cdi_devices,
|
|
437
|
+
],
|
|
437
438
|
)
|
|
438
439
|
|
|
439
440
|
# Environment variable device allocation.
|
|
440
441
|
if policy == "env":
|
|
441
442
|
return ContainerAllocateResponse(
|
|
442
443
|
envs={
|
|
443
|
-
self._runtime_env:
|
|
444
|
+
self._runtime_env: device_id,
|
|
444
445
|
},
|
|
445
446
|
)
|
|
446
447
|
|
|
@@ -509,7 +510,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
|
|
|
509
510
|
@lru_cache
|
|
510
511
|
def cdi_kind_to_kdp_resource(
|
|
511
512
|
cdi_kind: str,
|
|
512
|
-
device_index:
|
|
513
|
+
device_index: str,
|
|
513
514
|
) -> str:
|
|
514
515
|
"""
|
|
515
516
|
Map CDI kind and device index to a Kubernetes Device Plugin resource name.
|