gpustack-runtime 0.1.41.post2__py3-none-any.whl → 0.1.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +3 -1
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  6. gpustack_runtime/deployer/docker.py +109 -148
  7. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +21 -3
  8. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
  9. gpustack_runtime/deployer/kuberentes.py +91 -126
  10. gpustack_runtime/deployer/podman.py +89 -122
  11. gpustack_runtime/detector/__init__.py +2 -0
  12. gpustack_runtime/detector/__types__.py +26 -0
  13. gpustack_runtime/detector/amd.py +28 -8
  14. gpustack_runtime/detector/ascend.py +49 -4
  15. gpustack_runtime/detector/cambricon.py +3 -0
  16. gpustack_runtime/detector/hygon.py +16 -1
  17. gpustack_runtime/detector/iluvatar.py +6 -0
  18. gpustack_runtime/detector/metax.py +8 -0
  19. gpustack_runtime/detector/mthreads.py +11 -0
  20. gpustack_runtime/detector/nvidia.py +139 -134
  21. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  22. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  23. gpustack_runtime/detector/thead.py +135 -127
  24. gpustack_runtime/envs.py +7 -6
  25. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
  26. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
  27. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
  28. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
  29. {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/deployer/cdi/__utils__.py
@@ -147,6 +147,7 @@ def path_to_cdi_mount(
     path: str,
     container_path: str | None = None,
     options: list[str] | None = None,
+    ignore_notfound: bool = False,
 ) -> ConfigMount | None:
     """
     Convert a file/directory path to a ConfigMount.
@@ -158,13 +159,15 @@ def path_to_cdi_mount(
             Path to the file or directory inside the container.
         options:
             Mount options.
+        ignore_notfound:
+            Whether to ignore if the path does not exist.

     Returns:
         The ConfigMount object.
         None if the path does not exist.

     """
-    if not Path(path).exists():
+    if not Path(path).exists() and not ignore_notfound:
         return None

     if container_path is None:
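Note (not part of the diff): a minimal usage sketch of the new flag; the import path and the device path below are assumptions for illustration only.

    # Hypothetical caller of the patched helper above (assumed module path).
    from gpustack_runtime.deployer.cdi.__utils__ import path_to_cdi_mount

    # Default behaviour: returns None when the host path does not exist.
    mount = path_to_cdi_mount("/dev/kfd")

    # With ignore_notfound=True the existence check is skipped, so a ConfigMount
    # is still built even though the host path is missing.
    mount = path_to_cdi_mount("/dev/kfd", ignore_notfound=True)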
gpustack_runtime/deployer/docker.py
@@ -4,13 +4,11 @@ import contextlib
 import io
 import json
 import logging
-import operator
 import os
 import socket
 import sys
 import tarfile
 from dataclasses import dataclass, field
-from functools import reduce
 from math import ceil
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
@@ -81,17 +79,6 @@ class DockerWorkloadPlan(WorkloadPlan):
             Image used for the pause container.
         unhealthy_restart_image (str):
             Image used for unhealthy restart container.
-        resource_key_runtime_env_mapping: (dict[str, str]):
-            Mapping from resource names to environment variable names for device allocation,
-            which is used to tell the Container Runtime which GPUs to mount into the container.
-            For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
-            which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
-            With privileged mode, the container can access all GPUs even if specified.
-        resource_key_backend_env_mapping: (dict[str, list[str]]):
-            Mapping from resource names to environment variable names for device runtime,
-            which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
-            For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
-            which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
         namespace (str | None):
             Namespace of the workload.
         name (str):
@@ -845,7 +832,7 @@ class DockerDeployer(EndoscopicDeployer):
             msg = f"Failed to upload ephemeral files to container {container.name}"
             raise OperationError(msg)

-    def _create_containers(  # noqa: C901
+    def _create_containers(
         self,
         workload: DockerWorkloadPlan,
         ephemeral_volume_name_mapping: dict[str, str],
@@ -955,146 +942,120 @@ class DockerDeployer(EndoscopicDeployer):
                 envs.GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY.lower()
                 == "cdi"
             )
+            fmt = "plain" if not cdi else "cdi"

-            r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
-            r_k_backend_env = workload.resource_key_backend_env_mapping or {}
-            vd_manus, vd_env, vd_cdis, vd_values = (
-                self.get_visible_devices_materials()
-            )
             for r_k, r_v in c.resources.items():
-                match r_k:
-                    case "cpu":
-                        if isinstance(r_v, int | float):
-                            create_options["cpu_shares"] = ceil(r_v * 1024)
-                        elif isinstance(r_v, str) and r_v.isdigit():
-                            create_options["cpu_shares"] = ceil(float(r_v) * 1024)
-                    case "memory":
-                        if isinstance(r_v, int):
-                            create_options["mem_limit"] = r_v
-                            create_options["mem_reservation"] = r_v
-                            create_options["memswap_limit"] = r_v
-                        elif isinstance(r_v, str):
-                            v = r_v.lower().removesuffix("i")
-                            create_options["mem_limit"] = v
-                            create_options["mem_reservation"] = v
-                            create_options["memswap_limit"] = v
-                    case _:
-                        if r_k in r_k_runtime_env:
-                            # Set env if resource key is mapped.
-                            runtime_env = [r_k_runtime_env[r_k]]
-                        elif (
-                            r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
-                        ):
-                            # Set env if auto-mapping key is matched.
-                            runtime_env = list(vd_env.keys())
-                        else:
-                            continue
+                if r_k == "cpu":
+                    if isinstance(r_v, int | float):
+                        create_options["cpu_shares"] = ceil(r_v * 1024)
+                    elif isinstance(r_v, str) and r_v.isdigit():
+                        create_options["cpu_shares"] = ceil(float(r_v) * 1024)
+                    continue
+                if r_k == "memory":
+                    if isinstance(r_v, int):
+                        create_options["mem_limit"] = r_v
+                        create_options["mem_reservation"] = r_v
+                        create_options["memswap_limit"] = r_v
+                    elif isinstance(r_v, str):
+                        v = r_v.lower().removesuffix("i")
+                        create_options["mem_limit"] = v
+                        create_options["mem_reservation"] = v
+                        create_options["memswap_limit"] = v
+                    continue

-                        if r_k in r_k_backend_env:
-                            # Set env if resource key is mapped.
-                            backend_env = r_k_backend_env[r_k]
-                        else:
-                            # Otherwise, use the default backend env names.
-                            backend_env = reduce(
-                                operator.add,
-                                list(vd_env.values()),
-                            )
+                if (
+                    r_k
+                    in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+                ):
+                    # Set env if resource key is mapped.
+                    runtime_envs = [
+                        envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+                            r_k
+                        ],
+                    ]
+                elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+                    # Set env if auto-mapping key is matched.
+                    runtime_envs = self.get_runtime_envs()
+                else:
+                    continue

-                        privileged = create_options.get("privileged", False)
-
-                        # Generate CDI config if not yet.
-                        if cdi and envs.GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE:
-                            for re in runtime_env:
-                                cdi_dump_config(
-                                    manufacturer=vd_manus[re],
-                                    output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
-                                )
-
-                        # Configure device access environment variable.
-                        if r_v == "all" and backend_env:
-                            # Configure privileged if requested all devices.
-                            create_options["privileged"] = True
-                            # Then, set container backend visible devices env to all devices,
-                            # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                            # and mount corresponding libs if needed.
-                            for re in runtime_env:
-                                # Request device via CDI.
-                                if cdi:
-                                    rv = [
-                                        f"{vd_cdis[re]}={v}"
-                                        for v in (vd_values.get(re) or ["all"])
-                                    ]
-                                    if "device_requests" not in create_options:
-                                        create_options["device_requests"] = []
-                                    create_options["device_requests"].append(
-                                        docker.types.DeviceRequest(
-                                            driver="cdi",
-                                            count=0,
-                                            device_ids=rv,
-                                        ),
-                                    )
-                                    continue
-                                # Request device via visible devices env.
-                                rv = ",".join(vd_values.get(re) or ["all"])
-                                create_options["environment"][re] = rv
+                privileged = create_options.get("privileged", False)
+                resource_values = [x.strip() for x in r_v.split(",")]
+
+                # Generate CDI config if not yet.
+                if cdi and envs.GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE:
+                    for ren in runtime_envs:
+                        r_m = self.get_manufacturer(ren)
+                        cdi_dump_config(
+                            manufacturer=r_m,
+                            output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+                        )
+
+                # Request devices.
+                if r_v == "all":
+                    # Configure privileged.
+                    create_options["privileged"] = True
+                    # Request all devices.
+                    for ren in runtime_envs:
+                        r_vs = self.get_runtime_visible_devices(ren, fmt)
+                        # Request device via CDI.
+                        if cdi:
+                            if "device_requests" not in create_options:
+                                create_options["device_requests"] = []
+                            create_options["device_requests"].append(
+                                docker.types.DeviceRequest(
+                                    driver="cdi",
+                                    count=0,
+                                    device_ids=r_vs,
+                                ),
+                            )
+                            continue
+                        # Request device via visible devices env.
+                        create_options["environment"][ren] = ",".join(r_vs)
+                else:
+                    # Request specific devices.
+                    for ren in runtime_envs:
+                        # Request all devices if privileged,
+                        # otherwise, normalize requested devices.
+                        if privileged:
+                            r_vs = self.get_runtime_visible_devices(ren, fmt)
                         else:
-                            # Set env to the allocated device IDs if no privileged,
-                            # otherwise, set container backend visible devices env to all devices,
-                            # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                            # and mount corresponding libs if needed.
-                            for re in runtime_env:
-                                # Request device via CDI.
-                                if cdi:
-                                    if not privileged:
-                                        rv = [
-                                            f"{vd_cdis[re]}={v.strip()}"
-                                            for v in r_v.split(",")
-                                        ]
-                                    else:
-                                        rv = [
-                                            f"{vd_cdis[re]}={v}"
-                                            for v in (vd_values.get(re) or ["all"])
-                                        ]
-                                    if "device_requests" not in create_options:
-                                        create_options["device_requests"] = []
-                                    create_options["device_requests"].append(
-                                        docker.types.DeviceRequest(
-                                            driver="cdi",
-                                            count=0,
-                                            device_ids=rv,
-                                        ),
-                                    )
-                                    continue
-                                # Request device via visible devices env.
-                                if not privileged:
-                                    rv = str(r_v)
-                                else:
-                                    rv = ",".join(vd_values.get(re) or ["all"])
-                                create_options["environment"][re] = rv
-
-                        # Configure runtime device access environment variables.
-                        if r_v != "all" and privileged:
-                            for be in backend_env:
-                                create_options["environment"][be] = (
-                                    self.align_backend_visible_devices_env_values(
-                                        be,
-                                        str(r_v),
-                                    )
-                                )
-
-                        # Configure affinity if applicable.
-                        if (
-                            envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
-                            or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
-                        ):
-                            cpus, numas = self.get_visible_devices_affinities(
-                                runtime_env,
-                                r_v,
+                            r_vs = self.map_runtime_visible_devices(
+                                ren,
+                                resource_values,
+                                fmt,
                             )
-                            if cpus:
-                                create_options["cpuset_cpus"] = cpus
-                            if numas and envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
-                                create_options["cpuset_mems"] = numas
+                        # Request device via CDI.
+                        if cdi:
+                            if "device_requests" not in create_options:
+                                create_options["device_requests"] = []
+                            create_options["device_requests"].append(
+                                docker.types.DeviceRequest(
+                                    driver="cdi",
+                                    count=0,
+                                    device_ids=r_vs,
+                                ),
+                            )
+                            continue
+                        # Request device via visible devices env.
+                        create_options["environment"][ren] = ",".join(r_vs)
+
+                # If not requesting all devices but privileged,
+                # must configure visible devices.
+                if r_v != "all" and privileged:
+                    b_vs = self.map_backend_visible_devices(
+                        runtime_envs,
+                        resource_values,
+                    )
+                    create_options["environment"].update(b_vs)
+
+                # Configure affinity if applicable.
+                create_options.update(
+                    self.map_visible_devices_affinities(
+                        runtime_envs,
+                        resource_values,
+                    ),
+                )

             # Parameterize mounts.
             self._append_container_mounts(
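Note (not part of the diff): the rewritten branch above keeps two device-injection routes. The sketch below shows both with docker-py; the CDI device names, the NVIDIA_VISIBLE_DEVICES variable, and the image/runtime are illustrative assumptions, not values the package hard-codes here.

    # Minimal sketch of the two injection routes (docker-py), with example values.
    import docker

    client = docker.from_env()

    # Route 1: CDI - a DeviceRequest with driver="cdi" carrying
    # "<cdi-kind>=<device>" identifiers.
    cdi_request = docker.types.DeviceRequest(
        driver="cdi",
        count=0,
        device_ids=["nvidia.com/gpu=0", "nvidia.com/gpu=1"],
    )
    client.containers.run(
        "ubuntu:24.04",
        "nvidia-smi",
        device_requests=[cdi_request],
        detach=True,
    )

    # Route 2: a visible-devices environment variable, left for the container
    # toolkit to resolve (example variable name and values).
    client.containers.run(
        "ubuntu:24.04",
        "nvidia-smi",
        environment={"NVIDIA_VISIBLE_DEVICES": "0,1"},
        runtime="nvidia",  # assumption: NVIDIA Container Runtime is configured
        detach=True,
    )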
gpustack_runtime/deployer/k8s/deviceplugin/__init__.py
@@ -65,7 +65,7 @@ async def serve_async(
         if not devices:
             continue

-        allocation_policy = _get_device_allocation_policy(manu)
+        allocation_policy = get_device_allocation_policy(manu)
         logger.info(
             "Using device allocation policy '%s' for manufacturer '%s'",
             allocation_policy,
@@ -277,7 +277,23 @@ def is_kubelet_socket_accessible(


 @lru_cache
-def _get_device_allocation_policy(
+def get_resource_injection_policy() -> Literal["env", "kdp"]:
+    """
+    Get the resource injection policy (in lowercase) for the deployer.
+
+    Returns:
+        The resource injection policy.
+
+    """
+    policy = envs.GPUSTACK_RUNTIME_KUBERNETES_RESOURCE_INJECTION_POLICY.lower()
+    if policy != "auto":
+        return policy
+
+    return "kdp" if is_kubelet_socket_accessible() else "env"
+
+
+@lru_cache
+def get_device_allocation_policy(
     manufacturer: ManufacturerEnum,
 ) -> Literal["env", "cdi", "opaque"]:
     """
@@ -307,7 +323,7 @@ def _get_device_allocation_policy(

     if manufacturer in [
         ManufacturerEnum.AMD,
-        # ManufacturerEnum.ASCEND,  # Prioritize using Env policy for Ascend.
+        ManufacturerEnum.ASCEND,
         ManufacturerEnum.HYGON,
         ManufacturerEnum.ILUVATAR,
         ManufacturerEnum.METAX,
@@ -320,6 +336,8 @@ def _get_device_allocation_policy(

 __all__ = [
     "cdi_kind_to_kdp_resource",
+    "get_device_allocation_policy",
+    "get_resource_injection_policy",
     "is_kubelet_socket_accessible",
     "serve",
     "serve_async",
gpustack_runtime/deployer/k8s/deviceplugin/plugin.py
@@ -11,7 +11,7 @@ import grpc
 from grpc_interceptor import AsyncServerInterceptor
 from grpc_interceptor.exceptions import GrpcException

-from ....detector import Device, str_range_to_list
+from ....detector import Device, DeviceMemoryStatusEnum, str_range_to_list
 from ...cdi import (
     generate_config,
     manufacturer_to_cdi_kind,
@@ -40,6 +40,7 @@ from ..types.kubelet.deviceplugin.v1beta1 import (
     RegisterRequest,
     RegistrationStub,
     TopologyInfo,
+    Unhealthy,
     Version,
     add_DevicePluginServicer_to_server,
 )
@@ -159,7 +160,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         self._runtime_env = manufacturer_to_runtime_env(device.manufacturer)
         self._kdp_resource = cdi_kind_to_kdp_resource(
             cdi_kind=self._cdi_kind,
-            device_index=device.index,
+            device_index=str(device.index),
         )

         super().__init__(self._kdp_resource)
@@ -334,12 +335,12 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
             The response containing the list of devices.

         """
-        device_id = (
-            self._device.uuid if self._id_by == "uuid" else str(self._device.index)
-        )
-
         dp_devices: list[DevicePluginDevice] = []
-        dp_device_health = Healthy
+        dp_device_health = (
+            Healthy
+            if self._device.memory_status == DeviceMemoryStatusEnum.HEALTHY
+            else Unhealthy
+        )
         dp_device_topo = TopologyInfo(
             nodes=[
                 NUMANode(
@@ -352,7 +353,10 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         )

         for device_replica in range(1, self._max_allocations + 1):
-            dp_device_id = _to_device_plugin_device_id(device_id, device_replica)
+            dp_device_id = _to_device_plugin_device_id(
+                str(self._device.index),
+                device_replica,
+            )
             dp_devices.append(
                 DevicePluginDevice(
                     ID=dp_device_id,
@@ -419,28 +423,25 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         req: ContainerAllocateRequest,
     ) -> ContainerAllocateResponse:
         policy = self._allocation_policy
-        request_dp_device_ids = req.devices_ids
+        device_id = self._device.uuid
+        if self._id_by == "index":
+            device_id = str(self._device.index)

         # CDI device allocation.
         if policy == "cdi":
-            cdi_devices: list[CDIDevice] = []
-            for dp_device_id in request_dp_device_ids:
-                device_id, _ = _from_device_plugin_device_id(dp_device_id)
-                cdi_devices.append(
+            return ContainerAllocateResponse(
+                cdi_devices=[
                     CDIDevice(
                         name=f"{self._cdi_kind}={device_id}",
                     ),
-                )
-
-            return ContainerAllocateResponse(
-                cdi_devices=cdi_devices,
+                ],
             )

         # Environment variable device allocation.
         if policy == "env":
             return ContainerAllocateResponse(
                 envs={
-                    self._runtime_env: ",".join(request_dp_device_ids),
+                    self._runtime_env: device_id,
                 },
             )

@@ -509,7 +510,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
 @lru_cache
 def cdi_kind_to_kdp_resource(
     cdi_kind: str,
-    device_index: int,
+    device_index: str,
 ) -> str:
     """
     Map CDI kind and device index to a Kubernetes Device Plugin resource name.