gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +4 -2
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__init__.py +1 -1
  6. gpustack_runtime/deployer/cdi/__types__.py +2 -2
  7. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  8. gpustack_runtime/deployer/cdi/amd.py +6 -8
  9. gpustack_runtime/deployer/cdi/ascend.py +7 -9
  10. gpustack_runtime/deployer/cdi/hygon.py +6 -8
  11. gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
  12. gpustack_runtime/deployer/cdi/metax.py +6 -8
  13. gpustack_runtime/deployer/cdi/thead.py +6 -8
  14. gpustack_runtime/deployer/docker.py +133 -146
  15. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
  16. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
  17. gpustack_runtime/deployer/kuberentes.py +89 -108
  18. gpustack_runtime/deployer/podman.py +113 -120
  19. gpustack_runtime/detector/__init__.py +2 -0
  20. gpustack_runtime/detector/__types__.py +26 -0
  21. gpustack_runtime/detector/__utils__.py +3 -0
  22. gpustack_runtime/detector/amd.py +32 -10
  23. gpustack_runtime/detector/ascend.py +67 -13
  24. gpustack_runtime/detector/cambricon.py +3 -0
  25. gpustack_runtime/detector/hygon.py +22 -3
  26. gpustack_runtime/detector/iluvatar.py +15 -7
  27. gpustack_runtime/detector/metax.py +16 -6
  28. gpustack_runtime/detector/mthreads.py +22 -8
  29. gpustack_runtime/detector/nvidia.py +148 -140
  30. gpustack_runtime/detector/pyacl/__init__.py +34 -14
  31. gpustack_runtime/detector/pydcmi/__init__.py +4 -2
  32. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  33. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  34. gpustack_runtime/detector/thead.py +145 -134
  35. gpustack_runtime/envs.py +7 -6
  36. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
  37. gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
  38. gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
  39. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
  40. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
  41. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0
@@ -82,7 +82,7 @@ def dump_config(
     if cdi_path.exists():
         actual = cdi_path.read_text(encoding="utf-8")
         if actual == expected:
-            return expected, str(cdi_path)
+            return expected, None
 
     cdi_path.write_text(expected, encoding="utf-8")
     return expected, str(cdi_path)
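
Note on the change above: dump_config still returns the rendered CDI spec, but the path component is now None when an identical spec is already on disk, and a path only when the file was actually written. A caller can distinguish the outcomes roughly as follows (a hedged sketch; names such as manu, specs_dir, and logger are illustrative, and the branching mirrors the Docker and Kubernetes callers shown further below):

    # Sketch only: interpreting the (spec, path) pair returned by dump_config.
    spec, path = dump_config(manufacturer=manu, output=specs_dir)
    if spec and path:
        logger.info("Wrote CDI spec to %s", path)          # newly written or updated
    elif spec:
        logger.info("Existing CDI spec is up to date")      # reused; path is None
    else:
        logger.warning("CDI spec generation delegated to external tooling")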
@@ -527,7 +527,7 @@ class Config(dict):
         self,
         kind: str,
         devices: list[ConfigDevice],
-        container_edits: list[ConfigContainerEdits] | None = None,
+        container_edits: ConfigContainerEdits | None = None,
         cdi_version: str = _DEFAULT_CDI_VERSION,
         annotations: dict[str, str] | None = None,
     ):
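
Since container_edits is now a single ConfigContainerEdits rather than a list, callers construct the Config the way the per-vendor generator hunks below do. A minimal hedged sketch (the kind, env value, and device-node list are placeholders):

    # Illustrative values only; the call shape follows the updated generators below.
    config = Config(
        kind="nvidia.com/gpu",
        devices=cdi_devices,
        container_edits=ConfigContainerEdits(
            env=["NVIDIA_VISIBLE_DEVICES=void"],
            device_nodes=common_device_nodes,
        ),
    )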
@@ -581,7 +581,7 @@ class Config(dict):
         return self["kind"]
 
     @property
-    def container_edits(self) -> list[ConfigContainerEdits] | None:
+    def container_edits(self) -> ConfigContainerEdits | None:
         """
         Return the list of container edits in the CDI configuration.
 
@@ -147,6 +147,7 @@ def path_to_cdi_mount(
     path: str,
     container_path: str | None = None,
     options: list[str] | None = None,
+    ignore_notfound: bool = False,
 ) -> ConfigMount | None:
     """
     Convert a file/directory path to a ConfigMount.
@@ -158,13 +159,15 @@ def path_to_cdi_mount(
             Path to the file or directory inside the container.
         options:
             Mount options.
+        ignore_notfound:
+            Whether to ignore if the path does not exist.
 
     Returns:
         The ConfigMount object.
         None if the path does not exist.
 
     """
-    if not Path(path).exists():
+    if not Path(path).exists() and not ignore_notfound:
        return None
 
    if container_path is None:
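
The new ignore_notfound flag lets a generator keep a mount entry even when the host path does not exist at spec-generation time (for example, a path that only appears once the driver is installed). A hedged usage sketch; the path and mount options are illustrative:

    # Sketch only: without ignore_notfound a missing path yields None,
    # with it a ConfigMount is still produced for the given path.
    mount = path_to_cdi_mount(
        "/usr/local/Ascend/driver/lib64",        # host path that may be absent
        container_path="/usr/local/Ascend/driver/lib64",
        options=["ro", "nosuid", "nodev", "bind"],
        ignore_notfound=True,
    )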
@@ -135,12 +135,10 @@ class AMDGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=[
-                ConfigContainerEdits(
-                    env=[
-                        f"{runtime_env}=void",
-                    ],
-                    device_nodes=common_device_nodes,
-                ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )
@@ -152,13 +152,11 @@ class AscendGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=[
-                ConfigContainerEdits(
-                    env=[
-                        f"{runtime_env}=void",
-                    ],
-                    device_nodes=common_device_nodes,
-                    mounts=common_mounts,
-                ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+                mounts=common_mounts,
+            ),
         )
@@ -136,12 +136,10 @@ class HygonGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=[
-                ConfigContainerEdits(
-                    env=[
-                        f"{runtime_env}=void",
-                    ],
-                    device_nodes=common_device_nodes,
-                ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )
@@ -125,12 +125,10 @@ class IluvatarGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=[
-                ConfigContainerEdits(
-                    env=[
-                        f"{runtime_env}=void",
-                    ],
-                    device_nodes=common_device_nodes,
-                ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )
@@ -137,12 +137,10 @@ class MetaXGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=[
-                ConfigContainerEdits(
-                    env=[
-                        f"{runtime_env}=void",
-                    ],
-                    device_nodes=common_device_nodes,
-                ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )
@@ -126,12 +126,10 @@ class THeadGenerator(Generator):
         return Config(
             kind=kind,
             devices=cdi_devices,
-            container_edits=[
-                ConfigContainerEdits(
-                    env=[
-                        f"{runtime_env}=void",
-                    ],
-                    device_nodes=common_device_nodes,
-                ),
-            ],
+            container_edits=ConfigContainerEdits(
+                env=[
+                    f"{runtime_env}=void",
+                ],
+                device_nodes=common_device_nodes,
+            ),
         )
@@ -4,13 +4,11 @@ import contextlib
 import io
 import json
 import logging
-import operator
 import os
 import socket
 import sys
 import tarfile
 from dataclasses import dataclass, field
-from functools import reduce
 from math import ceil
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
@@ -81,17 +79,6 @@ class DockerWorkloadPlan(WorkloadPlan):
             Image used for the pause container.
         unhealthy_restart_image (str):
             Image used for unhealthy restart container.
-        resource_key_runtime_env_mapping: (dict[str, str]):
-            Mapping from resource names to environment variable names for device allocation,
-            which is used to tell the Container Runtime which GPUs to mount into the container.
-            For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
-            which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
-            With privileged mode, the container can access all GPUs even if specified.
-        resource_key_backend_env_mapping: (dict[str, list[str]]):
-            Mapping from resource names to environment variable names for device runtime,
-            which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
-            For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
-            which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
         namespace (str | None):
             Namespace of the workload.
         name (str):
@@ -845,7 +832,7 @@ class DockerDeployer(EndoscopicDeployer):
             msg = f"Failed to upload ephemeral files to container {container.name}"
             raise OperationError(msg)
 
-    def _create_containers(  # noqa: C901
+    def _create_containers(
        self,
        workload: DockerWorkloadPlan,
        ephemeral_volume_name_mapping: dict[str, str],
@@ -955,146 +942,146 @@ class DockerDeployer(EndoscopicDeployer):
                 envs.GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY.lower()
                 == "cdi"
             )
+            fmt = "plain" if not cdi else "cdi"
 
-            r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
-            r_k_backend_env = workload.resource_key_backend_env_mapping or {}
-            vd_manus, vd_env, vd_cdis, vd_values = (
-                self.get_visible_devices_materials()
-            )
             for r_k, r_v in c.resources.items():
-                match r_k:
-                    case "cpu":
-                        if isinstance(r_v, int | float):
-                            create_options["cpu_shares"] = ceil(r_v * 1024)
-                        elif isinstance(r_v, str) and r_v.isdigit():
-                            create_options["cpu_shares"] = ceil(float(r_v) * 1024)
-                    case "memory":
-                        if isinstance(r_v, int):
-                            create_options["mem_limit"] = r_v
-                            create_options["mem_reservation"] = r_v
-                            create_options["memswap_limit"] = r_v
-                        elif isinstance(r_v, str):
-                            v = r_v.lower().removesuffix("i")
-                            create_options["mem_limit"] = v
-                            create_options["mem_reservation"] = v
-                            create_options["memswap_limit"] = v
-                    case _:
-                        if r_k in r_k_runtime_env:
-                            # Set env if resource key is mapped.
-                            runtime_env = [r_k_runtime_env[r_k]]
-                        elif (
-                            r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
-                        ):
-                            # Set env if auto-mapping key is matched.
-                            runtime_env = list(vd_env.keys())
-                        else:
-                            continue
-
-                        if r_k in r_k_backend_env:
-                            # Set env if resource key is mapped.
-                            backend_env = r_k_backend_env[r_k]
-                        else:
-                            # Otherwise, use the default backend env names.
-                            backend_env = reduce(
-                                operator.add,
-                                list(vd_env.values()),
-                            )
+                if r_k == "cpu":
+                    if isinstance(r_v, int | float):
+                        create_options["cpu_shares"] = ceil(r_v * 1024)
+                    elif isinstance(r_v, str) and r_v.isdigit():
+                        create_options["cpu_shares"] = ceil(float(r_v) * 1024)
+                    continue
+                if r_k == "memory":
+                    if isinstance(r_v, int):
+                        create_options["mem_limit"] = r_v
+                        create_options["mem_reservation"] = r_v
+                        create_options["memswap_limit"] = r_v
+                    elif isinstance(r_v, str):
+                        v = r_v.lower().removesuffix("i")
+                        create_options["mem_limit"] = v
+                        create_options["mem_reservation"] = v
+                        create_options["memswap_limit"] = v
+                    continue
 
-                        privileged = create_options.get("privileged", False)
+                if (
+                    r_k
+                    in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
+                ):
+                    # Set env if resource key is mapped.
+                    runtime_envs = [
+                        envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
+                            r_k
+                        ],
+                    ]
+                elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
+                    # Set env if auto-mapping key is matched.
+                    runtime_envs = self.get_runtime_envs()
+                else:
+                    continue
 
-                        # Generate CDI config if not yet.
-                        if cdi and envs.GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE:
-                            for re in runtime_env:
-                                cdi_dump_config(
-                                    manufacturer=vd_manus[re],
-                                    output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+                privileged = create_options.get("privileged", False)
+                resource_values = [x.strip() for x in r_v.split(",")]
+
+                # Generate CDI config if not yet.
+                if cdi and envs.GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE:
+                    for ren in runtime_envs:
+                        manu = self.get_manufacturer(ren)
+                        cdi_config, cdi_config_path = cdi_dump_config(
+                            manufacturer=manu,
+                            output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
+                        )
+                        if cdi_config and cdi_config_path:
+                            if logger.isEnabledFor(logging.DEBUG):
+                                logger.debug(
+                                    "Generated CDI configuration for '%s' at '%s':\n%s",
+                                    manu,
+                                    cdi_config_path,
+                                    cdi_config,
                                 )
-
-                        # Configure device access environment variable.
-                        if r_v == "all" and backend_env:
-                            # Configure privileged if requested all devices.
-                            create_options["privileged"] = True
-                            # Then, set container backend visible devices env to all devices,
-                            # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                            # and mount corresponding libs if needed.
-                            for re in runtime_env:
-                                # Request device via CDI.
-                                if cdi:
-                                    rv = [
-                                        f"{vd_cdis[re]}={v}"
-                                        for v in (vd_values.get(re) or ["all"])
-                                    ]
-                                    if "device_requests" not in create_options:
-                                        create_options["device_requests"] = []
-                                    create_options["device_requests"].append(
-                                        docker.types.DeviceRequest(
-                                            driver="cdi",
-                                            count=0,
-                                            device_ids=rv,
-                                        ),
-                                    )
-                                    continue
-                                # Request device via visible devices env.
-                                rv = ",".join(vd_values.get(re) or ["all"])
-                                create_options["environment"][re] = rv
-                        else:
-                            # Set env to the allocated device IDs if no privileged,
-                            # otherwise, set container backend visible devices env to all devices,
-                            # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
-                            # and mount corresponding libs if needed.
-                            for re in runtime_env:
-                                # Request device via CDI.
-                                if cdi:
-                                    if not privileged:
-                                        rv = [
-                                            f"{vd_cdis[re]}={v.strip()}"
-                                            for v in r_v.split(",")
-                                        ]
-                                    else:
-                                        rv = [
-                                            f"{vd_cdis[re]}={v}"
-                                            for v in (vd_values.get(re) or ["all"])
-                                        ]
-                                    if "device_requests" not in create_options:
-                                        create_options["device_requests"] = []
-                                    create_options["device_requests"].append(
-                                        docker.types.DeviceRequest(
-                                            driver="cdi",
-                                            count=0,
-                                            device_ids=rv,
-                                        ),
-                                    )
-                                    continue
-                                # Request device via visible devices env.
-                                if not privileged:
-                                    rv = str(r_v)
-                                else:
-                                    rv = ",".join(vd_values.get(re) or ["all"])
-                                create_options["environment"][re] = rv
-
-                        # Configure runtime device access environment variables.
-                        if r_v != "all" and privileged:
-                            for be in backend_env:
-                                create_options["environment"][be] = (
-                                    self.align_backend_visible_devices_env_values(
-                                        be,
-                                        str(r_v),
-                                    )
+                            else:
+                                logger.info(
+                                    "Generated CDI configuration for '%s' at '%s'",
+                                    manu,
+                                    cdi_config_path,
                                 )
+                        elif cdi_config:
+                            logger.info(
+                                "Reuse generated CDI configuration for '%s'",
+                                manu,
+                            )
+                        else:
+                            logger.warning(
+                                "Delegated CDI configuration by other tools for '%s', "
+                                "e.g. for NVIDIA devices, please follow NVIDIA Container Toolkit Manual CDI Specification Generation to generate the CDI configuration, "
+                                "see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html#manual-cdi-specification-generation",
+                                manu,
+                            )
 
-                        # Configure affinity if applicable.
-                        if (
-                            envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
-                            or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
-                        ):
-                            cpus, numas = self.get_visible_devices_affinities(
-                                runtime_env,
-                                r_v,
+                # Request devices.
+                if r_v == "all":
+                    # Configure privileged.
+                    create_options["privileged"] = True
+                    # Request all devices.
+                    for ren in runtime_envs:
+                        r_vs = self.get_runtime_visible_devices(ren, fmt)
+                        # Request device via CDI.
+                        if cdi:
+                            if "device_requests" not in create_options:
+                                create_options["device_requests"] = []
+                            create_options["device_requests"].append(
+                                docker.types.DeviceRequest(
+                                    driver="cdi",
+                                    count=0,
+                                    device_ids=r_vs,
+                                ),
+                            )
+                            continue
+                        # Request device via visible devices env.
+                        create_options["environment"][ren] = ",".join(r_vs)
+                else:
+                    # Request specific devices.
+                    for ren in runtime_envs:
+                        # Request all devices if privileged,
+                        # otherwise, normalize requested devices.
+                        if privileged:
+                            r_vs = self.get_runtime_visible_devices(ren, fmt)
+                        else:
+                            r_vs = self.map_runtime_visible_devices(
+                                ren,
+                                resource_values,
+                                fmt,
                             )
-                            if cpus:
-                                create_options["cpuset_cpus"] = cpus
-                            if numas and envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
-                                create_options["cpuset_mems"] = numas
+                        # Request device via CDI.
+                        if cdi:
+                            if "device_requests" not in create_options:
+                                create_options["device_requests"] = []
+                            create_options["device_requests"].append(
+                                docker.types.DeviceRequest(
+                                    driver="cdi",
+                                    count=0,
+                                    device_ids=r_vs,
+                                ),
+                            )
+                            continue
+                        # Request device via visible devices env.
+                        create_options["environment"][ren] = ",".join(r_vs)
+
+                # If not requesting all devices but privileged,
+                # must configure visible devices.
+                if r_v != "all" and privileged:
+                    b_vs = self.map_backend_visible_devices(
+                        runtime_envs,
+                        resource_values,
+                    )
+                    create_options["environment"].update(b_vs)
+
+                # Configure affinity if applicable.
+                create_options.update(
+                    self.map_visible_devices_affinities(
+                        runtime_envs,
+                        resource_values,
+                    ),
+                )
 
 
             self._append_container_mounts(
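
For reference, the device_requests entries assembled above use Docker's CDI request form: driver "cdi", count 0, and fully qualified CDI device names as device_ids. A standalone hedged sketch with docker-py (image name and device names are examples; requires a Docker daemon with CDI support enabled):

    # Sketch only: roughly what the create_options above translate to.
    import docker

    client = docker.from_env()
    container = client.containers.create(
        "ubuntu:24.04",
        device_requests=[
            docker.types.DeviceRequest(
                driver="cdi",
                count=0,
                device_ids=["nvidia.com/gpu=0", "nvidia.com/gpu=1"],
            ),
        ],
    )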
@@ -78,28 +78,33 @@ async def serve_async(
         allocation_policy == "cdi"
         and envs.GPUSTACK_RUNTIME_KUBERNETES_KDP_CDI_SPECS_GENERATE
     ):
-        generated_content, generated_path = cdi_dump_config(
+        cdi_config, cdi_config_path = cdi_dump_config(
             manufacturer=manu,
             output=cdi_generation_output,
         )
-        if generated_content:
+        if cdi_config and cdi_config_path:
             if logger.isEnabledFor(logging.DEBUG):
                 logger.debug(
                     "Generated CDI configuration for '%s' at '%s':\n%s",
                     manu,
-                    generated_path,
-                    generated_content,
+                    cdi_config_path,
+                    cdi_config,
                 )
             else:
                 logger.info(
                     "Generated CDI configuration for '%s' at '%s'",
                     manu,
-                    generated_path,
+                    cdi_config_path,
                 )
+        elif cdi_config:
+            logger.info(
+                "Reuse generated CDI configuration for '%s'",
+                manu,
+            )
         else:
             logger.warning(
-                "Delegated CDI configuration by other tools for manufacturer '%s', "
-                "e.g. NVIDIA Container Toolkit Manual CDI Specification Generation, "
+                "Delegated CDI configuration by other tools for '%s', "
+                "e.g. for NVIDIA devices, please follow NVIDIA Container Toolkit Manual CDI Specification Generation to generate the CDI configuration, "
                 "see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html#manual-cdi-specification-generation",
                 manu,
             )
@@ -323,7 +328,7 @@ def get_device_allocation_policy(
 
     if manufacturer in [
         ManufacturerEnum.AMD,
-        # ManufacturerEnum.ASCEND,  # Prioritize using Env policy for Ascend.
+        ManufacturerEnum.ASCEND,
         ManufacturerEnum.HYGON,
         ManufacturerEnum.ILUVATAR,
         ManufacturerEnum.METAX,
@@ -11,7 +11,8 @@ import grpc
 from grpc_interceptor import AsyncServerInterceptor
 from grpc_interceptor.exceptions import GrpcException
 
-from ....detector import Device, str_range_to_list
+from ....detector import Device, DeviceMemoryStatusEnum, str_range_to_list
+from ....detector.__utils__ import get_numa_node_size
 from ...cdi import (
     generate_config,
     manufacturer_to_cdi_kind,
@@ -40,6 +41,7 @@ from ..types.kubelet.deviceplugin.v1beta1 import (
     RegisterRequest,
     RegistrationStub,
     TopologyInfo,
+    Unhealthy,
     Version,
     add_DevicePluginServicer_to_server,
 )
@@ -159,7 +161,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         self._runtime_env = manufacturer_to_runtime_env(device.manufacturer)
         self._kdp_resource = cdi_kind_to_kdp_resource(
             cdi_kind=self._cdi_kind,
-            device_index=device.index,
+            device_index=str(device.index),
         )
 
         super().__init__(self._kdp_resource)
@@ -334,25 +336,31 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
             The response containing the list of devices.
 
         """
-        device_id = (
-            self._device.uuid if self._id_by == "uuid" else str(self._device.index)
-        )
-
         dp_devices: list[DevicePluginDevice] = []
-        dp_device_health = Healthy
+        dp_device_health = (
+            Healthy
+            if self._device.memory_status == DeviceMemoryStatusEnum.HEALTHY
+            else Unhealthy
+        )
         dp_device_topo = TopologyInfo(
             nodes=[
                 NUMANode(
                     ID=node_id,
                 )
-                for node_id in str_range_to_list(
-                    self._device.appendix.get("numa", "0"),
+                for node_id in (
+                    str_range_to_list(
+                        self._device.appendix.get("numa", ""),
+                    )
+                    or list(range(get_numa_node_size()))
                 )
             ],
         )
 
         for device_replica in range(1, self._max_allocations + 1):
-            dp_device_id = _to_device_plugin_device_id(device_id, device_replica)
+            dp_device_id = _to_device_plugin_device_id(
+                str(self._device.index),
+                device_replica,
+            )
             dp_devices.append(
                 DevicePluginDevice(
                     ID=dp_device_id,
@@ -419,28 +427,25 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
         req: ContainerAllocateRequest,
     ) -> ContainerAllocateResponse:
         policy = self._allocation_policy
-        request_dp_device_ids = req.devices_ids
+        device_id = self._device.uuid
+        if self._id_by == "index":
+            device_id = str(self._device.index)
 
         # CDI device allocation.
         if policy == "cdi":
-            cdi_devices: list[CDIDevice] = []
-            for dp_device_id in request_dp_device_ids:
-                device_id, _ = _from_device_plugin_device_id(dp_device_id)
-                cdi_devices.append(
+            return ContainerAllocateResponse(
+                cdi_devices=[
                     CDIDevice(
                         name=f"{self._cdi_kind}={device_id}",
                     ),
-                )
-
-            return ContainerAllocateResponse(
-                cdi_devices=cdi_devices,
+                ],
             )
 
         # Environment variable device allocation.
         if policy == "env":
             return ContainerAllocateResponse(
                 envs={
-                    self._runtime_env: ",".join(request_dp_device_ids),
+                    self._runtime_env: device_id,
                 },
             )
 
@@ -509,7 +514,7 @@ class SharableDevicePlugin(PluginServer, DevicePluginServicer):
 @lru_cache
 def cdi_kind_to_kdp_resource(
     cdi_kind: str,
-    device_index: int,
+    device_index: str,
 ) -> str:
     """
     Map CDI kind and device index to a Kubernetes Device Plugin resource name.