gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +4 -2
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__init__.py +1 -1
  6. gpustack_runtime/deployer/cdi/__types__.py +2 -2
  7. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  8. gpustack_runtime/deployer/cdi/amd.py +6 -8
  9. gpustack_runtime/deployer/cdi/ascend.py +7 -9
  10. gpustack_runtime/deployer/cdi/hygon.py +6 -8
  11. gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
  12. gpustack_runtime/deployer/cdi/metax.py +6 -8
  13. gpustack_runtime/deployer/cdi/thead.py +6 -8
  14. gpustack_runtime/deployer/docker.py +133 -146
  15. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
  16. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
  17. gpustack_runtime/deployer/kuberentes.py +89 -108
  18. gpustack_runtime/deployer/podman.py +113 -120
  19. gpustack_runtime/detector/__init__.py +2 -0
  20. gpustack_runtime/detector/__types__.py +26 -0
  21. gpustack_runtime/detector/__utils__.py +3 -0
  22. gpustack_runtime/detector/amd.py +32 -10
  23. gpustack_runtime/detector/ascend.py +67 -13
  24. gpustack_runtime/detector/cambricon.py +3 -0
  25. gpustack_runtime/detector/hygon.py +22 -3
  26. gpustack_runtime/detector/iluvatar.py +15 -7
  27. gpustack_runtime/detector/metax.py +16 -6
  28. gpustack_runtime/detector/mthreads.py +22 -8
  29. gpustack_runtime/detector/nvidia.py +148 -140
  30. gpustack_runtime/detector/pyacl/__init__.py +34 -14
  31. gpustack_runtime/detector/pydcmi/__init__.py +4 -2
  32. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  33. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  34. gpustack_runtime/detector/thead.py +145 -134
  35. gpustack_runtime/envs.py +7 -6
  36. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
  37. gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
  38. gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
  39. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
  40. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
  41. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0
@@ -3,11 +3,9 @@ from __future__ import annotations as __future_annotations__
3
3
  import contextlib
4
4
  import json
5
5
  import logging
6
- import operator
7
6
  import os
8
7
  from dataclasses import dataclass, field
9
8
  from enum import Enum
10
- from functools import reduce
11
9
  from pathlib import Path
12
10
  from typing import TYPE_CHECKING
13
11
 
@@ -43,7 +41,7 @@ from .__utils__ import (
43
41
  sensitive_env_var,
44
42
  validate_rfc1123_domain_name,
45
43
  )
46
- from .k8s.deviceplugin import cdi_kind_to_kdp_resource, get_resource_injection_policy
44
+ from .k8s.deviceplugin import get_resource_injection_policy
47
45
 
48
46
  if TYPE_CHECKING:
49
47
  from collections.abc import Callable, Generator
@@ -88,17 +86,6 @@ class KubernetesWorkloadPlan(WorkloadPlan):
88
86
  Domain suffix for the cluster. Default is "cluster.local".
89
87
  service_type (KubernetesWorkloadServiceTypeEnum):
90
88
  Service type for the workload. Default is CLUSTER_IP.
91
- resource_key_runtime_env_mapping: (dict[str, str]):
92
- Mapping from resource names to environment variable names for device allocation,
93
- which is used to tell the Container Runtime which GPUs to mount into the container.
94
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
95
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
96
- With privileged mode, the container can access all GPUs even if specified.
97
- resource_key_backend_env_mapping: (dict[str, list[str]]):
98
- Mapping from resource names to environment variable names for device runtime,
99
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
100
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
101
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
102
89
  namespace (str | None):
103
90
  Namespace of the workload.
104
91
  name (str):
@@ -993,113 +980,103 @@ class KubernetesDeployer(EndoscopicDeployer):
993
980
  # Parameterize resources
994
981
  if c.resources:
995
982
  kdp = get_resource_injection_policy() == "kdp"
983
+ fmt = "kdp" if kdp else "plain"
996
984
 
997
985
  resources: dict[str, str] = {}
998
- r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
999
- r_k_backend_env = workload.resource_key_backend_env_mapping or {}
1000
- _, vd_env, vd_cdis, vd_values = self.get_visible_devices_materials()
1001
986
  for r_k, r_v in c.resources.items():
1002
987
  if r_k in ("cpu", "memory"):
1003
988
  resources[r_k] = str(r_v)
989
+ continue
990
+
991
+ if (
992
+ r_k
993
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
994
+ ):
995
+ # Set env if resource key is mapped.
996
+ runtime_envs = [
997
+ envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
998
+ r_k
999
+ ],
1000
+ ]
1001
+ elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
1002
+ # Set env if auto-mapping key is matched.
1003
+ runtime_envs = self.get_runtime_envs()
1004
1004
  else:
1005
- if r_k in r_k_runtime_env:
1006
- # Set env if resource key is mapped.
1007
- runtime_env = [r_k_runtime_env[r_k]]
1008
- elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
1009
- # Set env if auto-mapping key is matched.
1010
- runtime_env = list(vd_env.keys())
1011
- else:
1012
- resources[r_k] = str(r_v)
1013
- continue
1014
-
1015
- if r_k in r_k_backend_env:
1016
- # Set env if resource key is mapped.
1017
- backend_env = r_k_backend_env[r_k]
1018
- else:
1019
- # Otherwise, use the default backend env names.
1020
- backend_env = reduce(operator.add, list(vd_env.values()))
1021
-
1022
- privileged = (
1005
+ resources[r_k] = str(r_v)
1006
+ continue
1007
+
1008
+ privileged = (
1009
+ container.security_context
1010
+ and container.security_context.privileged
1011
+ )
1012
+ resource_values = [x.strip() for x in r_v.split(",")]
1013
+
1014
+ # Request devices.
1015
+ if r_v == "all":
1016
+ # Configure privileged.
1017
+ container.security_context = (
1023
1018
  container.security_context
1024
- and container.security_context.privileged
1019
+ or kubernetes.client.V1SecurityContext()
1025
1020
  )
1026
-
1027
- # Configure device access environment variable.
1028
- if r_v == "all" and backend_env:
1029
- # Configure privileged if requested all devices.
1030
- container.security_context = (
1031
- container.security_context
1032
- or kubernetes.client.V1SecurityContext()
1021
+ container.security_context.privileged = True
1022
+ # Request all devices.
1023
+ for ren in runtime_envs:
1024
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
1025
+ # Request device via KDP.
1026
+ if kdp:
1027
+ resources.update(
1028
+ dict.fromkeys(r_vs, "1"),
1029
+ )
1030
+ continue
1031
+ # Request device via visible devices env.
1032
+ container.env.append(
1033
+ kubernetes.client.V1EnvVar(
1034
+ name=ren,
1035
+ value=",".join(r_vs),
1036
+ ),
1033
1037
  )
1034
- container.security_context.privileged = True
1035
- # Then, set container backend visible devices env to all devices,
1036
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
1037
- # and mount corresponding libs if needed.
1038
- for re in runtime_env:
1039
- # Request device via KDP.
1040
- if kdp:
1041
- for v in vd_values.get(re) or []:
1042
- kdp_resource = cdi_kind_to_kdp_resource(
1043
- cdi_kind=vd_cdis[re],
1044
- device_index=v,
1045
- )
1046
- resources[kdp_resource] = "1"
1047
- continue
1048
- # Request device via visible devices env.
1049
- rv = ",".join(vd_values.get(re) or ["all"])
1050
- container.env.append(
1051
- kubernetes.client.V1EnvVar(
1052
- name=re,
1053
- value=rv,
1054
- ),
1038
+ else:
1039
+ # Request specific devices.
1040
+ for ren in runtime_envs:
1041
+ # Request all devices if privileged,
1042
+ # otherwise, normalize requested devices.
1043
+ if privileged:
1044
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
1045
+ else:
1046
+ r_vs = self.map_runtime_visible_devices(
1047
+ ren,
1048
+ resource_values,
1049
+ fmt,
1055
1050
  )
1056
- else:
1057
- # Set env to the allocated device IDs if no privileged,
1058
- # otherwise, set container backend visible devices env to all devices,
1059
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
1060
- # and mount corresponding libs if needed.
1061
- for re in runtime_env:
1062
- # Request device via KDP.
1063
- if kdp:
1064
- if not privileged:
1065
- for v in str(r_v).split(","):
1066
- kdp_resource = cdi_kind_to_kdp_resource(
1067
- cdi_kind=vd_cdis[re],
1068
- device_index=int(v.strip()),
1069
- )
1070
- resources[kdp_resource] = "1"
1071
- else:
1072
- for v in vd_values.get(re) or []:
1073
- kdp_resource = cdi_kind_to_kdp_resource(
1074
- cdi_kind=vd_cdis[re],
1075
- device_index=v,
1076
- )
1077
- resources[kdp_resource] = "1"
1078
- continue
1079
- # Request device via visible devices env.
1080
- if not privileged:
1081
- rv = str(r_v)
1082
- else:
1083
- rv = ",".join(vd_values.get(re) or ["all"])
1084
- container.env.append(
1085
- kubernetes.client.V1EnvVar(
1086
- name=re,
1087
- value=rv,
1088
- ),
1051
+ # Request device via KDP.
1052
+ if kdp:
1053
+ resources.update(
1054
+ dict.fromkeys(r_vs, "1"),
1089
1055
  )
1056
+ continue
1057
+ # Request device via visible devices env.
1058
+ container.env.append(
1059
+ kubernetes.client.V1EnvVar(
1060
+ name=ren,
1061
+ value=",".join(r_vs),
1062
+ ),
1063
+ )
1090
1064
 
1091
- # Configure runtime device access environment variables.
1092
- if r_v != "all" and privileged:
1093
- for be in backend_env:
1094
- container.env.append(
1095
- kubernetes.client.V1EnvVar(
1096
- name=be,
1097
- value=self.align_backend_visible_devices_env_values(
1098
- be,
1099
- str(r_v),
1100
- ),
1101
- ),
1065
+ # Configure runtime device access environment variables.
1066
+ if r_v != "all" and privileged:
1067
+ b_vs = self.map_backend_visible_devices(
1068
+ runtime_envs,
1069
+ resource_values,
1070
+ )
1071
+ container.env.extend(
1072
+ [
1073
+ kubernetes.client.V1EnvVar(
1074
+ name=be,
1075
+ value=be_v,
1102
1076
  )
1077
+ for be, be_v in b_vs.items()
1078
+ ],
1079
+ )
1103
1080
 
1104
1081
  container.resources = kubernetes.client.V1ResourceRequirements(
1105
1082
  limits=(resources if resources else None),
@@ -1229,6 +1206,10 @@ class KubernetesDeployer(EndoscopicDeployer):
1229
1206
  self._client = self._get_client()
1230
1207
  self._node_name = envs.GPUSTACK_RUNTIME_KUBERNETES_NODE_NAME
1231
1208
 
1209
+ @property
1210
+ def allowed_uuid_values(self) -> bool:
1211
+ return get_resource_injection_policy() != "kdp"
1212
+
1232
1213
  def _prepare_mirrored_deployment(self):
1233
1214
  """
1234
1215
  Prepare for mirrored deployment.
@@ -4,13 +4,11 @@ import contextlib
4
4
  import io
5
5
  import json
6
6
  import logging
7
- import operator
8
7
  import os
9
8
  import socket
10
9
  import sys
11
10
  import tarfile
12
11
  from dataclasses import dataclass, field
13
- from functools import reduce
14
12
  from math import ceil
15
13
  from pathlib import Path
16
14
  from typing import TYPE_CHECKING, Any
@@ -84,17 +82,6 @@ class PodmanWorkloadPlan(WorkloadPlan):
84
82
  Image used for the pause container.
85
83
  unhealthy_restart_image (str):
86
84
  Image used for unhealthy restart container.
87
- resource_key_runtime_env_mapping: (dict[str, str]):
88
- Mapping from resource names to environment variable names for device allocation,
89
- which is used to tell the Container Runtime which GPUs to mount into the container.
90
- For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
91
- which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
92
- With privileged mode, the container can access all GPUs even if specified.
93
- resource_key_backend_env_mapping: (dict[str, list[str]]):
94
- Mapping from resource names to environment variable names for device runtime,
95
- which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
96
- For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
97
- which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
98
85
  namespace (str | None):
99
86
  Namespace of the workload.
100
87
  name (str):
@@ -952,120 +939,126 @@ class PodmanDeployer(EndoscopicDeployer):
952
939
 
953
940
  # Parameterize resources.
954
941
  if c.resources:
955
- r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
956
- r_k_backend_env = workload.resource_key_backend_env_mapping or {}
957
- vd_manus, vd_env, vd_cdis, vd_values = (
958
- self.get_visible_devices_materials()
959
- )
960
- for r_k, r_v in c.resources.items():
961
- match r_k:
962
- case "cpu":
963
- if isinstance(r_v, int | float):
964
- create_options["cpu_shares"] = ceil(r_v * 1024)
965
- elif isinstance(r_v, str) and r_v.isdigit():
966
- create_options["cpu_shares"] = ceil(float(r_v) * 1024)
967
- case "memory":
968
- if isinstance(r_v, int):
969
- create_options["mem_limit"] = r_v
970
- create_options["mem_reservation"] = r_v
971
- create_options["memswap_limit"] = r_v
972
- elif isinstance(r_v, str):
973
- v = r_v.lower().removesuffix("i")
974
- create_options["mem_limit"] = v
975
- create_options["mem_reservation"] = v
976
- create_options["memswap_limit"] = v
977
- case _:
978
- if r_k in r_k_runtime_env:
979
- # Set env if resource key is mapped.
980
- runtime_env = [r_k_runtime_env[r_k]]
981
- elif (
982
- r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
983
- ):
984
- # Set env if auto-mapping key is matched.
985
- runtime_env = list(vd_env.keys())
986
- else:
987
- continue
942
+ fmt = "cdi"
988
943
 
989
- if r_k in r_k_backend_env:
990
- # Set env if resource key is mapped.
991
- backend_env = r_k_backend_env[r_k]
992
- else:
993
- # Otherwise, use the default backend env names.
994
- backend_env = reduce(
995
- operator.add,
996
- list(vd_env.values()),
997
- )
944
+ for r_k, r_v in c.resources.items():
945
+ if r_k == "cpu":
946
+ if isinstance(r_v, int | float):
947
+ create_options["cpu_shares"] = ceil(r_v * 1024)
948
+ elif isinstance(r_v, str) and r_v.isdigit():
949
+ create_options["cpu_shares"] = ceil(float(r_v) * 1024)
950
+ continue
951
+ if r_k == "memory":
952
+ if isinstance(r_v, int):
953
+ create_options["mem_limit"] = r_v
954
+ create_options["mem_reservation"] = r_v
955
+ create_options["memswap_limit"] = r_v
956
+ elif isinstance(r_v, str):
957
+ v = r_v.lower().removesuffix("i")
958
+ create_options["mem_limit"] = v
959
+ create_options["mem_reservation"] = v
960
+ create_options["memswap_limit"] = v
961
+ continue
998
962
 
999
- privileged = create_options.get("privileged", False)
963
+ if (
964
+ r_k
965
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES
966
+ ):
967
+ # Set env if resource key is mapped.
968
+ runtime_envs = [
969
+ envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES[
970
+ r_k
971
+ ],
972
+ ]
973
+ elif r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY:
974
+ # Set env if auto-mapping key is matched.
975
+ runtime_envs = self.get_runtime_envs()
976
+ else:
977
+ continue
1000
978
 
1001
- # Generate CDI config if not yet.
1002
- if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
1003
- for re in runtime_env:
1004
- cdi_dump_config(
1005
- manufacturer=vd_manus[re],
1006
- output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
979
+ privileged = create_options.get("privileged", False)
980
+ resource_values = [x.strip() for x in r_v.split(",")]
981
+
982
+ # Generate CDI config if not yet.
983
+ if envs.GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE:
984
+ for ren in runtime_envs:
985
+ manu = self.get_manufacturer(ren)
986
+ cdi_config, cdi_config_path = cdi_dump_config(
987
+ manufacturer=manu,
988
+ output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
989
+ )
990
+ if cdi_config and cdi_config_path:
991
+ if logger.isEnabledFor(logging.DEBUG):
992
+ logger.debug(
993
+ "Generated CDI configuration for '%s' at '%s':\n%s",
994
+ manu,
995
+ cdi_config_path,
996
+ cdi_config,
1007
997
  )
1008
-
1009
- # Configure device access environment variable.
1010
- if r_v == "all" and backend_env:
1011
- # Configure privileged if requested all devices.
1012
- create_options["privileged"] = True
1013
- # Then, set container backend visible devices env to all devices,
1014
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
1015
- # and mount corresponding libs if needed.
1016
- for re in runtime_env:
1017
- # Request device via CDI.
1018
- rv = [
1019
- f"{vd_cdis[re]}={v}"
1020
- for v in (vd_values.get(re) or ["all"])
1021
- ]
1022
- if "devices" not in create_options:
1023
- create_options["devices"] = []
1024
- create_options["devices"].extend(rv)
1025
- else:
1026
- # Set env to the allocated device IDs if no privileged,
1027
- # otherwise, set container backend visible devices env to all devices,
1028
- # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
1029
- # and mount corresponding libs if needed.
1030
- for re in runtime_env:
1031
- # Request device via CDI.
1032
- if not privileged:
1033
- rv = [
1034
- f"{vd_cdis[re]}={v.strip()}"
1035
- for v in r_v.split(",")
1036
- ]
1037
- else:
1038
- rv = [
1039
- f"{vd_cdis[re]}={v}"
1040
- for v in (vd_values.get(re) or ["all"])
1041
- ]
1042
- if "devices" not in create_options:
1043
- create_options["devices"] = []
1044
- create_options["devices"].extend(rv)
1045
-
1046
- # Configure runtime device access environment variables.
1047
- if r_v != "all" and privileged:
1048
- for be in backend_env:
1049
- create_options["environment"][be] = (
1050
- self.align_backend_visible_devices_env_values(
1051
- be,
1052
- str(r_v),
1053
- )
998
+ else:
999
+ logger.info(
1000
+ "Generated CDI configuration for '%s' at '%s'",
1001
+ manu,
1002
+ cdi_config_path,
1054
1003
  )
1004
+ elif cdi_config:
1005
+ logger.info(
1006
+ "Reuse generated CDI configuration for '%s'",
1007
+ manu,
1008
+ )
1009
+ else:
1010
+ logger.warning(
1011
+ "Delegated CDI configuration by other tools for '%s', "
1012
+ "e.g. for NVIDIA devices, please follow NVIDIA Container Toolkit Manual CDI Specification Generation to generate the CDI configuration, "
1013
+ "see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html#manual-cdi-specification-generation",
1014
+ manu,
1015
+ )
1055
1016
 
1056
- # Configure affinity if applicable.
1057
- if (
1058
- envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1059
- or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1060
- ):
1061
- cpus, numas = self.get_visible_devices_affinities(
1062
- runtime_env,
1063
- r_v,
1017
+ # Configure device access environment variable.
1018
+ if r_v == "all":
1019
+ # Configure privileged.
1020
+ create_options["privileged"] = True
1021
+ # Request all devices.
1022
+ for ren in runtime_envs:
1023
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
1024
+ # Request device via CDI.
1025
+ if "devices" not in create_options:
1026
+ create_options["devices"] = []
1027
+ create_options["devices"].extend(r_vs)
1028
+ else:
1029
+ # Request specific devices.
1030
+ for ren in runtime_envs:
1031
+ # Request all devices if privileged,
1032
+ # otherwise, normalize requested devices.
1033
+ if privileged:
1034
+ r_vs = self.get_runtime_visible_devices(ren, fmt)
1035
+ else:
1036
+ r_vs = self.map_runtime_visible_devices(
1037
+ ren,
1038
+ resource_values,
1039
+ fmt,
1064
1040
  )
1065
- if cpus:
1066
- create_options["cpuset_cpus"] = cpus
1067
- if numas and envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
1068
- create_options["cpuset_mems"] = numas
1041
+ # Request device via CDI.
1042
+ if "devices" not in create_options:
1043
+ create_options["devices"] = []
1044
+ create_options["devices"].extend(r_vs)
1045
+
1046
+ # If not requesting all devices but privileged,
1047
+ # must configure visible devices.
1048
+ if r_v != "all" and privileged:
1049
+ b_vs = self.map_backend_visible_devices(
1050
+ runtime_envs,
1051
+ resource_values,
1052
+ )
1053
+ create_options["environment"].update(b_vs)
1054
+
1055
+ # Configure affinity if applicable.
1056
+ create_options.update(
1057
+ self.map_visible_devices_affinities(
1058
+ runtime_envs,
1059
+ resource_values,
1060
+ ),
1061
+ )
1069
1062
 
1070
1063
  # Parameterize mounts.
1071
1064
  self._append_container_mounts(
@@ -7,6 +7,7 @@ from ..logging import debug_log_exception
7
7
  from .__types__ import (
8
8
  Detector,
9
9
  Device,
10
+ DeviceMemoryStatusEnum,
10
11
  Devices,
11
12
  ManufacturerEnum,
12
13
  Topology,
@@ -292,6 +293,7 @@ def filter_devices_by_manufacturer(
292
293
 
293
294
  __all__ = [
294
295
  "Device",
296
+ "DeviceMemoryStatusEnum",
295
297
  "Devices",
296
298
  "ManufacturerEnum",
297
299
  "Topology",
@@ -122,6 +122,28 @@ def backend_to_manufacturer(backend: str) -> ManufacturerEnum:
122
122
  return ManufacturerEnum.UNKNOWN
123
123
 
124
124
 
125
+ class DeviceMemoryStatusEnum(str, Enum):
126
+ """
127
+ Enum for Device Memory Status.
128
+ """
129
+
130
+ HEALTHY = "healthy"
131
+ """
132
+ Device is healthy.
133
+ """
134
+ UNHEALTHY = "unhealthy"
135
+ """
136
+ Device is unhealthy.
137
+ """
138
+ UNKNOWN = "unknown"
139
+ """
140
+ Device status is unknown.
141
+ """
142
+
143
+ def __str__(self):
144
+ return self.value
145
+
146
+
125
147
  @dataclass_json
126
148
  @dataclass
127
149
  class Device:
@@ -185,6 +207,10 @@ class Device:
185
207
  """
186
208
  Memory utilization of the device in percentage.
187
209
  """
210
+ memory_status: DeviceMemoryStatusEnum = DeviceMemoryStatusEnum.UNKNOWN
211
+ """
212
+ Status of the device.
213
+ """
188
214
  temperature: int | float | None = None
189
215
  """
190
216
  Temperature of the device in Celsius.
@@ -916,6 +916,9 @@ def str_range_to_list(str_range: str) -> list[int]:
916
916
  A list of indices.
917
917
 
918
918
  """
919
+ if not str_range:
920
+ return []
921
+
919
922
  str_range_parts = str_range.split(",")
920
923
 
921
924
  indices: set[int] = set()