gpustack-runtime 0.1.38.post4__py3-none-any.whl → 0.1.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1020,11 +1020,11 @@ class WorkloadPlan(WorkloadSecurity):
  c.execution.command_script = None
  # Add default registry if needed.
  if (
- envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY
- and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY
+ envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY
+ and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY
  not in ["docker.io", "index.docker.io"]
  ):
- image_registry = envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_NAMESPACE
+ image_registry = envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_NAMESPACE
  image_split = c.image.split("/")
  if len(image_split) == 1:
  c.image = f"{image_registry}/library/{c.image}"
@@ -1269,6 +1269,17 @@ class Deployer(ABC):
  "AMD_VISIBLE_DEVICES": ["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"]
  }.
  """
+ _visible_devices_cdis: dict[str, str] | None = None
+ """
+ Recorded visible devices envs to CDI mapping,
+ the key is the runtime visible devices env name,
+ the value is the corresponding CDI key.
+ For example:
+ {
+ "NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu",
+ "AMD_VISIBLE_DEVICES": "amd.com/gpu"
+ }.
+ """
  _visible_devices_values: dict[str, list[str]] | None = None
  """
  Recorded visible devices values,
@@ -1349,6 +1360,7 @@ class Deployer(ABC):
  return
 
  self._visible_devices_env = {}
+ self._visible_devices_cdis = {}
  self._visible_devices_values = {}
  self._visible_devices_topologies = {}
  self._backend_visible_devices_values_alignment = {}
@@ -1364,33 +1376,33 @@ class Deployer(ABC):
  ren = envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES.get(
  rk,
  )
- ben = envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES.get(
+ ben_list = envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES.get(
+ rk,
+ )
+ cdi = envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_CONTAINER_DEVICE_INTERFACES.get(
  rk,
  )
- if ren and ben:
+ if ren and ben_list:
+ valued_uuid = (
+ ren
+ in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
+ )
  dev_uuids: list[str] = []
  dev_indexes: list[str] = []
- for dev in devs:
+ dev_indexes_alignment: dict[str, str] = {}
+ for dev_i, dev in enumerate(devs):
  dev_uuids.append(dev.uuid)
  dev_indexes.append(str(dev.index))
- dev_indexes_alignment: dict[str, str] = {
- dev_indexes[i]: str(i) for i in range(len(devs))
- }
- self._visible_devices_env[ren] = ben
+ dev_indexes_alignment[str(dev.index)] = str(dev_i)
+ # Map runtime visible devices env <-> backend visible devices env list.
+ self._visible_devices_env[ren] = ben_list
+ # Map runtime visible devices env <-> CDI key.
+ self._visible_devices_cdis[ren] = cdi
+ # Map runtime visible devices env <-> device indexes or uuids.
  self._visible_devices_values[ren] = (
- dev_uuids
- if ren
- in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
- else dev_indexes
+ dev_uuids if valued_uuid else dev_indexes
  )
- for ben_item in ben:
- if (
- ben_item
- in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
- ):
- self._backend_visible_devices_values_alignment[ben_item] = (
- dev_indexes_alignment
- )
+ # Map runtime visible devices env <-> topology.
  if (
  envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
  or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
@@ -1398,6 +1410,17 @@ class Deployer(ABC):
  topos = get_devices_topologies(devices=devs)
  if topos:
  self._visible_devices_topologies[ren] = topos[0]
+ # Map backend visible devices env <-> devices alignment.
+ if not valued_uuid:
+ for ben in ben_list:
+ valued_alignment = (
+ ben
+ in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
+ )
+ if valued_alignment:
+ self._backend_visible_devices_values_alignment[ben] = (
+ dev_indexes_alignment
+ )
 
  if self._visible_devices_env:
  return
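
The two hunks above replace the post-hoc comprehension with an alignment map built inside the enumeration loop, and record it only when the runtime env is index-valued (not valued_uuid) and only for backend env names listed in GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT. A minimal sketch of what that map contains, with a hypothetical stand-in for the package's device type, assuming it is later used to renumber backend visible-devices values relative to the allocated set:

    from dataclasses import dataclass

    @dataclass
    class Dev:  # hypothetical stand-in for the package's device object
        index: int
        uuid: str

    devs = [Dev(index=2, uuid="GPU-a"), Dev(index=5, uuid="GPU-b")]
    # Same shape as dev_indexes_alignment in the hunk: real device index -> position in the allocation.
    alignment = {str(dev.index): str(i) for i, dev in enumerate(devs)}
    assert alignment == {"2": "0", "5": "1"}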
@@ -1406,17 +1429,21 @@ class Deployer(ABC):
  self._visible_devices_env["UNKNOWN_RUNTIME_VISIBLE_DEVICES"] = []
  self._visible_devices_values["UNKNOWN_RUNTIME_VISIBLE_DEVICES"] = ["all"]
 
- def get_visible_devices_env_values(
+ def get_visible_devices_values(
  self,
- ) -> (dict[str, list[str]], dict[str, list[str]]):
+ ) -> (dict[str, list[str]], dict[str, str], dict[str, list[str]]):
  """
- Return the visible devices environment variables and values mappings.
+ Return the visible devices environment variables, cdis and values mappings.
  For example:
  (
  {
  "NVIDIA_VISIBLE_DEVICES": ["CUDA_VISIBLE_DEVICES"],
  "AMD_VISIBLE_DEVICES": ["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"]
- }.
+ },
+ {
+ "NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu",
+ "AMD_VISIBLE_DEVICES": "amd.com/gpu"
+ },
  {
  "NVIDIA_VISIBLE_DEVICES": ["0"],
  "AMD_VISIBLE_DEVICES": ["0", "1"]
@@ -1428,11 +1455,17 @@ class Deployer(ABC):
  - The first dictionary maps runtime visible devices environment variable names
  to lists of backend visible devices environment variable names.
  - The second dictionary maps runtime visible devices environment variable names
+ to corresponding CDI keys.
+ - The last dictionary maps runtime visible devices environment variable names
  to lists of device indexes or UUIDs.
 
  """
  self._prepare()
- return self._visible_devices_env, self._visible_devices_values
+ return (
+ self._visible_devices_env,
+ self._visible_devices_cdis,
+ self._visible_devices_values,
+ )
 
  def get_visible_devices_affinities(
  self,
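
Call-site sketch for the renamed, widened accessor: it now returns a 3-tuple, and callers that do not need the CDI mapping simply discard it (as the Kubernetes deployer does later in this diff). Here deployer stands for any concrete Deployer:

    vd_env, vd_cdis, vd_values = deployer.get_visible_devices_values()
    for runtime_name, backend_names in vd_env.items():
        cdi_kind = vd_cdis.get(runtime_name)         # e.g. "nvidia.com/gpu"; may be None
        device_values = vd_values.get(runtime_name)  # e.g. ["0", "1"] or device UUIDs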
@@ -139,7 +139,7 @@ class DockerWorkloadPlan(WorkloadPlan):
  super().validate_and_default()
 
  # Adjust default image namespace if needed.
- if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_NAMESPACE:
+ if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_NAMESPACE:
  self.pause_image = replace_image_with(
  image=self.pause_image,
  namespace=namespace,
@@ -175,7 +175,7 @@ class DockerWorkloadStatus(WorkloadStatus):
 
  @staticmethod
  def parse_state(
- d_containers: list[docker.models.containers],
+ d_containers: list[docker.models.containers.Container],
  ) -> WorkloadStatusStateEnum:
  """
  Parse the state of the workload based on the status of its containers.
@@ -221,7 +221,7 @@ class DockerWorkloadStatus(WorkloadStatus):
  d_run_state = WorkloadStatusStateEnum.PENDING
  else:
  health = cr.attrs["State"].get("Health", {})
- if health and health.get("Status", "healthy") != "healthy":
+ if health and health.get("Status", "healthy") not in ["healthy", ""]:
  return WorkloadStatusStateEnum.UNHEALTHY
 
  d_init_state = None
@@ -252,7 +252,7 @@ class DockerWorkloadStatus(WorkloadStatus):
  def __init__(
  self,
  name: WorkloadName,
- d_containers: list[docker.models.containers],
+ d_containers: list[docker.models.containers.Container],
  **kwargs,
  ):
  created_at = d_containers[0].attrs["Created"]
@@ -330,6 +330,12 @@ class DockerDeployer(Deployer):
  if client:
  try:
  supported = client.ping()
+ if envs.GPUSTACK_RUNTIME_LOG_EXCEPTION:
+ version_info = client.version()
+ logger.debug(
+ "Connected to Docker API server: %s",
+ version_info,
+ )
  except docker.errors.APIError:
  debug_log_exception(logger, "Failed to connect to Docker API server")
 
@@ -352,12 +358,13 @@ class DockerDeployer(Deployer):
  contextlib.redirect_stdout(dev_null),
  contextlib.redirect_stderr(dev_null),
  ):
- if Path("/var/run/docker.sock").exists():
- client = docker.DockerClient(base_url="unix://var/run/docker.sock")
- else:
- client = docker.from_env()
- except docker.errors.DockerException:
- debug_log_exception(logger, "Failed to get Docker client")
+ os_env = os.environ.copy()
+ if envs.GPUSTACK_RUNTIME_DOCKER_HOST:
+ os_env["DOCKER_HOST"] = envs.GPUSTACK_RUNTIME_DOCKER_HOST
+ client = docker.from_env(environment=os_env)
+ except docker.errors.DockerException as e:
+ if "FileNotFoundError" not in str(e):
+ debug_log_exception(logger, "Failed to get Docker client")
 
  return client
 
@@ -463,12 +470,12 @@ class DockerDeployer(Deployer):
  tag = tag or "latest"
  auth_config = None
  if (
- envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY_USERNAME
- and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY_PASSWORD
+ envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME
+ and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD
  ):
  auth_config = {
- "username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY_USERNAME,
- "password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY_PASSWORD,
+ "username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME,
+ "password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD,
  }
 
  logs = self._client.api.pull(
@@ -488,8 +495,10 @@ class DockerDeployer(Deployer):
  msg = f"Failed to pull image {image}, invalid response"
  raise OperationError(msg) from e
  except docker.errors.APIError as e:
- msg = f"Failed to pull image {image}{_detail_api_call_error(e)}"
- raise OperationError(msg) from e
+ if "no unqualified-search registries are defined" not in str(e):
+ msg = f"Failed to pull image {image}{_detail_api_call_error(e)}"
+ raise OperationError(msg) from e
+ return self._pull_image(f"docker.io/{image}")
 
  def _get_image(
  self,
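
The quoted error message is what engines that enforce short-name resolution policies (Podman-compatible endpoints, for instance) return for an unqualified image name; instead of failing, the pull is now retried once with the name qualified against docker.io. A compact sketch of that fallback pattern, with a generic exception standing in for docker.errors.APIError and an illustrative function name:

    def pull_with_fallback(pull, image: str):
        try:
            return pull(image)
        except Exception as e:  # stands in for docker.errors.APIError
            if "no unqualified-search registries are defined" not in str(e):
                raise
            # Retry with a fully qualified reference, e.g. "redis:7" -> "docker.io/redis:7".
            return pull(f"docker.io/{image}")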
@@ -658,6 +667,14 @@ class DockerDeployer(Deployer):
  # TODO(thxCode): check if the container matches the spec
  return d_container
 
+ host_socket_path = None
+ if envs.GPUSTACK_RUNTIME_DOCKER_HOST.startswith("http+unix://"):
+ host_socket_path = envs.GPUSTACK_RUNTIME_DOCKER_HOST[len("http+unix://") :]
+ elif envs.GPUSTACK_RUNTIME_DOCKER_HOST.startswith("unix://"):
+ host_socket_path = envs.GPUSTACK_RUNTIME_DOCKER_HOST[len("unix://") :]
+ if host_socket_path and not host_socket_path.startswith("/"):
+ host_socket_path = f"/{host_socket_path}"
+
  create_options: dict[str, Any] = {
  "name": container_name,
  "restart_policy": {"Name": "always"},
@@ -669,10 +686,17 @@ class DockerDeployer(Deployer):
  "environment": [
  f"AUTOHEAL_CONTAINER_LABEL={_LABEL_COMPONENT_HEAL_PREFIX}-{workload.name}",
  ],
- "volumes": [
- "/var/run/docker.sock:/var/run/docker.sock",
- ],
  }
+ if host_socket_path:
+ create_options["volumes"] = (
+ [
+ f"{host_socket_path}:/var/run/docker.sock",
+ ],
+ )
+ elif envs.GPUSTACK_RUNTIME_DOCKER_HOST:
+ create_options["environment"].append(
+ f"DOCKER_SOCK={envs.GPUSTACK_RUNTIME_DOCKER_HOST}",
+ )
 
  if envs.GPUSTACK_RUNTIME_DEPLOY_PRINT_CONVERSION:
  clogger.info(
@@ -760,7 +784,7 @@ class DockerDeployer(Deployer):
  else:
  continue
 
- if m.mode == ContainerMountModeEnum.ROX:
+ if m.mode != ContainerMountModeEnum.RWX:
  binding["ReadOnly"] = True
 
  mount_binding.append(binding)
@@ -841,6 +865,7 @@ class DockerDeployer(Deployer):
  workload: DockerWorkloadPlan,
  ephemeral_filename_mapping: dict[tuple[int, str] : str],
  ephemeral_volume_name_mapping: dict[str, str],
+ pause_container: docker.models.containers.Container,
  ) -> (
  list[docker.models.containers.Container],
  list[docker.models.containers.Container],
@@ -860,7 +885,7 @@ class DockerDeployer(Deployer):
  d_init_containers: list[docker.models.containers.Container] = []
  d_run_containers: list[docker.models.containers.Container] = []
 
- pause_container_namespace = f"container:{workload.name}-pause"
+ pause_container_namespace = f"container:{pause_container.id}"
  for ci, c in enumerate(workload.containers):
  container_name = f"{workload.name}-{c.profile.lower()}-{ci}"
  try:
@@ -942,9 +967,14 @@ class DockerDeployer(Deployer):
 
  # Parameterize resources.
  if c.resources:
+ cdi = (
+ envs.GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY.lower()
+ == "cdi"
+ )
+
  r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
  r_k_backend_env = workload.resource_key_backend_env_mapping or {}
- vd_env, vd_values = self.get_visible_devices_env_values()
+ vd_env, vd_cdis, vd_values = self.get_visible_devices_values()
  for r_k, r_v in c.resources.items():
  match r_k:
  case "cpu":
@@ -994,24 +1024,59 @@ class DockerDeployer(Deployer):
  # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
  # and mount corresponding libs if needed.
  for re in runtime_env:
- # Set to "all" if no specific devices detected,
- # maybe the container backend can handle it.
- create_options["environment"][re] = (
- ",".join(vd_values.get(re, [])) or "all"
- )
+ # Request device via CDI.
+ if cdi:
+ rv = [
+ f"{vd_cdis[re]}={v}"
+ for v in (vd_values.get(re) or ["all"])
+ ]
+ if "device_requests" not in create_options:
+ create_options["device_requests"] = []
+ create_options["device_requests"].append(
+ docker.types.DeviceRequest(
+ driver="cdi",
+ count=0,
+ device_ids=rv,
+ ),
+ )
+ continue
+ # Request device via visible devices env.
+ rv = ",".join(vd_values.get(re) or ["all"])
+ create_options["environment"][re] = rv
  else:
  # Set env to the allocated device IDs if no privileged,
  # otherwise, set container backend visible devices env to all devices,
  # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
  # and mount corresponding libs if needed.
  for re in runtime_env:
- # Set to "all" if no specific devices detected,
- # maybe the container backend can handle it.
- create_options["environment"][re] = (
- str(r_v)
- if not privileged
- else (",".join(vd_values.get(re, [])) or "all")
- )
+ # Request device via CDI.
+ if cdi:
+ if not privileged:
+ rv = [
+ f"{vd_cdis[re]}={v.strip()}"
+ for v in r_v.split(",")
+ ]
+ else:
+ rv = [
+ f"{vd_cdis[re]}={v}"
+ for v in (vd_values.get(re) or ["all"])
+ ]
+ if "device_requests" not in create_options:
+ create_options["device_requests"] = []
+ create_options["device_requests"].append(
+ docker.types.DeviceRequest(
+ driver="cdi",
+ count=0,
+ device_ids=rv,
+ ),
+ )
+ continue
+ # Request device via visible devices env.
+ if not privileged:
+ rv = str(r_v)
+ else:
+ rv = ",".join(vd_values.get(re) or ["all"])
+ create_options["environment"][re] = rv
 
  # Configure runtime device access environment variables.
  if r_v != "all" and privileged:
@@ -1204,7 +1269,7 @@ class DockerDeployer(Deployer):
  # Always filter out Docker Socket mount.
  m
  for m in (self_container.attrs["Mounts"] or [])
- if m.get("Destination") != "/var/run/docker.sock"
+ if not m.get("Destination").endswith("/docker.sock")
  ]
  if igs := envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES:
  mirrored_mounts = [
@@ -1491,6 +1556,7 @@ class DockerDeployer(Deployer):
  workload,
  ephemeral_filename_mapping,
  ephemeral_volume_name_mapping,
+ pause_container,
  )
 
  # Create unhealthy restart container if needed.
@@ -1593,10 +1659,18 @@ class DockerDeployer(Deployer):
  # Remove all containers with the workload label.
  try:
  d_containers = getattr(workload, "_d_containers", [])
+ # Remove non-pause containers first.
  for c in d_containers:
- c.remove(
- force=True,
- )
+ if "-pause" not in c.name:
+ c.remove(
+ force=True,
+ )
+ # Then remove pause containers.
+ for c in d_containers:
+ if "-pause" in c.name:
+ c.remove(
+ force=True,
+ )
  except docker.errors.APIError as e:
  msg = f"Failed to delete containers for workload {name}{_detail_api_call_error(e)}"
  raise OperationError(msg) from e
@@ -1860,7 +1934,7 @@ class DockerDeployer(Deployer):
  }
 
  try:
- result = container.exec_run(
+ _, output = container.exec_run(
  detach=False,
  **exec_options,
  )
@@ -1869,8 +1943,8 @@ class DockerDeployer(Deployer):
  raise OperationError(msg) from e
  else:
  if not attach:
- return result.output
- return DockerWorkloadExecStream(result.output)
+ return output
+ return DockerWorkloadExecStream(output)
 
 
  def _has_restart_policy(
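
For context on the exec_run change: docker-py returns an ExecResult namedtuple of (exit_code, output), so unpacking the tuple is equivalent to reading result.output while discarding the exit code explicitly. A small usage sketch, where container stands for any started docker.models.containers.Container:

    exit_code, output = container.exec_run("cat /etc/os-release", stream=False)
    if exit_code == 0:
        print(output.decode())  # with stream=True, output is a generator and exit_code is None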
@@ -321,14 +321,18 @@ class KubernetesDeployer(Deployer):
  version_api = kubernetes.client.VersionApi(client)
  version_info = version_api.get_code()
  supported = version_info is not None
- except (
- urllib3.exceptions.MaxRetryError,
- kubernetes.client.exceptions.ApiException,
- ):
+ if envs.GPUSTACK_RUNTIME_LOG_EXCEPTION:
+ logger.debug(
+ "Connected to Kubernetes API server: %s",
+ version_info,
+ )
+ except kubernetes.client.exceptions.ApiException:
  debug_log_exception(
  logger,
  "Failed to connect to Kubernetes API server",
  )
+ except urllib3.exceptions.MaxRetryError:
+ pass
 
  return supported
 
@@ -985,7 +989,7 @@ class KubernetesDeployer(Deployer):
  resources: dict[str, str] = {}
  r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
  r_k_backend_env = workload.resource_key_backend_env_mapping or {}
- vd_env, vd_values = self.get_visible_devices_env_values()
+ vd_env, _, vd_values = self.get_visible_devices_values()
  for r_k, r_v in c.resources.items():
  if r_k in ("cpu", "memory"):
  resources[r_k] = str(r_v)
@@ -1024,12 +1028,12 @@ class KubernetesDeployer(Deployer):
  # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
  # and mount corresponding libs if needed.
  for re in runtime_env:
- # Set to "all" if no specific devices detected,
- # maybe the container backend can handle it.
+ # Request device via visible devices env.
+ rv = ",".join(vd_values.get(re) or ["all"])
  container.env.append(
  kubernetes.client.V1EnvVar(
  name=re,
- value=",".join(vd_values.get(re, [])) or "all",
+ value=rv,
  ),
  )
  else:
@@ -1038,18 +1042,15 @@ class KubernetesDeployer(Deployer):
  # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
  # and mount corresponding libs if needed.
  for re in runtime_env:
- # Set to "all" if no specific devices detected,
- # maybe the container backend can handle it.
+ # Request device via visible devices env.
+ if not privileged:
+ rv = str(r_v)
+ else:
+ rv = ",".join(vd_values.get(re) or ["all"])
  container.env.append(
  kubernetes.client.V1EnvVar(
  name=re,
- value=(
- str(r_v)
- if not privileged
- else (
- ",".join(vd_values.get(re, [])) or "all"
- )
- ),
+ value=rv,
  ),
  )
 
@@ -1206,16 +1207,16 @@ class KubernetesDeployer(Deployer):
 
  # Create image pull secrets if default registry credentials are set.
  if not self._image_pull_secret and (
- envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY_USERNAME
- and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY_PASSWORD
+ envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME
+ and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD
  ):
  registry = (
- envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY or "index.docker.io"
+ envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY or "index.docker.io"
  )
  self._image_pull_secret = self._apply_image_pull_secret(
  registry=f"https://{registry}/v1/",
- username=envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY_USERNAME,
- password=envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_REGISTRY_PASSWORD,
+ username=envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME,
+ password=envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD,
  )
 
  # Prepare mirrored deployment if enabled.