gpustack-runtime 0.1.38.post3__py3-none-any.whl → 0.1.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/deployer/__init__.py +24 -49
- gpustack_runtime/deployer/__patches__.py +455 -0
- gpustack_runtime/deployer/__types__.py +60 -27
- gpustack_runtime/deployer/docker.py +115 -41
- gpustack_runtime/deployer/kuberentes.py +23 -22
- gpustack_runtime/deployer/podman.py +2114 -0
- gpustack_runtime/detector/amd.py +4 -13
- gpustack_runtime/detector/hygon.py +1 -1
- gpustack_runtime/detector/nvidia.py +1 -1
- gpustack_runtime/detector/pyhsa/__init__.py +7 -7
- gpustack_runtime/detector/pyrocmsmi/__init__.py +9 -3
- gpustack_runtime/envs.py +216 -45
- {gpustack_runtime-0.1.38.post3.dist-info → gpustack_runtime-0.1.39.dist-info}/METADATA +6 -4
- {gpustack_runtime-0.1.38.post3.dist-info → gpustack_runtime-0.1.39.dist-info}/RECORD +19 -17
- {gpustack_runtime-0.1.38.post3.dist-info → gpustack_runtime-0.1.39.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.38.post3.dist-info → gpustack_runtime-0.1.39.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.38.post3.dist-info → gpustack_runtime-0.1.39.dist-info}/licenses/LICENSE +0 -0
@@ -1020,11 +1020,11 @@ class WorkloadPlan(WorkloadSecurity):
             c.execution.command_script = None
             # Add default registry if needed.
             if (
-                envs.
-                and envs.
+                envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY
+                and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY
                 not in ["docker.io", "index.docker.io"]
             ):
-                image_registry = envs.
+                image_registry = envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_NAMESPACE
                 image_split = c.image.split("/")
                 if len(image_split) == 1:
                     c.image = f"{image_registry}/library/{c.image}"
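The hunk above re-roots unqualified image names onto the configured default registry before deployment. A minimal sketch of that prefixing rule, with default_registry standing in for the env-driven value (illustrative only, not the exact upstream helper):

    # Illustrative sketch: how a bare image name picks up a non-Docker-Hub default registry.
    def qualify_image(image: str, default_registry: str) -> str:
        if default_registry in ("docker.io", "index.docker.io"):
            return image  # Docker Hub stays implicit.
        if len(image.split("/")) == 1:
            # Single-component names also gain the implicit "library" namespace.
            return f"{default_registry}/library/{image}"
        return image

    print(qualify_image("nginx", "registry.example.com"))  # registry.example.com/library/nginx
    print(qualify_image("nginx", "docker.io"))             # nginx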
@@ -1269,6 +1269,17 @@ class Deployer(ABC):
         "AMD_VISIBLE_DEVICES": ["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"]
     }.
     """
+    _visible_devices_cdis: dict[str, str] | None = None
+    """
+    Recorded visible devices envs to CDI mapping,
+    the key is the runtime visible devices env name,
+    the value is the corresponding CDI key.
+    For example:
+    {
+        "NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu",
+        "AMD_VISIBLE_DEVICES": "amd.com/gpu"
+    }.
+    """
     _visible_devices_values: dict[str, list[str]] | None = None
     """
     Recorded visible devices values,
@@ -1349,6 +1360,7 @@ class Deployer(ABC):
             return
 
         self._visible_devices_env = {}
+        self._visible_devices_cdis = {}
         self._visible_devices_values = {}
         self._visible_devices_topologies = {}
         self._backend_visible_devices_values_alignment = {}
@@ -1364,33 +1376,33 @@ class Deployer(ABC):
             ren = envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES.get(
                 rk,
             )
-
+            ben_list = envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_BACKEND_VISIBLE_DEVICES.get(
+                rk,
+            )
+            cdi = envs.GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_CONTAINER_DEVICE_INTERFACES.get(
                 rk,
             )
-            if ren and
+            if ren and ben_list:
+                valued_uuid = (
+                    ren
+                    in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
+                )
                 dev_uuids: list[str] = []
                 dev_indexes: list[str] = []
-
+                dev_indexes_alignment: dict[str, str] = {}
+                for dev_i, dev in enumerate(devs):
                     dev_uuids.append(dev.uuid)
                     dev_indexes.append(str(dev.index))
-
-
-
-
+                    dev_indexes_alignment[str(dev.index)] = str(dev_i)
+                # Map runtime visible devices env <-> backend visible devices env list.
+                self._visible_devices_env[ren] = ben_list
+                # Map runtime visible devices env <-> CDI key.
+                self._visible_devices_cdis[ren] = cdi
+                # Map runtime visible devices env <-> device indexes or uuids.
                 self._visible_devices_values[ren] = (
-                    dev_uuids
-                    if ren
-                    in envs.GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID
-                    else dev_indexes
+                    dev_uuids if valued_uuid else dev_indexes
                 )
-
-                if (
-                    ben_item
-                    in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
-                ):
-                    self._backend_visible_devices_values_alignment[ben_item] = (
-                        dev_indexes_alignment
-                    )
+                # Map runtime visible devices env <-> topology.
                 if (
                     envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
                     or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
@@ -1398,6 +1410,17 @@ class Deployer(ABC):
                     topos = get_devices_topologies(devices=devs)
                     if topos:
                         self._visible_devices_topologies[ren] = topos[0]
+                # Map backend visible devices env <-> devices alignment.
+                if not valued_uuid:
+                    for ben in ben_list:
+                        valued_alignment = (
+                            ben
+                            in envs.GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
+                        )
+                        if valued_alignment:
+                            self._backend_visible_devices_values_alignment[ben] = (
+                                dev_indexes_alignment
+                            )
 
         if self._visible_devices_env:
             return
@@ -1406,17 +1429,21 @@ class Deployer(ABC):
         self._visible_devices_env["UNKNOWN_RUNTIME_VISIBLE_DEVICES"] = []
         self._visible_devices_values["UNKNOWN_RUNTIME_VISIBLE_DEVICES"] = ["all"]
 
-    def
+    def get_visible_devices_values(
         self,
-    ) -> (dict[str, list[str]], dict[str, list[str]]):
+    ) -> (dict[str, list[str]], dict[str, str], dict[str, list[str]]):
         """
-        Return the visible devices environment variables and values mappings.
+        Return the visible devices environment variables, cdis and values mappings.
         For example:
         (
             {
                 "NVIDIA_VISIBLE_DEVICES": ["CUDA_VISIBLE_DEVICES"],
                 "AMD_VISIBLE_DEVICES": ["HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"]
-            }
+            },
+            {
+                "NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu",
+                "AMD_VISIBLE_DEVICES": "amd.com/gpu"
+            },
             {
                 "NVIDIA_VISIBLE_DEVICES": ["0"],
                 "AMD_VISIBLE_DEVICES": ["0", "1"]
@@ -1428,11 +1455,17 @@ class Deployer(ABC):
         - The first dictionary maps runtime visible devices environment variable names
           to lists of backend visible devices environment variable names.
         - The second dictionary maps runtime visible devices environment variable names
+          to corresponding CDI keys.
+        - The last dictionary maps runtime visible devices environment variable names
           to lists of device indexes or UUIDs.
 
         """
         self._prepare()
-        return
+        return (
+            self._visible_devices_env,
+            self._visible_devices_cdis,
+            self._visible_devices_values,
+        )
 
     def get_visible_devices_affinities(
         self,
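get_visible_devices_values() now returns three mappings (backend env names, CDI keys, device values) instead of two. A self-contained sketch of how a caller can fan them out, using literal dicts that stand in for a real return value:

    # Illustrative sketch: consuming the three mappings returned by get_visible_devices_values().
    vd_env = {"NVIDIA_VISIBLE_DEVICES": ["CUDA_VISIBLE_DEVICES"]}
    vd_cdis = {"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu"}
    vd_values = {"NVIDIA_VISIBLE_DEVICES": ["0", "1"]}

    environment: dict[str, str] = {}
    cdi_device_ids: list[str] = []
    for runtime_env, backend_envs in vd_env.items():
        ids = vd_values.get(runtime_env) or ["all"]
        # Env-var injection path: the runtime env plus its backend aliases.
        for env_name in (runtime_env, *backend_envs):
            environment[env_name] = ",".join(ids)
        # CDI injection path: fully qualified device names.
        cdi_device_ids += [f"{vd_cdis[runtime_env]}={i}" for i in ids]

    print(environment)     # {'NVIDIA_VISIBLE_DEVICES': '0,1', 'CUDA_VISIBLE_DEVICES': '0,1'}
    print(cdi_device_ids)  # ['nvidia.com/gpu=0', 'nvidia.com/gpu=1']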
@@ -139,7 +139,7 @@ class DockerWorkloadPlan(WorkloadPlan):
         super().validate_and_default()
 
         # Adjust default image namespace if needed.
-        if namespace := envs.
+        if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_NAMESPACE:
             self.pause_image = replace_image_with(
                 image=self.pause_image,
                 namespace=namespace,
@@ -175,7 +175,7 @@ class DockerWorkloadStatus(WorkloadStatus):
 
     @staticmethod
     def parse_state(
-        d_containers: list[docker.models.containers],
+        d_containers: list[docker.models.containers.Container],
     ) -> WorkloadStatusStateEnum:
         """
         Parse the state of the workload based on the status of its containers.
@@ -221,7 +221,7 @@ class DockerWorkloadStatus(WorkloadStatus):
                 d_run_state = WorkloadStatusStateEnum.PENDING
             else:
                 health = cr.attrs["State"].get("Health", {})
-                if health and health.get("Status", "healthy")
+                if health and health.get("Status", "healthy") not in ["healthy", ""]:
                     return WorkloadStatusStateEnum.UNHEALTHY
 
         d_init_state = None
@@ -252,7 +252,7 @@ class DockerWorkloadStatus(WorkloadStatus):
     def __init__(
         self,
         name: WorkloadName,
-        d_containers: list[docker.models.containers],
+        d_containers: list[docker.models.containers.Container],
         **kwargs,
     ):
         created_at = d_containers[0].attrs["Created"]
@@ -330,6 +330,12 @@ class DockerDeployer(Deployer):
         if client:
             try:
                 supported = client.ping()
+                if envs.GPUSTACK_RUNTIME_LOG_EXCEPTION:
+                    version_info = client.version()
+                    logger.debug(
+                        "Connected to Docker API server: %s",
+                        version_info,
+                    )
             except docker.errors.APIError:
                 debug_log_exception(logger, "Failed to connect to Docker API server")
 
@@ -352,12 +358,13 @@ class DockerDeployer(Deployer):
             contextlib.redirect_stdout(dev_null),
             contextlib.redirect_stderr(dev_null),
         ):
-
-
-
-
-        except docker.errors.DockerException:
-
+            os_env = os.environ.copy()
+            if envs.GPUSTACK_RUNTIME_DOCKER_HOST:
+                os_env["DOCKER_HOST"] = envs.GPUSTACK_RUNTIME_DOCKER_HOST
+            client = docker.from_env(environment=os_env)
+        except docker.errors.DockerException as e:
+            if "FileNotFoundError" not in str(e):
+                debug_log_exception(logger, "Failed to get Docker client")
 
         return client
 
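The client is now built from a copy of the process environment, so GPUSTACK_RUNTIME_DOCKER_HOST can redirect docker-py to another daemon without mutating os.environ. A standalone sketch of the same pattern, assuming a hypothetical rootless socket path:

    import os

    import docker  # docker-py

    # Illustrative sketch: point docker-py at a non-default daemon endpoint.
    custom_host = "unix:///run/user/1000/docker.sock"  # assumed endpoint

    env = os.environ.copy()
    if custom_host:
        env["DOCKER_HOST"] = custom_host

    client = docker.from_env(environment=env)
    print(client.ping())  # True when the daemon is reachable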
@@ -463,12 +470,12 @@ class DockerDeployer(Deployer):
         tag = tag or "latest"
         auth_config = None
         if (
-            envs.
-            and envs.
+            envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME
+            and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD
         ):
             auth_config = {
-                "username": envs.
-                "password": envs.
+                "username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME,
+                "password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD,
             }
 
         logs = self._client.api.pull(
@@ -488,8 +495,10 @@ class DockerDeployer(Deployer):
             msg = f"Failed to pull image {image}, invalid response"
             raise OperationError(msg) from e
         except docker.errors.APIError as e:
-
-
+            if "no unqualified-search registries are defined" not in str(e):
+                msg = f"Failed to pull image {image}{_detail_api_call_error(e)}"
+                raise OperationError(msg) from e
+            return self._pull_image(f"docker.io/{image}")
 
     def _get_image(
         self,
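The pull path now recognizes the Podman-style "no unqualified-search registries are defined" failure and retries once with an explicit docker.io/ prefix. A rough sketch of that retry shape, assuming a pull(image) callable that raises docker.errors.APIError:

    import docker

    def pull_with_fallback(pull, image: str):
        # Illustrative sketch: qualify the image against docker.io when the daemon
        # refuses to search unqualified names, otherwise re-raise.
        try:
            return pull(image)
        except docker.errors.APIError as e:
            if "no unqualified-search registries are defined" not in str(e):
                raise
            return pull(f"docker.io/{image}")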
@@ -658,6 +667,14 @@ class DockerDeployer(Deployer):
             # TODO(thxCode): check if the container matches the spec
             return d_container
 
+        host_socket_path = None
+        if envs.GPUSTACK_RUNTIME_DOCKER_HOST.startswith("http+unix://"):
+            host_socket_path = envs.GPUSTACK_RUNTIME_DOCKER_HOST[len("http+unix://") :]
+        elif envs.GPUSTACK_RUNTIME_DOCKER_HOST.startswith("unix://"):
+            host_socket_path = envs.GPUSTACK_RUNTIME_DOCKER_HOST[len("unix://") :]
+        if host_socket_path and not host_socket_path.startswith("/"):
+            host_socket_path = f"/{host_socket_path}"
+
         create_options: dict[str, Any] = {
             "name": container_name,
             "restart_policy": {"Name": "always"},
@@ -669,10 +686,17 @@ class DockerDeployer(Deployer):
             "environment": [
                 f"AUTOHEAL_CONTAINER_LABEL={_LABEL_COMPONENT_HEAL_PREFIX}-{workload.name}",
             ],
-            "volumes": [
-                "/var/run/docker.sock:/var/run/docker.sock",
-            ],
         }
+        if host_socket_path:
+            create_options["volumes"] = (
+                [
+                    f"{host_socket_path}:/var/run/docker.sock",
+                ],
+            )
+        elif envs.GPUSTACK_RUNTIME_DOCKER_HOST:
+            create_options["environment"].append(
+                f"DOCKER_SOCK={envs.GPUSTACK_RUNTIME_DOCKER_HOST}",
+            )
 
         if envs.GPUSTACK_RUNTIME_DEPLOY_PRINT_CONVERSION:
             clogger.info(
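Together, the two hunks above stop hard-coding /var/run/docker.sock: the bind-mounted socket for the auto-heal helper is derived from GPUSTACK_RUNTIME_DOCKER_HOST, falling back to a DOCKER_SOCK environment variable for non-unix endpoints. A small sketch of the prefix handling, with assumed example inputs:

    def host_socket_path_from(docker_host: str) -> str | None:
        # Illustrative sketch: extract a host filesystem path from a unix-style DOCKER_HOST.
        path = None
        if docker_host.startswith("http+unix://"):
            path = docker_host[len("http+unix://"):]
        elif docker_host.startswith("unix://"):
            path = docker_host[len("unix://"):]
        if path and not path.startswith("/"):
            path = f"/{path}"
        return path

    print(host_socket_path_from("unix:///var/run/docker.sock"))  # /var/run/docker.sock
    print(host_socket_path_from("http+unix://run/podman.sock"))  # /run/podman.sock
    print(host_socket_path_from("tcp://127.0.0.1:2375"))         # None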
@@ -760,7 +784,7 @@ class DockerDeployer(Deployer):
             else:
                 continue
 
-            if m.mode
+            if m.mode != ContainerMountModeEnum.RWX:
                 binding["ReadOnly"] = True
 
             mount_binding.append(binding)
@@ -841,6 +865,7 @@ class DockerDeployer(Deployer):
         workload: DockerWorkloadPlan,
         ephemeral_filename_mapping: dict[tuple[int, str] : str],
         ephemeral_volume_name_mapping: dict[str, str],
+        pause_container: docker.models.containers.Container,
     ) -> (
         list[docker.models.containers.Container],
         list[docker.models.containers.Container],
@@ -860,7 +885,7 @@ class DockerDeployer(Deployer):
         d_init_containers: list[docker.models.containers.Container] = []
         d_run_containers: list[docker.models.containers.Container] = []
 
-        pause_container_namespace = f"container:{
+        pause_container_namespace = f"container:{pause_container.id}"
         for ci, c in enumerate(workload.containers):
             container_name = f"{workload.name}-{c.profile.lower()}-{ci}"
             try:
@@ -942,9 +967,14 @@ class DockerDeployer(Deployer):
 
             # Parameterize resources.
             if c.resources:
+                cdi = (
+                    envs.GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY.lower()
+                    == "cdi"
+                )
+
                 r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
                 r_k_backend_env = workload.resource_key_backend_env_mapping or {}
-                vd_env, vd_values = self.
+                vd_env, vd_cdis, vd_values = self.get_visible_devices_values()
                 for r_k, r_v in c.resources.items():
                     match r_k:
                         case "cpu":
@@ -994,24 +1024,59 @@ class DockerDeployer(Deployer):
                                 # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
                                 # and mount corresponding libs if needed.
                                 for re in runtime_env:
-                                    #
-
-
-
-
+                                    # Request device via CDI.
+                                    if cdi:
+                                        rv = [
+                                            f"{vd_cdis[re]}={v}"
+                                            for v in (vd_values.get(re) or ["all"])
+                                        ]
+                                        if "device_requests" not in create_options:
+                                            create_options["device_requests"] = []
+                                        create_options["device_requests"].append(
+                                            docker.types.DeviceRequest(
+                                                driver="cdi",
+                                                count=0,
+                                                device_ids=rv,
+                                            ),
+                                        )
+                                        continue
+                                    # Request device via visible devices env.
+                                    rv = ",".join(vd_values.get(re) or ["all"])
+                                    create_options["environment"][re] = rv
                             else:
                                 # Set env to the allocated device IDs if no privileged,
                                 # otherwise, set container backend visible devices env to all devices,
                                 # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
                                 # and mount corresponding libs if needed.
                                 for re in runtime_env:
-                                    #
-
-
-
-
-
-
+                                    # Request device via CDI.
+                                    if cdi:
+                                        if not privileged:
+                                            rv = [
+                                                f"{vd_cdis[re]}={v.strip()}"
+                                                for v in r_v.split(",")
+                                            ]
+                                        else:
+                                            rv = [
+                                                f"{vd_cdis[re]}={v}"
+                                                for v in (vd_values.get(re) or ["all"])
+                                            ]
+                                        if "device_requests" not in create_options:
+                                            create_options["device_requests"] = []
+                                        create_options["device_requests"].append(
+                                            docker.types.DeviceRequest(
+                                                driver="cdi",
+                                                count=0,
+                                                device_ids=rv,
+                                            ),
+                                        )
+                                        continue
+                                    # Request device via visible devices env.
+                                    if not privileged:
+                                        rv = str(r_v)
+                                    else:
+                                        rv = ",".join(vd_values.get(re) or ["all"])
+                                    create_options["environment"][re] = rv
 
                             # Configure runtime device access environment variables.
                             if r_v != "all" and privileged:
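When GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY is "cdi", devices are requested as fully qualified CDI names (vendor.com/class=id) through Docker device requests instead of visible-devices environment variables. A minimal standalone sketch of such a request, assuming docker-py, a CDI-enabled daemon, and a hypothetical nvidia.com/gpu=0 device:

    import docker

    client = docker.from_env()

    # Illustrative sketch: ask the daemon to inject a CDI device into a container.
    cdi_request = docker.types.DeviceRequest(
        driver="cdi",
        count=0,  # count=0 with explicit device_ids, mirroring the hunk above
        device_ids=["nvidia.com/gpu=0"],
    )

    container = client.containers.create(
        image="ubuntu:24.04",
        command="nvidia-smi -L",
        device_requests=[cdi_request],
    )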
@@ -1204,7 +1269,7 @@ class DockerDeployer(Deployer):
             # Always filter out Docker Socket mount.
             m
             for m in (self_container.attrs["Mounts"] or [])
-            if m.get("Destination")
+            if not m.get("Destination").endswith("/docker.sock")
         ]
         if igs := envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES:
             mirrored_mounts = [
@@ -1491,6 +1556,7 @@ class DockerDeployer(Deployer):
             workload,
             ephemeral_filename_mapping,
             ephemeral_volume_name_mapping,
+            pause_container,
         )
 
         # Create unhealthy restart container if needed.
@@ -1593,10 +1659,18 @@ class DockerDeployer(Deployer):
         # Remove all containers with the workload label.
         try:
             d_containers = getattr(workload, "_d_containers", [])
+            # Remove non-pause containers first.
             for c in d_containers:
-                c.
-
-
+                if "-pause" not in c.name:
+                    c.remove(
+                        force=True,
+                    )
+            # Then remove pause containers.
+            for c in d_containers:
+                if "-pause" in c.name:
+                    c.remove(
+                        force=True,
+                    )
         except docker.errors.APIError as e:
             msg = f"Failed to delete containers for workload {name}{_detail_api_call_error(e)}"
             raise OperationError(msg) from e
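Deletion is now two-pass: workload containers go first, and the pause container that holds the shared namespaces (see the container:<pause id> mode earlier in this diff) goes last. A rough sketch of that ordering, assuming objects with .name and .remove():

    def remove_workload_containers(containers):
        # Illustrative sketch: tear down dependents before the '-pause' namespace holder.
        for c in containers:
            if "-pause" not in c.name:
                c.remove(force=True)
        for c in containers:
            if "-pause" in c.name:
                c.remove(force=True)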
@@ -1860,7 +1934,7 @@ class DockerDeployer(Deployer):
         }
 
         try:
-
+            _, output = container.exec_run(
                 detach=False,
                 **exec_options,
             )
@@ -1869,8 +1943,8 @@ class DockerDeployer(Deployer):
             raise OperationError(msg) from e
         else:
             if not attach:
-                return
-            return DockerWorkloadExecStream(
+                return output
+            return DockerWorkloadExecStream(output)
 
 
     def _has_restart_policy(
@@ -321,14 +321,18 @@ class KubernetesDeployer(Deployer):
                 version_api = kubernetes.client.VersionApi(client)
                 version_info = version_api.get_code()
                 supported = version_info is not None
-
-
-
-
+                if envs.GPUSTACK_RUNTIME_LOG_EXCEPTION:
+                    logger.debug(
+                        "Connected to Kubernetes API server: %s",
+                        version_info,
+                    )
+            except kubernetes.client.exceptions.ApiException:
                 debug_log_exception(
                     logger,
                     "Failed to connect to Kubernetes API server",
                 )
+            except urllib3.exceptions.MaxRetryError:
+                pass
 
         return supported
 
@@ -985,7 +989,7 @@ class KubernetesDeployer(Deployer):
             resources: dict[str, str] = {}
             r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
             r_k_backend_env = workload.resource_key_backend_env_mapping or {}
-            vd_env, vd_values = self.
+            vd_env, _, vd_values = self.get_visible_devices_values()
             for r_k, r_v in c.resources.items():
                 if r_k in ("cpu", "memory"):
                     resources[r_k] = str(r_v)
@@ -1024,12 +1028,12 @@ class KubernetesDeployer(Deployer):
                         # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
                         # and mount corresponding libs if needed.
                         for re in runtime_env:
-                            #
-
+                            # Request device via visible devices env.
+                            rv = ",".join(vd_values.get(re) or ["all"])
                             container.env.append(
                                 kubernetes.client.V1EnvVar(
                                     name=re,
-                                    value=
+                                    value=rv,
                                 ),
                             )
                     else:
@@ -1038,18 +1042,15 @@ class KubernetesDeployer(Deployer):
                         # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
                         # and mount corresponding libs if needed.
                         for re in runtime_env:
-                            #
-
+                            # Request device via visible devices env.
+                            if not privileged:
+                                rv = str(r_v)
+                            else:
+                                rv = ",".join(vd_values.get(re) or ["all"])
                             container.env.append(
                                 kubernetes.client.V1EnvVar(
                                     name=re,
-                                    value=
-                                    str(r_v)
-                                    if not privileged
-                                    else (
-                                        ",".join(vd_values.get(re, [])) or "all"
-                                    )
-                                    ),
+                                    value=rv,
                                 ),
                             )
 
@@ -1206,16 +1207,16 @@ class KubernetesDeployer(Deployer):
 
         # Create image pull secrets if default registry credentials are set.
         if not self._image_pull_secret and (
-            envs.
-            and envs.
+            envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME
+            and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD
         ):
             registry = (
-                envs.
+                envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY or "index.docker.io"
            )
             self._image_pull_secret = self._apply_image_pull_secret(
                 registry=f"https://{registry}/v1/",
-                username=envs.
-                password=envs.
+                username=envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME,
+                password=envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD,
             )
 
         # Prepare mirrored deployment if enabled.