gpustack-runtime 0.1.39__py3-none-any.whl → 0.1.39.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/__main__.py +6 -2
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/__init__.py +6 -2
- gpustack_runtime/cmds/deployer.py +187 -288
- gpustack_runtime/deployer/__init__.py +197 -0
- gpustack_runtime/deployer/__types__.py +382 -17
- gpustack_runtime/deployer/__utils__.py +34 -0
- gpustack_runtime/deployer/docker.py +312 -167
- gpustack_runtime/deployer/kuberentes.py +288 -45
- gpustack_runtime/deployer/podman.py +322 -167
- gpustack_runtime/detector/amd.py +1 -1
- gpustack_runtime/detector/hygon.py +1 -1
- gpustack_runtime/detector/pyhsa/__init__.py +7 -7
- gpustack_runtime/envs.py +29 -42
- {gpustack_runtime-0.1.39.dist-info → gpustack_runtime-0.1.39.post2.dist-info}/METADATA +2 -2
- {gpustack_runtime-0.1.39.dist-info → gpustack_runtime-0.1.39.post2.dist-info}/RECORD +20 -20
- {gpustack_runtime-0.1.39.dist-info → gpustack_runtime-0.1.39.post2.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.39.dist-info → gpustack_runtime-0.1.39.post2.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.39.dist-info → gpustack_runtime-0.1.39.post2.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import contextlib
|
|
4
|
+
import io
|
|
4
5
|
import json
|
|
5
6
|
import logging
|
|
6
7
|
import operator
|
|
7
8
|
import os
|
|
8
9
|
import socket
|
|
9
10
|
import sys
|
|
11
|
+
import tarfile
|
|
10
12
|
from dataclasses import dataclass, field
|
|
11
13
|
from functools import lru_cache, reduce
|
|
12
14
|
from math import ceil
|
|
@@ -26,7 +28,7 @@ from podman.domain.containers_create import CreateMixin
|
|
|
26
28
|
from tqdm import tqdm
|
|
27
29
|
|
|
28
30
|
from .. import envs
|
|
29
|
-
from ..logging import debug_log_exception
|
|
31
|
+
from ..logging import debug_log_exception, debug_log_warning
|
|
30
32
|
from .__patches__ import patch_render_payload
|
|
31
33
|
from .__types__ import (
|
|
32
34
|
Container,
|
|
@@ -35,7 +37,7 @@ from .__types__ import (
|
|
|
35
37
|
ContainerMountModeEnum,
|
|
36
38
|
ContainerProfileEnum,
|
|
37
39
|
ContainerRestartPolicyEnum,
|
|
38
|
-
|
|
40
|
+
EndoscopicDeployer,
|
|
39
41
|
OperationError,
|
|
40
42
|
UnsupportedError,
|
|
41
43
|
WorkloadExecStream,
|
|
@@ -47,7 +49,13 @@ from .__types__ import (
|
|
|
47
49
|
WorkloadStatusOperation,
|
|
48
50
|
WorkloadStatusStateEnum,
|
|
49
51
|
)
|
|
50
|
-
from .__utils__ import
|
|
52
|
+
from .__utils__ import (
|
|
53
|
+
_MiB,
|
|
54
|
+
bytes_to_human_readable,
|
|
55
|
+
replace_image_with,
|
|
56
|
+
safe_json,
|
|
57
|
+
sensitive_env_var,
|
|
58
|
+
)
|
|
51
59
|
|
|
52
60
|
if TYPE_CHECKING:
|
|
53
61
|
from collections.abc import Callable, Generator
|
|
@@ -142,7 +150,7 @@ class PodmanWorkloadPlan(WorkloadPlan):
|
|
|
142
150
|
super().validate_and_default()
|
|
143
151
|
|
|
144
152
|
# Adjust default image namespace if needed.
|
|
145
|
-
if namespace := envs.
|
|
153
|
+
if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_NAMESPACE:
|
|
146
154
|
self.pause_image = replace_image_with(
|
|
147
155
|
image=self.pause_image,
|
|
148
156
|
namespace=namespace,
|
|
@@ -297,7 +305,7 @@ Name of the Podman deployer.
|
|
|
297
305
|
"""
|
|
298
306
|
|
|
299
307
|
|
|
300
|
-
class PodmanDeployer(
|
|
308
|
+
class PodmanDeployer(EndoscopicDeployer):
|
|
301
309
|
"""
|
|
302
310
|
Deployer implementation for Podman containers.
|
|
303
311
|
"""
|
|
@@ -306,10 +314,6 @@ class PodmanDeployer(Deployer):
|
|
|
306
314
|
"""
|
|
307
315
|
Client for interacting with the Podman daemon.
|
|
308
316
|
"""
|
|
309
|
-
_container_ephemeral_files_dir: Path | None = None
|
|
310
|
-
"""
|
|
311
|
-
Directory for ephemeral files inside containers, internal use only.
|
|
312
|
-
"""
|
|
313
317
|
_mutate_create_options: Callable[[dict[str, Any]], dict[str, Any]] | None = None
|
|
314
318
|
"""
|
|
315
319
|
Function to handle mirrored deployment, internal use only.
|
|
@@ -386,48 +390,6 @@ class PodmanDeployer(Deployer):
|
|
|
386
390
|
|
|
387
391
|
return wrapper
|
|
388
392
|
|
|
389
|
-
@staticmethod
|
|
390
|
-
def _create_ephemeral_files(
|
|
391
|
-
workload: PodmanWorkloadPlan,
|
|
392
|
-
) -> dict[tuple[int, str], str]:
|
|
393
|
-
"""
|
|
394
|
-
Create ephemeral files as local file for the workload.
|
|
395
|
-
|
|
396
|
-
Returns:
|
|
397
|
-
A mapping from (container index, configured path) to actual filename.
|
|
398
|
-
|
|
399
|
-
Raises:
|
|
400
|
-
OperationError:
|
|
401
|
-
If the ephemeral files fail to create.
|
|
402
|
-
|
|
403
|
-
"""
|
|
404
|
-
# Map (container index, configured path) to actual filename.
|
|
405
|
-
ephemeral_filename_mapping: dict[tuple[int, str], str] = {}
|
|
406
|
-
ephemeral_files: list[tuple[str, str, int]] = []
|
|
407
|
-
for ci, c in enumerate(workload.containers):
|
|
408
|
-
for fi, f in enumerate(c.files or []):
|
|
409
|
-
if f.content is not None:
|
|
410
|
-
fn = f"{workload.name}-{ci}-{fi}"
|
|
411
|
-
ephemeral_filename_mapping[(ci, f.path)] = fn
|
|
412
|
-
ephemeral_files.append((fn, f.content, f.mode))
|
|
413
|
-
if not ephemeral_filename_mapping:
|
|
414
|
-
return ephemeral_filename_mapping
|
|
415
|
-
|
|
416
|
-
# Create ephemeral files directory if not exists.
|
|
417
|
-
try:
|
|
418
|
-
for fn, fc, fm in ephemeral_files:
|
|
419
|
-
fp = envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR.joinpath(fn)
|
|
420
|
-
with fp.open("w", encoding="utf-8") as f:
|
|
421
|
-
f.write(fc)
|
|
422
|
-
f.flush()
|
|
423
|
-
fp.chmod(fm)
|
|
424
|
-
logger.debug("Created local file %s with mode %s", fp, oct(fm))
|
|
425
|
-
except OSError as e:
|
|
426
|
-
msg = "Failed to create ephemeral files"
|
|
427
|
-
raise OperationError(msg) from e
|
|
428
|
-
|
|
429
|
-
return ephemeral_filename_mapping
|
|
430
|
-
|
|
431
393
|
def _create_ephemeral_volumes(self, workload: PodmanWorkloadPlan) -> dict[str, str]:
|
|
432
394
|
"""
|
|
433
395
|
Create ephemeral volumes for the workload.
|
|
@@ -473,12 +435,12 @@ class PodmanDeployer(Deployer):
|
|
|
473
435
|
tag = tag or "latest"
|
|
474
436
|
auth_config = None
|
|
475
437
|
if (
|
|
476
|
-
envs.
|
|
477
|
-
and envs.
|
|
438
|
+
envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_USERNAME
|
|
439
|
+
and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_PASSWORD
|
|
478
440
|
):
|
|
479
441
|
auth_config = {
|
|
480
|
-
"username": envs.
|
|
481
|
-
"password": envs.
|
|
442
|
+
"username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_USERNAME,
|
|
443
|
+
"password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_PASSWORD,
|
|
482
444
|
}
|
|
483
445
|
|
|
484
446
|
logs = self._client.api.pull(
|
|
@@ -715,12 +677,10 @@ class PodmanDeployer(Deployer):
|
|
|
715
677
|
else:
|
|
716
678
|
return d_container
|
|
717
679
|
|
|
680
|
+
@staticmethod
|
|
718
681
|
def _append_container_mounts(
|
|
719
|
-
self,
|
|
720
682
|
create_options: dict[str, Any],
|
|
721
683
|
c: Container,
|
|
722
|
-
ci: int,
|
|
723
|
-
ephemeral_filename_mapping: dict[tuple[int, str] : str],
|
|
724
684
|
ephemeral_volume_name_mapping: dict[str, str],
|
|
725
685
|
):
|
|
726
686
|
"""
|
|
@@ -736,17 +696,7 @@ class PodmanDeployer(Deployer):
|
|
|
736
696
|
"target": "",
|
|
737
697
|
}
|
|
738
698
|
|
|
739
|
-
if f.content is
|
|
740
|
-
# Ephemeral file, use from local ephemeral files directory.
|
|
741
|
-
if (ci, f.path) not in ephemeral_filename_mapping:
|
|
742
|
-
continue
|
|
743
|
-
fn = ephemeral_filename_mapping[(ci, f.path)]
|
|
744
|
-
path = str(
|
|
745
|
-
self._container_ephemeral_files_dir.joinpath(fn),
|
|
746
|
-
)
|
|
747
|
-
binding["source"] = path
|
|
748
|
-
binding["target"] = f"/{f.path.lstrip('/')}"
|
|
749
|
-
elif f.path:
|
|
699
|
+
if f.content is None and f.path:
|
|
750
700
|
# Host file, bind directly.
|
|
751
701
|
binding["source"] = f.path
|
|
752
702
|
binding["target"] = f.path
|
|
@@ -858,10 +808,39 @@ class PodmanDeployer(Deployer):
|
|
|
858
808
|
|
|
859
809
|
return healthcheck
|
|
860
810
|
|
|
811
|
+
@staticmethod
def _upload_ephemeral_files(
    c: Container,
    container: podman.domain.containers.Container,
):
    """
    Upload the inline files of a container plan into a created container.

    Every file of `c` that carries both inline content and a path is packed
    into an in-memory tar archive, which is then sent to the container with
    `put_archive` at path "/". Files without content or without a path are
    not part of the archive.

    Args:
        c:
            The container plan whose `files` may carry inline content.
        container:
            The created Podman container that receives the archive.

    Raises:
        OperationError:
            If `put_archive` reports a failure.

    """
    if not c.files:
        return

    f_tar = io.BytesIO()
    packed = 0
    with tarfile.open(fileobj=f_tar, mode="w") as tar:
        for f in c.files:
            if f.content is None or not f.path:
                continue
            fc_bytes = f.content.encode("utf-8")
            # Member names are relative because the archive is unpacked at "/".
            info = tarfile.TarInfo(name=f.path.lstrip("/"))
            info.size = len(fc_bytes)
            info.mode = f.mode
            tar.addfile(tarinfo=info, fileobj=io.BytesIO(fc_bytes))
            packed += 1

    # Closing a tar archive always appends end-of-archive padding, so the
    # buffer is never empty even when nothing was added. Count the packed
    # members instead of measuring the buffer to skip a pointless upload.
    if not packed:
        return

    uploaded = container.put_archive(
        path="/",
        data=f_tar.getvalue(),
    )
    if not uploaded:
        msg = f"Failed to upload ephemeral files to container {container.name}"
        raise OperationError(msg)
|
|
840
|
+
|
|
861
841
|
def _create_containers(
|
|
862
842
|
self,
|
|
863
843
|
workload: PodmanWorkloadPlan,
|
|
864
|
-
ephemeral_filename_mapping: dict[tuple[int, str] : str],
|
|
865
844
|
ephemeral_volume_name_mapping: dict[str, str],
|
|
866
845
|
pause_container: podman.domain.containers.Container,
|
|
867
846
|
) -> (
|
|
@@ -1077,8 +1056,6 @@ class PodmanDeployer(Deployer):
|
|
|
1077
1056
|
self._append_container_mounts(
|
|
1078
1057
|
create_options,
|
|
1079
1058
|
c,
|
|
1080
|
-
ci,
|
|
1081
|
-
ephemeral_filename_mapping,
|
|
1082
1059
|
ephemeral_volume_name_mapping,
|
|
1083
1060
|
)
|
|
1084
1061
|
|
|
@@ -1125,6 +1102,10 @@ class PodmanDeployer(Deployer):
|
|
|
1125
1102
|
detach=detach,
|
|
1126
1103
|
**create_options,
|
|
1127
1104
|
)
|
|
1105
|
+
|
|
1106
|
+
# Upload ephemeral files into the container.
|
|
1107
|
+
self._upload_ephemeral_files(c, d_container)
|
|
1108
|
+
|
|
1128
1109
|
except podman.errors.APIError as e:
|
|
1129
1110
|
msg = f"Failed to create container {container_name}{_detail_api_call_error(e)}"
|
|
1130
1111
|
raise OperationError(msg) from e
|
|
@@ -1174,43 +1155,30 @@ class PodmanDeployer(Deployer):
|
|
|
1174
1155
|
def __init__(self):
|
|
1175
1156
|
super().__init__(_NAME)
|
|
1176
1157
|
self._client = self._get_client()
|
|
1177
|
-
self._container_ephemeral_files_dir = (
|
|
1178
|
-
envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR
|
|
1179
|
-
)
|
|
1180
1158
|
|
|
1181
|
-
def
|
|
1159
|
+
def _prepare_mirrored_deployment(self):
|
|
1182
1160
|
"""
|
|
1183
|
-
Prepare for
|
|
1161
|
+
Prepare for mirrored deployment.
|
|
1184
1162
|
|
|
1185
1163
|
"""
|
|
1186
1164
|
# Prepare mirrored deployment if enabled.
|
|
1187
1165
|
if self._mutate_create_options:
|
|
1188
1166
|
return
|
|
1189
1167
|
self._mutate_create_options = lambda o: o
|
|
1190
|
-
if not envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
|
|
1191
|
-
logger.debug("Mirrored deployment disabled")
|
|
1192
|
-
return
|
|
1193
1168
|
|
|
1194
1169
|
# Retrieve self-container info.
|
|
1195
|
-
## - Get Container name, default to hostname if not set.
|
|
1196
|
-
self_container_id = envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
|
|
1197
|
-
if not self_container_id:
|
|
1198
|
-
self_container_id = socket.gethostname()
|
|
1199
|
-
logger.warning(
|
|
1200
|
-
"Mirrored deployment enabled, but no Container name set, using hostname(%s) instead",
|
|
1201
|
-
self_container_id,
|
|
1202
|
-
)
|
|
1203
1170
|
try:
|
|
1204
|
-
self_container = self._find_self_container(
|
|
1171
|
+
self_container = self._find_self_container()
|
|
1172
|
+
if not self_container:
|
|
1173
|
+
return
|
|
1205
1174
|
logger.info(
|
|
1206
1175
|
"Mirrored deployment enabled, using self Container %s for options mirroring",
|
|
1207
|
-
self_container.
|
|
1176
|
+
self_container.short_id,
|
|
1208
1177
|
)
|
|
1209
1178
|
self_image = self_container.image
|
|
1210
1179
|
except podman.errors.APIError:
|
|
1211
1180
|
logger.exception(
|
|
1212
|
-
"Mirrored deployment enabled, but failed to get self Container
|
|
1213
|
-
self_container_id,
|
|
1181
|
+
"Mirrored deployment enabled, but failed to get self Container, skipping",
|
|
1214
1182
|
)
|
|
1215
1183
|
return
|
|
1216
1184
|
|
|
@@ -1370,36 +1338,10 @@ class PodmanDeployer(Deployer):
|
|
|
1370
1338
|
|
|
1371
1339
|
self._mutate_create_options = mutate_create_options
|
|
1372
1340
|
|
|
1373
|
-
|
|
1374
|
-
if mirrored_mounts:
|
|
1375
|
-
e_target = str(envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR)
|
|
1376
|
-
b_source = ""
|
|
1377
|
-
b_target = ""
|
|
1378
|
-
for m in mirrored_mounts:
|
|
1379
|
-
c_target = m.get("Destination", "///")
|
|
1380
|
-
if (
|
|
1381
|
-
e_target == c_target or e_target.startswith(f"{c_target}/")
|
|
1382
|
-
) and len(c_target) >= len(b_target):
|
|
1383
|
-
b_source = m.get("Source")
|
|
1384
|
-
b_target = c_target
|
|
1385
|
-
if b_source:
|
|
1386
|
-
result = Path(b_source)
|
|
1387
|
-
if e_target != b_target:
|
|
1388
|
-
b_subpath = e_target.removeprefix(b_target)
|
|
1389
|
-
result = result.joinpath(b_subpath.lstrip("/"))
|
|
1390
|
-
self._container_ephemeral_files_dir = result
|
|
1391
|
-
|
|
1392
|
-
def _find_self_container(
|
|
1393
|
-
self,
|
|
1394
|
-
self_container_id: str,
|
|
1395
|
-
) -> podman.domain.containers.Container:
|
|
1341
|
+
def _find_self_container(self) -> podman.domain.containers.Container | None:
|
|
1396
1342
|
"""
|
|
1397
1343
|
Find the current container if running inside a Podman container.
|
|
1398
1344
|
|
|
1399
|
-
Args:
|
|
1400
|
-
self_container_id:
|
|
1401
|
-
The container name or ID to find.
|
|
1402
|
-
|
|
1403
1345
|
Returns:
|
|
1404
1346
|
The Podman container if found, None otherwise.
|
|
1405
1347
|
|
|
@@ -1407,38 +1349,54 @@ class PodmanDeployer(Deployer):
|
|
|
1407
1349
|
If failed to find itself.
|
|
1408
1350
|
|
|
1409
1351
|
"""
|
|
1410
|
-
if envs.
|
|
1411
|
-
|
|
1412
|
-
return
|
|
1413
|
-
|
|
1414
|
-
# Find containers that matches the hostname.
|
|
1415
|
-
containers: list[podman.domain.containers.Container] = []
|
|
1416
|
-
for c in self._client.containers.list(compatible=True):
|
|
1417
|
-
# Ignore workload containers with host network enabled.
|
|
1418
|
-
if _LABEL_WORKLOAD in c.labels:
|
|
1419
|
-
continue
|
|
1420
|
-
# Ignore containers that do not match the hostname.
|
|
1421
|
-
if c.attrs["Config"].get("Hostname", "") != self_container_id:
|
|
1422
|
-
continue
|
|
1423
|
-
# Ignore containers that do not match the filter labels.
|
|
1424
|
-
if envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS and any(
|
|
1425
|
-
c.labels.get(k) != v
|
|
1426
|
-
for k, v in envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS.items()
|
|
1427
|
-
):
|
|
1428
|
-
continue
|
|
1429
|
-
containers.append(c)
|
|
1352
|
+
if not envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
|
|
1353
|
+
logger.debug("Mirrored deployment disabled")
|
|
1354
|
+
return None
|
|
1430
1355
|
|
|
1431
|
-
#
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
"
|
|
1356
|
+
# Get container ID or hostname.
|
|
1357
|
+
self_container_id = envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
|
|
1358
|
+
if not self_container_id:
|
|
1359
|
+
self_container_id = socket.gethostname()
|
|
1360
|
+
debug_log_warning(
|
|
1361
|
+
logger,
|
|
1362
|
+
"Mirrored deployment enabled, but no Container name set, using hostname(%s) instead",
|
|
1363
|
+
self_container_id,
|
|
1438
1364
|
)
|
|
1439
|
-
raise podman.errors.NotFound(msg)
|
|
1440
1365
|
|
|
1441
|
-
|
|
1366
|
+
if envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME:
|
|
1367
|
+
# Directly get container.
|
|
1368
|
+
self_container = self._client.containers.get(self_container_id)
|
|
1369
|
+
else:
|
|
1370
|
+
# Find containers that matches the hostname.
|
|
1371
|
+
containers: list[podman.domain.containers.Container] = []
|
|
1372
|
+
for c in self._client.containers.list(compatible=True):
|
|
1373
|
+
# Ignore workload containers with host network enabled.
|
|
1374
|
+
if _LABEL_WORKLOAD in c.labels:
|
|
1375
|
+
continue
|
|
1376
|
+
# Ignore containers that do not match the hostname.
|
|
1377
|
+
if c.attrs["Config"].get("Hostname", "") != self_container_id:
|
|
1378
|
+
continue
|
|
1379
|
+
# Ignore containers that do not match the filter labels.
|
|
1380
|
+
if envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS and any(
|
|
1381
|
+
c.labels.get(k) != v
|
|
1382
|
+
for k, v in envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS.items()
|
|
1383
|
+
):
|
|
1384
|
+
continue
|
|
1385
|
+
containers.append(c)
|
|
1386
|
+
|
|
1387
|
+
# Validate found containers.
|
|
1388
|
+
if len(containers) != 1:
|
|
1389
|
+
msg = (
|
|
1390
|
+
f"Found multiple Containers with the same hostname {self_container_id}, "
|
|
1391
|
+
if len(containers) > 1
|
|
1392
|
+
else f"Not found Container with hostname {self_container_id}, "
|
|
1393
|
+
"please use `--env GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=...` to specify the exact Container name"
|
|
1394
|
+
)
|
|
1395
|
+
raise podman.errors.NotFound(msg)
|
|
1396
|
+
|
|
1397
|
+
self_container = containers[0]
|
|
1398
|
+
|
|
1399
|
+
return self_container
|
|
1442
1400
|
|
|
1443
1401
|
@_supported
|
|
1444
1402
|
def _create(self, workload: WorkloadPlan):
|
|
@@ -1464,7 +1422,7 @@ class PodmanDeployer(Deployer):
|
|
|
1464
1422
|
msg = f"Invalid workload type: {type(workload)}"
|
|
1465
1423
|
raise TypeError(msg)
|
|
1466
1424
|
|
|
1467
|
-
self.
|
|
1425
|
+
self._prepare_mirrored_deployment()
|
|
1468
1426
|
|
|
1469
1427
|
if isinstance(workload, WorkloadPlan):
|
|
1470
1428
|
workload = PodmanWorkloadPlan(**workload.__dict__)
|
|
@@ -1472,12 +1430,6 @@ class PodmanDeployer(Deployer):
|
|
|
1472
1430
|
if logger.isEnabledFor(logging.DEBUG):
|
|
1473
1431
|
logger.debug("Creating workload:\n%s", workload.to_yaml())
|
|
1474
1432
|
|
|
1475
|
-
# Create ephemeral file if needed,
|
|
1476
|
-
# (container index, configured path): <actual filename>
|
|
1477
|
-
ephemeral_filename_mapping: dict[tuple[int, str] : str] = (
|
|
1478
|
-
self._create_ephemeral_files(workload)
|
|
1479
|
-
)
|
|
1480
|
-
|
|
1481
1433
|
# Create ephemeral volumes if needed,
|
|
1482
1434
|
# <configured volume name>: <actual volume name>
|
|
1483
1435
|
ephemeral_volume_name_mapping: dict[str, str] = self._create_ephemeral_volumes(
|
|
@@ -1490,7 +1442,6 @@ class PodmanDeployer(Deployer):
|
|
|
1490
1442
|
# Create init/run containers.
|
|
1491
1443
|
init_containers, run_containers = self._create_containers(
|
|
1492
1444
|
workload,
|
|
1493
|
-
ephemeral_filename_mapping,
|
|
1494
1445
|
ephemeral_volume_name_mapping,
|
|
1495
1446
|
pause_container,
|
|
1496
1447
|
)
|
|
@@ -1631,17 +1582,6 @@ class PodmanDeployer(Deployer):
|
|
|
1631
1582
|
msg = f"Failed to delete volumes for workload {name}{_detail_api_call_error(e)}"
|
|
1632
1583
|
raise OperationError(msg) from e
|
|
1633
1584
|
|
|
1634
|
-
# Remove all ephemeral files for the workload.
|
|
1635
|
-
try:
|
|
1636
|
-
for fp in envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR.glob(
|
|
1637
|
-
f"{name}-*",
|
|
1638
|
-
):
|
|
1639
|
-
if fp.is_file():
|
|
1640
|
-
fp.unlink(missing_ok=True)
|
|
1641
|
-
except OSError as e:
|
|
1642
|
-
msg = f"Failed to delete ephemeral files for workload {name}"
|
|
1643
|
-
raise OperationError(msg) from e
|
|
1644
|
-
|
|
1645
1585
|
return workload
|
|
1646
1586
|
|
|
1647
1587
|
@_supported
|
|
@@ -1796,6 +1736,11 @@ class PodmanDeployer(Deployer):
|
|
|
1796
1736
|
msg = f"Failed to fetch logs for container {container.name} of workload {name}{_detail_api_call_error(e)}"
|
|
1797
1737
|
raise OperationError(msg) from e
|
|
1798
1738
|
else:
|
|
1739
|
+
if not follow:
|
|
1740
|
+
result = bytearray()
|
|
1741
|
+
for chunk in output:
|
|
1742
|
+
result.extend(chunk)
|
|
1743
|
+
return result.decode("utf-8")
|
|
1799
1744
|
return output
|
|
1800
1745
|
|
|
1801
1746
|
@_supported
|
|
@@ -1880,6 +1825,216 @@ class PodmanDeployer(Deployer):
|
|
|
1880
1825
|
return output
|
|
1881
1826
|
return PodmanWorkloadExecStream(output)
|
|
1882
1827
|
|
|
1828
|
+
@_supported
def _inspect(
    self,
    name: WorkloadName,
    namespace: WorkloadNamespace | None = None,
) -> str | None:
    """
    Inspect a Podman workload.

    Collects the `attrs` of every container found on the workload's
    `_d_containers` attribute, masks the values of environment variables
    that `sensitive_env_var` flags, and serializes the list with `safe_json`.

    Args:
        name:
            The name of the workload.
        namespace:
            The namespace of the workload.

    Returns:
        The inspection result as a JSON string.
        None if the workload is not found or has no containers attached.

    Raises:
        UnsupportedError:
            If Podman is not supported in the current environment.
        OperationError:
            If the Podman workload fails to inspect.

    """
    workload = self._get(name=name, namespace=namespace)
    if not workload:
        return None

    d_containers = getattr(workload, "_d_containers", [])
    if not d_containers:
        return None

    result = []
    for c in d_containers:
        c_attrs = c.attrs
        # Mask sensitive environment variables, in place.
        c_envs = c_attrs["Config"].get("Env") or []
        for i, env in enumerate(c_envs):
            # Use partition rather than a two-name unpack of split(): an entry
            # without "=" must not raise, and it has no value to mask.
            env_name, sep, _ = env.partition("=")
            if sep and sensitive_env_var(env_name):
                c_envs[i] = f"{env_name}=******"
        result.append(c_attrs)
    return safe_json(result, indent=2)
|
|
1872
|
+
|
|
1873
|
+
def _find_self_container_for_endoscopy(self) -> podman.domain.containers.Container:
    """
    Resolve the container this deployer runs in, for endoscopic operations.

    Delegates to `_find_self_container` and converts every failure,
    as well as an empty result, into an UnsupportedError.

    Returns:
        The self container object.

    Raises:
        UnsupportedError:
            If endoscopy is not supported in the current environment.

    """
    unsupported = "Endoscopy is not supported in the current environment: "

    try:
        found = self._find_self_container()
    except podman.errors.APIError as exc:
        raise UnsupportedError(
            unsupported
            + "Mirrored deployment enabled, but failed to get self Container",
        ) from exc
    except Exception as exc:
        raise UnsupportedError(
            unsupported + "Failed to get self Container",
        ) from exc

    if found:
        return found

    # A falsy result is reported as mirrored deployment being disabled.
    raise UnsupportedError(unsupported + "Mirrored deployment disabled")
|
|
1899
|
+
|
|
1900
|
+
def _endoscopic_logs(
    self,
    timestamps: bool = False,
    tail: int | None = None,
    since: int | None = None,
    follow: bool = False,
) -> Generator[bytes | str, None, None] | bytes | str:
    """
    Get the logs of the deployer itself.
    Only works in mirrored deployment mode.

    Args:
        timestamps:
            Show timestamps in the logs.
        tail:
            Number of lines to show from the end of the logs.
            None or a negative value is passed on as None.
        since:
            Show logs since the given epoch in seconds.
        follow:
            Whether to follow the logs.

    Returns:
        If follow is False, all log chunks joined and decoded as a UTF-8 string.
        Otherwise, the streaming output returned by the container's `logs` call.

    Raises:
        UnsupportedError:
            If endoscopy is not supported in the current environment.
        OperationError:
            If the deployer fails to get logs.

    """
    self_container = self._find_self_container_for_endoscopy()

    logs_options = {
        "timestamps": timestamps,
        # `tail` defaults to None, and comparing None with an int raises
        # TypeError, so check for None before the sign test.
        "tail": tail if tail is not None and tail >= 0 else None,
        "since": since,
        "follow": follow,
    }

    try:
        output = self_container.logs(
            stream=follow,
            **logs_options,
        )
    except podman.errors.APIError as e:
        msg = f"Failed to fetch logs for self Container {self_container.short_id}{_detail_api_call_error(e)}"
        raise OperationError(msg) from e
    else:
        if not follow:
            # Drain the chunks into one buffer and decode once.
            return b"".join(output).decode("utf-8")
        return output
|
|
1955
|
+
|
|
1956
|
+
def _endoscopic_exec(
    self,
    detach: bool = True,
    command: list[str] | None = None,
    args: list[str] | None = None,
) -> WorkloadExecStream | bytes | str:
    """
    Run a command inside the container of the deployer itself.
    Only works in mirrored deployment mode.

    Args:
        detach:
            Whether to detach from the command.
        command:
            The command to execute.
            When omitted, /bin/sh is executed and the call attaches implicitly.
        args:
            The arguments appended to the command.

    Returns:
        When attached (detach is False, or no command is given),
        a PodmanWorkloadExecStream wrapping the exec output.
        Otherwise, the output of the command as returned by `exec_run`.

    Raises:
        UnsupportedError:
            If endoscopy is not supported in the current environment.
        OperationError:
            If the deployer fails to execute the command.

    """
    self_container = self._find_self_container_for_endoscopy()

    # Attach unless the caller both asked to detach and gave a command.
    attach = not (detach and command)

    if command:
        cmd = list(command) + list(args or [])
    else:
        cmd = ["/bin/sh"]

    try:
        _, output = self_container.exec_run(
            detach=False,
            stdout=True,
            stderr=True,
            stdin=attach,
            socket=attach,
            tty=attach,
            cmd=cmd,
        )
    except podman.errors.APIError as exc:
        msg = f"Failed to exec command in self Container {self_container.short_id}{_detail_api_call_error(exc)}"
        raise OperationError(msg) from exc

    return PodmanWorkloadExecStream(output) if attach else output
|
|
2010
|
+
|
|
2011
|
+
def _endoscopic_inspect(self) -> str:
    """
    Inspect the deployer itself.
    Only works in mirrored deployment mode.

    Reads the `attrs` of the self container, masks the values of environment
    variables that `sensitive_env_var` flags, and serializes them with `safe_json`.

    Returns:
        The inspection result.

    Raises:
        UnsupportedError:
            If endoscopy is not supported in the current environment.
        OperationError:
            If the deployer fails to execute the command.

    """
    self_container = self._find_self_container_for_endoscopy()

    c_attrs = self_container.attrs
    # Mask sensitive environment variables, in place.
    c_envs = c_attrs["Config"].get("Env") or []
    for i, env in enumerate(c_envs):
        # Use partition rather than a two-name unpack of split(): an entry
        # without "=" must not raise, and it has no value to mask.
        env_name, sep, _ = env.partition("=")
        if sep and sensitive_env_var(env_name):
            c_envs[i] = f"{env_name}=******"

    return safe_json(c_attrs, indent=2)
|
|
2037
|
+
|
|
1883
2038
|
|
|
1884
2039
|
def _has_restart_policy(
|
|
1885
2040
|
container: podman.domain.containers.Container,
|
gpustack_runtime/detector/amd.py
CHANGED
|
@@ -108,7 +108,7 @@ class AMDDetector(Detector):
|
|
|
108
108
|
dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
|
|
109
109
|
else:
|
|
110
110
|
dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
|
|
111
|
-
dev_hsa_agent = hsa_agents.get(dev_uuid)
|
|
111
|
+
dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
|
|
112
112
|
|
|
113
113
|
dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
|
|
114
114
|
dev_driver_ver = dev_gpu_driver_info.get("driver_version")
|
|
@@ -108,7 +108,7 @@ class HygonDetector(Detector):
|
|
|
108
108
|
dev_index = dev_idx
|
|
109
109
|
|
|
110
110
|
dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
|
|
111
|
-
dev_hsa_agent = hsa_agents.get(dev_uuid)
|
|
111
|
+
dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
|
|
112
112
|
|
|
113
113
|
dev_name = dev_hsa_agent.name
|
|
114
114
|
if not dev_name:
|