gpustack-runtime 0.1.39.post1__py3-none-any.whl → 0.1.39.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/__main__.py +6 -0
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/__init__.py +6 -0
- gpustack_runtime/cmds/deployer.py +170 -40
- gpustack_runtime/deployer/__init__.py +197 -0
- gpustack_runtime/deployer/__types__.py +382 -17
- gpustack_runtime/deployer/__utils__.py +34 -0
- gpustack_runtime/deployer/docker.py +280 -66
- gpustack_runtime/deployer/kuberentes.py +288 -45
- gpustack_runtime/deployer/podman.py +290 -66
- gpustack_runtime/detector/__utils__.py +23 -0
- gpustack_runtime/detector/amd.py +18 -10
- gpustack_runtime/detector/hygon.py +7 -2
- gpustack_runtime/detector/iluvatar.py +10 -2
- gpustack_runtime/detector/mthreads.py +8 -12
- gpustack_runtime/detector/nvidia.py +194 -86
- gpustack_runtime/detector/pyhsa/__init__.py +7 -7
- gpustack_runtime/detector/pyrocmsmi/__init__.py +3 -9
- gpustack_runtime/envs.py +30 -18
- {gpustack_runtime-0.1.39.post1.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/METADATA +3 -2
- {gpustack_runtime-0.1.39.post1.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/RECORD +25 -26
- gpustack_runtime/detector/pymtml/__init__.py +0 -770
- {gpustack_runtime-0.1.39.post1.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.39.post1.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.39.post1.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/licenses/LICENSE +0 -0
|
@@ -28,7 +28,7 @@ from podman.domain.containers_create import CreateMixin
|
|
|
28
28
|
from tqdm import tqdm
|
|
29
29
|
|
|
30
30
|
from .. import envs
|
|
31
|
-
from ..logging import debug_log_exception
|
|
31
|
+
from ..logging import debug_log_exception, debug_log_warning
|
|
32
32
|
from .__patches__ import patch_render_payload
|
|
33
33
|
from .__types__ import (
|
|
34
34
|
Container,
|
|
@@ -37,7 +37,7 @@ from .__types__ import (
|
|
|
37
37
|
ContainerMountModeEnum,
|
|
38
38
|
ContainerProfileEnum,
|
|
39
39
|
ContainerRestartPolicyEnum,
|
|
40
|
-
|
|
40
|
+
EndoscopicDeployer,
|
|
41
41
|
OperationError,
|
|
42
42
|
UnsupportedError,
|
|
43
43
|
WorkloadExecStream,
|
|
@@ -49,7 +49,13 @@ from .__types__ import (
|
|
|
49
49
|
WorkloadStatusOperation,
|
|
50
50
|
WorkloadStatusStateEnum,
|
|
51
51
|
)
|
|
52
|
-
from .__utils__ import
|
|
52
|
+
from .__utils__ import (
|
|
53
|
+
_MiB,
|
|
54
|
+
bytes_to_human_readable,
|
|
55
|
+
replace_image_with,
|
|
56
|
+
safe_json,
|
|
57
|
+
sensitive_env_var,
|
|
58
|
+
)
|
|
53
59
|
|
|
54
60
|
if TYPE_CHECKING:
|
|
55
61
|
from collections.abc import Callable, Generator
|
|
@@ -144,7 +150,7 @@ class PodmanWorkloadPlan(WorkloadPlan):
|
|
|
144
150
|
super().validate_and_default()
|
|
145
151
|
|
|
146
152
|
# Adjust default image namespace if needed.
|
|
147
|
-
if namespace := envs.
|
|
153
|
+
if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_NAMESPACE:
|
|
148
154
|
self.pause_image = replace_image_with(
|
|
149
155
|
image=self.pause_image,
|
|
150
156
|
namespace=namespace,
|
|
@@ -299,7 +305,7 @@ Name of the Podman deployer.
|
|
|
299
305
|
"""
|
|
300
306
|
|
|
301
307
|
|
|
302
|
-
class PodmanDeployer(
|
|
308
|
+
class PodmanDeployer(EndoscopicDeployer):
|
|
303
309
|
"""
|
|
304
310
|
Deployer implementation for Podman containers.
|
|
305
311
|
"""
|
|
@@ -429,12 +435,12 @@ class PodmanDeployer(Deployer):
|
|
|
429
435
|
tag = tag or "latest"
|
|
430
436
|
auth_config = None
|
|
431
437
|
if (
|
|
432
|
-
envs.
|
|
433
|
-
and envs.
|
|
438
|
+
envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_USERNAME
|
|
439
|
+
and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_PASSWORD
|
|
434
440
|
):
|
|
435
441
|
auth_config = {
|
|
436
|
-
"username": envs.
|
|
437
|
-
"password": envs.
|
|
442
|
+
"username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_USERNAME,
|
|
443
|
+
"password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_PASSWORD,
|
|
438
444
|
}
|
|
439
445
|
|
|
440
446
|
logs = self._client.api.pull(
|
|
@@ -1150,39 +1156,29 @@ class PodmanDeployer(Deployer):
|
|
|
1150
1156
|
super().__init__(_NAME)
|
|
1151
1157
|
self._client = self._get_client()
|
|
1152
1158
|
|
|
1153
|
-
def
|
|
1159
|
+
def _prepare_mirrored_deployment(self):
|
|
1154
1160
|
"""
|
|
1155
|
-
Prepare for
|
|
1161
|
+
Prepare for mirrored deployment.
|
|
1156
1162
|
|
|
1157
1163
|
"""
|
|
1158
1164
|
# Prepare mirrored deployment if enabled.
|
|
1159
1165
|
if self._mutate_create_options:
|
|
1160
1166
|
return
|
|
1161
1167
|
self._mutate_create_options = lambda o: o
|
|
1162
|
-
if not envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
|
|
1163
|
-
logger.debug("Mirrored deployment disabled")
|
|
1164
|
-
return
|
|
1165
1168
|
|
|
1166
1169
|
# Retrieve self-container info.
|
|
1167
|
-
## - Get Container name, default to hostname if not set.
|
|
1168
|
-
self_container_id = envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
|
|
1169
|
-
if not self_container_id:
|
|
1170
|
-
self_container_id = socket.gethostname()
|
|
1171
|
-
logger.warning(
|
|
1172
|
-
"Mirrored deployment enabled, but no Container name set, using hostname(%s) instead",
|
|
1173
|
-
self_container_id,
|
|
1174
|
-
)
|
|
1175
1170
|
try:
|
|
1176
|
-
self_container = self._find_self_container(
|
|
1171
|
+
self_container = self._find_self_container()
|
|
1172
|
+
if not self_container:
|
|
1173
|
+
return
|
|
1177
1174
|
logger.info(
|
|
1178
1175
|
"Mirrored deployment enabled, using self Container %s for options mirroring",
|
|
1179
|
-
self_container.
|
|
1176
|
+
self_container.short_id,
|
|
1180
1177
|
)
|
|
1181
1178
|
self_image = self_container.image
|
|
1182
1179
|
except podman.errors.APIError:
|
|
1183
1180
|
logger.exception(
|
|
1184
|
-
"Mirrored deployment enabled, but failed to get self Container
|
|
1185
|
-
self_container_id,
|
|
1181
|
+
"Mirrored deployment enabled, but failed to get self Container, skipping",
|
|
1186
1182
|
)
|
|
1187
1183
|
return
|
|
1188
1184
|
|
|
@@ -1193,8 +1189,12 @@ class PodmanDeployer(Deployer):
|
|
|
1193
1189
|
self_container_envs: dict[str, str] = dict(
|
|
1194
1190
|
item.split("=", 1) for item in self_container.attrs["Config"].get("Env", [])
|
|
1195
1191
|
)
|
|
1196
|
-
self_image_envs: dict[str, str] =
|
|
1197
|
-
|
|
1192
|
+
self_image_envs: dict[str, str] = (
|
|
1193
|
+
dict(
|
|
1194
|
+
item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
|
|
1195
|
+
)
|
|
1196
|
+
if self_image.attrs["Config"]
|
|
1197
|
+
else {}
|
|
1198
1198
|
)
|
|
1199
1199
|
mirrored_envs: dict[str, str] = {
|
|
1200
1200
|
# Filter out gpustack-internal envs and same-as-image envs.
|
|
@@ -1342,17 +1342,10 @@ class PodmanDeployer(Deployer):
|
|
|
1342
1342
|
|
|
1343
1343
|
self._mutate_create_options = mutate_create_options
|
|
1344
1344
|
|
|
1345
|
-
def _find_self_container(
|
|
1346
|
-
self,
|
|
1347
|
-
self_container_id: str,
|
|
1348
|
-
) -> podman.domain.containers.Container:
|
|
1345
|
+
def _find_self_container(self) -> podman.domain.containers.Container | None:
|
|
1349
1346
|
"""
|
|
1350
1347
|
Find the current container if running inside a Podman container.
|
|
1351
1348
|
|
|
1352
|
-
Args:
|
|
1353
|
-
self_container_id:
|
|
1354
|
-
The container name or ID to find.
|
|
1355
|
-
|
|
1356
1349
|
Returns:
|
|
1357
1350
|
The Podman container if found, None otherwise.
|
|
1358
1351
|
|
|
@@ -1360,38 +1353,54 @@ class PodmanDeployer(Deployer):
|
|
|
1360
1353
|
If failed to find itself.
|
|
1361
1354
|
|
|
1362
1355
|
"""
|
|
1363
|
-
if envs.
|
|
1364
|
-
|
|
1365
|
-
return
|
|
1366
|
-
|
|
1367
|
-
# Find containers that matches the hostname.
|
|
1368
|
-
containers: list[podman.domain.containers.Container] = []
|
|
1369
|
-
for c in self._client.containers.list(compatible=True):
|
|
1370
|
-
# Ignore workload containers with host network enabled.
|
|
1371
|
-
if _LABEL_WORKLOAD in c.labels:
|
|
1372
|
-
continue
|
|
1373
|
-
# Ignore containers that do not match the hostname.
|
|
1374
|
-
if c.attrs["Config"].get("Hostname", "") != self_container_id:
|
|
1375
|
-
continue
|
|
1376
|
-
# Ignore containers that do not match the filter labels.
|
|
1377
|
-
if envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS and any(
|
|
1378
|
-
c.labels.get(k) != v
|
|
1379
|
-
for k, v in envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS.items()
|
|
1380
|
-
):
|
|
1381
|
-
continue
|
|
1382
|
-
containers.append(c)
|
|
1356
|
+
if not envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
|
|
1357
|
+
logger.debug("Mirrored deployment disabled")
|
|
1358
|
+
return None
|
|
1383
1359
|
|
|
1384
|
-
#
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
"
|
|
1360
|
+
# Get container ID or hostname.
|
|
1361
|
+
self_container_id = envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
|
|
1362
|
+
if not self_container_id:
|
|
1363
|
+
self_container_id = socket.gethostname()
|
|
1364
|
+
debug_log_warning(
|
|
1365
|
+
logger,
|
|
1366
|
+
"Mirrored deployment enabled, but no Container name set, using hostname(%s) instead",
|
|
1367
|
+
self_container_id,
|
|
1391
1368
|
)
|
|
1392
|
-
raise podman.errors.NotFound(msg)
|
|
1393
1369
|
|
|
1394
|
-
|
|
1370
|
+
if envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME:
|
|
1371
|
+
# Directly get container.
|
|
1372
|
+
self_container = self._client.containers.get(self_container_id)
|
|
1373
|
+
else:
|
|
1374
|
+
# Find containers that matches the hostname.
|
|
1375
|
+
containers: list[podman.domain.containers.Container] = []
|
|
1376
|
+
for c in self._client.containers.list(compatible=True):
|
|
1377
|
+
# Ignore workload containers with host network enabled.
|
|
1378
|
+
if _LABEL_WORKLOAD in c.labels:
|
|
1379
|
+
continue
|
|
1380
|
+
# Ignore containers that do not match the hostname.
|
|
1381
|
+
if c.attrs["Config"].get("Hostname", "") != self_container_id:
|
|
1382
|
+
continue
|
|
1383
|
+
# Ignore containers that do not match the filter labels.
|
|
1384
|
+
if envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS and any(
|
|
1385
|
+
c.labels.get(k) != v
|
|
1386
|
+
for k, v in envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS.items()
|
|
1387
|
+
):
|
|
1388
|
+
continue
|
|
1389
|
+
containers.append(c)
|
|
1390
|
+
|
|
1391
|
+
# Validate found containers.
|
|
1392
|
+
if len(containers) != 1:
|
|
1393
|
+
msg = (
|
|
1394
|
+
f"Found multiple Containers with the same hostname {self_container_id}, "
|
|
1395
|
+
if len(containers) > 1
|
|
1396
|
+
else f"Not found Container with hostname {self_container_id}, "
|
|
1397
|
+
"please use `--env GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=...` to specify the exact Container name"
|
|
1398
|
+
)
|
|
1399
|
+
raise podman.errors.NotFound(msg)
|
|
1400
|
+
|
|
1401
|
+
self_container = containers[0]
|
|
1402
|
+
|
|
1403
|
+
return self_container
|
|
1395
1404
|
|
|
1396
1405
|
@_supported
|
|
1397
1406
|
def _create(self, workload: WorkloadPlan):
|
|
@@ -1417,7 +1426,7 @@ class PodmanDeployer(Deployer):
|
|
|
1417
1426
|
msg = f"Invalid workload type: {type(workload)}"
|
|
1418
1427
|
raise TypeError(msg)
|
|
1419
1428
|
|
|
1420
|
-
self.
|
|
1429
|
+
self._prepare_mirrored_deployment()
|
|
1421
1430
|
|
|
1422
1431
|
if isinstance(workload, WorkloadPlan):
|
|
1423
1432
|
workload = PodmanWorkloadPlan(**workload.__dict__)
|
|
@@ -1731,6 +1740,11 @@ class PodmanDeployer(Deployer):
|
|
|
1731
1740
|
msg = f"Failed to fetch logs for container {container.name} of workload {name}{_detail_api_call_error(e)}"
|
|
1732
1741
|
raise OperationError(msg) from e
|
|
1733
1742
|
else:
|
|
1743
|
+
if not follow:
|
|
1744
|
+
result = bytearray()
|
|
1745
|
+
for chunk in output:
|
|
1746
|
+
result.extend(chunk)
|
|
1747
|
+
return result.decode("utf-8")
|
|
1734
1748
|
return output
|
|
1735
1749
|
|
|
1736
1750
|
@_supported
|
|
@@ -1815,6 +1829,216 @@ class PodmanDeployer(Deployer):
|
|
|
1815
1829
|
return output
|
|
1816
1830
|
return PodmanWorkloadExecStream(output)
|
|
1817
1831
|
|
|
1832
|
+
@_supported
def _inspect(
    self,
    name: WorkloadName,
    namespace: WorkloadNamespace | None = None,
) -> str | None:
    """
    Inspect a Podman workload.

    Args:
        name:
            The name of the workload.
        namespace:
            The namespace of the workload.

    Returns:
        The inspection result as a JSON string, one entry per container of the workload,
        with the values of sensitive environment variables masked.
        None if the workload is not found or has no containers.

    Raises:
        UnsupportedError:
            If Podman is not supported in the current environment.
        OperationError:
            If the Podman workload fails to inspect.

    """
    workload = self._get(name=name, namespace=namespace)
    if not workload:
        return None

    d_containers = getattr(workload, "_d_containers", [])
    if not d_containers:
        return None

    result = []
    for c in d_containers:
        # Mask on shallow copies, so the masked values never overwrite
        # the attrs dict held by the container object itself.
        c_attrs = dict(c.attrs)
        config = c_attrs.get("Config")
        if config and config.get("Env"):
            masked_env = []
            for env in config["Env"]:
                # `partition` tolerates entries without "=",
                # where unpacking the result of `split` would raise ValueError.
                env_name, _, _ = env.partition("=")
                if sensitive_env_var(env_name):
                    masked_env.append(f"{env_name}=******")
                else:
                    masked_env.append(env)
            c_attrs["Config"] = {**config, "Env": masked_env}
        result.append(c_attrs)
    return safe_json(result, indent=2)
|
|
1876
|
+
|
|
1877
|
+
def _find_self_container_for_endoscopy(self) -> podman.domain.containers.Container:
|
|
1878
|
+
"""
|
|
1879
|
+
Find the self container for endoscopy.
|
|
1880
|
+
Only works in mirrored deployment mode.
|
|
1881
|
+
|
|
1882
|
+
Returns:
|
|
1883
|
+
The self container object.
|
|
1884
|
+
|
|
1885
|
+
Raises:
|
|
1886
|
+
UnsupportedError:
|
|
1887
|
+
If endoscopy is not supported in the current environment.
|
|
1888
|
+
|
|
1889
|
+
"""
|
|
1890
|
+
try:
|
|
1891
|
+
self_container = self._find_self_container()
|
|
1892
|
+
except podman.errors.APIError as e:
|
|
1893
|
+
msg = "Endoscopy is not supported in the current environment: Mirrored deployment enabled, but failed to get self Container"
|
|
1894
|
+
raise UnsupportedError(msg) from e
|
|
1895
|
+
except Exception as e:
|
|
1896
|
+
msg = "Endoscopy is not supported in the current environment: Failed to get self Container"
|
|
1897
|
+
raise UnsupportedError(msg) from e
|
|
1898
|
+
|
|
1899
|
+
if not self_container:
|
|
1900
|
+
msg = "Endoscopy is not supported in the current environment: Mirrored deployment disabled"
|
|
1901
|
+
raise UnsupportedError(msg)
|
|
1902
|
+
return self_container
|
|
1903
|
+
|
|
1904
|
+
def _endoscopic_logs(
|
|
1905
|
+
self,
|
|
1906
|
+
timestamps: bool = False,
|
|
1907
|
+
tail: int | None = None,
|
|
1908
|
+
since: int | None = None,
|
|
1909
|
+
follow: bool = False,
|
|
1910
|
+
) -> Generator[bytes | str, None, None] | bytes | str:
|
|
1911
|
+
"""
|
|
1912
|
+
Get the logs of the deployer itself.
|
|
1913
|
+
Only works in mirrored deployment mode.
|
|
1914
|
+
|
|
1915
|
+
Args:
|
|
1916
|
+
timestamps:
|
|
1917
|
+
Show timestamps in the logs.
|
|
1918
|
+
tail:
|
|
1919
|
+
Number of lines to show from the end of the logs.
|
|
1920
|
+
since:
|
|
1921
|
+
Show logs since the given epoch in seconds.
|
|
1922
|
+
follow:
|
|
1923
|
+
Whether to follow the logs.
|
|
1924
|
+
|
|
1925
|
+
Returns:
|
|
1926
|
+
The logs as a byte string or a generator yielding byte strings if follow is True.
|
|
1927
|
+
|
|
1928
|
+
Raises:
|
|
1929
|
+
UnsupportedError:
|
|
1930
|
+
If endoscopy is not supported in the current environment.
|
|
1931
|
+
OperationError:
|
|
1932
|
+
If the deployer fails to get logs.
|
|
1933
|
+
|
|
1934
|
+
"""
|
|
1935
|
+
self_container = self._find_self_container_for_endoscopy()
|
|
1936
|
+
|
|
1937
|
+
logs_options = {
|
|
1938
|
+
"timestamps": timestamps,
|
|
1939
|
+
"tail": tail if tail >= 0 else None,
|
|
1940
|
+
"since": since,
|
|
1941
|
+
"follow": follow,
|
|
1942
|
+
}
|
|
1943
|
+
|
|
1944
|
+
try:
|
|
1945
|
+
output = self_container.logs(
|
|
1946
|
+
stream=follow,
|
|
1947
|
+
**logs_options,
|
|
1948
|
+
)
|
|
1949
|
+
except podman.errors.APIError as e:
|
|
1950
|
+
msg = f"Failed to fetch logs for self Container {self_container.short_id}{_detail_api_call_error(e)}"
|
|
1951
|
+
raise OperationError(msg) from e
|
|
1952
|
+
else:
|
|
1953
|
+
if not follow:
|
|
1954
|
+
result = bytearray()
|
|
1955
|
+
for chunk in output:
|
|
1956
|
+
result.extend(chunk)
|
|
1957
|
+
return result.decode("utf-8")
|
|
1958
|
+
return output
|
|
1959
|
+
|
|
1960
|
+
def _endoscopic_exec(
|
|
1961
|
+
self,
|
|
1962
|
+
detach: bool = True,
|
|
1963
|
+
command: list[str] | None = None,
|
|
1964
|
+
args: list[str] | None = None,
|
|
1965
|
+
) -> WorkloadExecStream | bytes | str:
|
|
1966
|
+
"""
|
|
1967
|
+
Execute a command in the deployer itself.
|
|
1968
|
+
Only works in mirrored deployment mode.
|
|
1969
|
+
|
|
1970
|
+
Args:
|
|
1971
|
+
detach:
|
|
1972
|
+
Whether to detach from the command.
|
|
1973
|
+
command:
|
|
1974
|
+
The command to execute.
|
|
1975
|
+
If not specified, use /bin/sh and implicitly attach.
|
|
1976
|
+
args:
|
|
1977
|
+
The arguments to pass to the command.
|
|
1978
|
+
|
|
1979
|
+
Returns:
|
|
1980
|
+
If detach is False, return a WorkloadExecStream.
|
|
1981
|
+
otherwise, return the output of the command as a byte string or string.
|
|
1982
|
+
|
|
1983
|
+
Raises:
|
|
1984
|
+
UnsupportedError:
|
|
1985
|
+
If endoscopy is not supported in the current environment.
|
|
1986
|
+
OperationError:
|
|
1987
|
+
If the deployer fails to execute the command.
|
|
1988
|
+
|
|
1989
|
+
"""
|
|
1990
|
+
self_container = self._find_self_container_for_endoscopy()
|
|
1991
|
+
|
|
1992
|
+
attach = not detach or not command
|
|
1993
|
+
exec_options = {
|
|
1994
|
+
"stdout": True,
|
|
1995
|
+
"stderr": True,
|
|
1996
|
+
"stdin": attach,
|
|
1997
|
+
"socket": attach,
|
|
1998
|
+
"tty": attach,
|
|
1999
|
+
"cmd": [*command, *(args or [])] if command else ["/bin/sh"],
|
|
2000
|
+
}
|
|
2001
|
+
|
|
2002
|
+
try:
|
|
2003
|
+
_, output = self_container.exec_run(
|
|
2004
|
+
detach=False,
|
|
2005
|
+
**exec_options,
|
|
2006
|
+
)
|
|
2007
|
+
except podman.errors.APIError as e:
|
|
2008
|
+
msg = f"Failed to exec command in self Container {self_container.short_id}{_detail_api_call_error(e)}"
|
|
2009
|
+
raise OperationError(msg) from e
|
|
2010
|
+
else:
|
|
2011
|
+
if not attach:
|
|
2012
|
+
return output
|
|
2013
|
+
return PodmanWorkloadExecStream(output)
|
|
2014
|
+
|
|
2015
|
+
def _endoscopic_inspect(self) -> str:
    """
    Inspect the deployer itself.
    Only works in mirrored deployment mode.

    Returns:
        The attributes of the self container as a JSON string,
        with the values of sensitive environment variables masked.

    Raises:
        UnsupportedError:
            If endoscopy is not supported in the current environment.

    """
    self_container = self._find_self_container_for_endoscopy()

    # Mask on shallow copies, so the masked values never overwrite
    # the attrs dict held by the container object itself.
    c_attrs = dict(self_container.attrs)
    config = c_attrs.get("Config")
    if config and config.get("Env"):
        masked_env = []
        for env in config["Env"]:
            # `partition` tolerates entries without "=",
            # where unpacking the result of `split` would raise ValueError.
            env_name, _, _ = env.partition("=")
            if sensitive_env_var(env_name):
                masked_env.append(f"{env_name}=******")
            else:
                masked_env.append(env)
        c_attrs["Config"] = {**config, "Env": masked_env}

    return safe_json(c_attrs, indent=2)
|
|
2041
|
+
|
|
1818
2042
|
|
|
1819
2043
|
def _has_restart_policy(
|
|
1820
2044
|
container: podman.domain.containers.Container,
|
|
@@ -951,3 +951,26 @@ def bitmask_to_str(bitmask_list: list) -> str:
|
|
|
951
951
|
offset += get_bits_size()
|
|
952
952
|
|
|
953
953
|
return list_to_range_str(sorted(bits_lists))
|
|
954
|
+
|
|
955
|
+
|
|
956
|
+
def get_physical_function_by_bdf(bdf: str) -> str:
    """
    Resolve the physical function BDF behind a PCI device BDF address.

    Args:
        bdf:
            The PCI device BDF address (e.g., "0000:00:1f.0").

    Returns:
        The name of the resolved `physfn` entry of the device under
        /sys/bus/pci/devices if it exists, otherwise the original BDF.

    """
    if not bdf:
        return bdf

    # Best effort: any error while probing sysfs falls back to the given BDF.
    with contextlib.suppress(Exception):
        device_dir = Path(f"/sys/bus/pci/devices/{bdf}")
        physfn_link = device_dir / "physfn"
        if device_dir.exists() and physfn_link.exists():
            return physfn_link.resolve().name

    return bdf
|
gpustack_runtime/detector/amd.py
CHANGED
|
@@ -16,6 +16,7 @@ from .__utils__ import (
|
|
|
16
16
|
get_brief_version,
|
|
17
17
|
get_numa_node_by_bdf,
|
|
18
18
|
get_pci_devices,
|
|
19
|
+
get_physical_function_by_bdf,
|
|
19
20
|
get_utilization,
|
|
20
21
|
map_numa_node_to_cpu_affinity,
|
|
21
22
|
)
|
|
@@ -107,8 +108,12 @@ class AMDDetector(Detector):
|
|
|
107
108
|
asic_serial = dev_gpu_asic_info.get("asic_serial")
|
|
108
109
|
dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
|
|
109
110
|
else:
|
|
110
|
-
dev_uuid =
|
|
111
|
-
|
|
111
|
+
dev_uuid = ""
|
|
112
|
+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
|
|
113
|
+
dev_uuid = (
|
|
114
|
+
f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
|
|
115
|
+
)
|
|
116
|
+
dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
|
|
112
117
|
|
|
113
118
|
dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
|
|
114
119
|
dev_driver_ver = dev_gpu_driver_info.get("driver_version")
|
|
@@ -119,8 +124,13 @@ class AMDDetector(Detector):
|
|
|
119
124
|
|
|
120
125
|
dev_cc = dev_hsa_agent.compute_capability
|
|
121
126
|
if not dev_cc:
|
|
122
|
-
|
|
123
|
-
dev_cc =
|
|
127
|
+
if "target_graphics_version" in dev_gpu_asic_info:
|
|
128
|
+
dev_cc = dev_gpu_asic_info.get("target_graphics_version")
|
|
129
|
+
else:
|
|
130
|
+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
|
|
131
|
+
dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(
|
|
132
|
+
dev_idx,
|
|
133
|
+
)
|
|
124
134
|
|
|
125
135
|
dev_bdf = None
|
|
126
136
|
dev_card_id = None
|
|
@@ -195,15 +205,13 @@ class AMDDetector(Detector):
|
|
|
195
205
|
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
|
|
196
206
|
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
|
|
197
207
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
dev,
|
|
202
|
-
)
|
|
208
|
+
dev_is_vgpu = False
|
|
209
|
+
if dev_bdf:
|
|
210
|
+
dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
|
|
203
211
|
|
|
204
212
|
dev_appendix = {
|
|
205
213
|
"arch_family": _get_arch_family(dev_asic_family_id),
|
|
206
|
-
"vgpu":
|
|
214
|
+
"vgpu": dev_is_vgpu,
|
|
207
215
|
}
|
|
208
216
|
if dev_bdf:
|
|
209
217
|
dev_appendix["bdf"] = dev_bdf
|
|
@@ -16,6 +16,7 @@ from .__utils__ import (
|
|
|
16
16
|
get_brief_version,
|
|
17
17
|
get_numa_node_by_bdf,
|
|
18
18
|
get_pci_devices,
|
|
19
|
+
get_physical_function_by_bdf,
|
|
19
20
|
get_utilization,
|
|
20
21
|
map_numa_node_to_cpu_affinity,
|
|
21
22
|
)
|
|
@@ -108,7 +109,7 @@ class HygonDetector(Detector):
|
|
|
108
109
|
dev_index = dev_idx
|
|
109
110
|
|
|
110
111
|
dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
|
|
111
|
-
dev_hsa_agent = hsa_agents.get(dev_uuid)
|
|
112
|
+
dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
|
|
112
113
|
|
|
113
114
|
dev_name = dev_hsa_agent.name
|
|
114
115
|
if not dev_name:
|
|
@@ -156,8 +157,12 @@ class HygonDetector(Detector):
|
|
|
156
157
|
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
|
|
157
158
|
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
|
|
158
159
|
|
|
160
|
+
dev_is_vgpu = False
|
|
161
|
+
if dev_bdf:
|
|
162
|
+
dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
|
|
163
|
+
|
|
159
164
|
dev_appendix = {
|
|
160
|
-
"vgpu":
|
|
165
|
+
"vgpu": dev_is_vgpu,
|
|
161
166
|
}
|
|
162
167
|
if dev_bdf is not None:
|
|
163
168
|
dev_appendix["bdf"] = dev_bdf
|
|
@@ -23,6 +23,7 @@ from .__utils__ import (
|
|
|
23
23
|
get_numa_node_by_bdf,
|
|
24
24
|
get_numa_nodeset_size,
|
|
25
25
|
get_pci_devices,
|
|
26
|
+
get_physical_function_by_bdf,
|
|
26
27
|
get_utilization,
|
|
27
28
|
map_numa_node_to_cpu_affinity,
|
|
28
29
|
support_command,
|
|
@@ -165,13 +166,20 @@ class IluvatarDetector(Detector):
|
|
|
165
166
|
if dev_cc_t:
|
|
166
167
|
dev_cc = ".".join(map(str, dev_cc_t))
|
|
167
168
|
|
|
169
|
+
dev_bdf = None
|
|
170
|
+
with contextlib.suppress(pyixml.NVMLError):
|
|
171
|
+
dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
|
|
172
|
+
dev_bdf = str(dev_pci_info.busIdLegacy).lower()
|
|
173
|
+
|
|
168
174
|
dev_is_vgpu = False
|
|
169
|
-
|
|
175
|
+
if dev_bdf:
|
|
176
|
+
dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
|
|
170
177
|
|
|
171
178
|
dev_appendix = {
|
|
172
179
|
"vgpu": dev_is_vgpu,
|
|
173
|
-
"bdf": str(dev_pci_info.busIdLegacy).lower(),
|
|
174
180
|
}
|
|
181
|
+
if dev_bdf:
|
|
182
|
+
dev_appendix["bdf"] = dev_bdf
|
|
175
183
|
|
|
176
184
|
ret.append(
|
|
177
185
|
Device(
|
|
@@ -3,9 +3,10 @@ from __future__ import annotations
|
|
|
3
3
|
import logging
|
|
4
4
|
from functools import lru_cache
|
|
5
5
|
|
|
6
|
+
import pymtml
|
|
7
|
+
|
|
6
8
|
from .. import envs
|
|
7
9
|
from ..logging import debug_log_exception, debug_log_warning
|
|
8
|
-
from . import pymtml
|
|
9
10
|
from .__types__ import (
|
|
10
11
|
Detector,
|
|
11
12
|
Device,
|
|
@@ -105,9 +106,8 @@ class MThreadsDetector(Detector):
|
|
|
105
106
|
|
|
106
107
|
try:
|
|
107
108
|
pymtml.mtmlLibraryInit()
|
|
108
|
-
|
|
109
|
-
sys_driver_ver = pymtml.mtmlSystemGetDriverVersion()
|
|
110
|
-
|
|
109
|
+
system = pymtml.mtmlLibraryInitSystem()
|
|
110
|
+
sys_driver_ver = pymtml.mtmlSystemGetDriverVersion(system)
|
|
111
111
|
dev_count = pymtml.mtmlLibraryCountDevice()
|
|
112
112
|
for dev_idx in range(dev_count):
|
|
113
113
|
dev_index = dev_idx
|
|
@@ -139,25 +139,20 @@ class MThreadsDetector(Detector):
|
|
|
139
139
|
|
|
140
140
|
dev_mem = 0
|
|
141
141
|
dev_mem_used = 0
|
|
142
|
-
|
|
143
|
-
try:
|
|
142
|
+
with pymtml.mtmlMemoryContext(dev) as devmem:
|
|
144
143
|
dev_mem = byte_to_mebibyte( # byte to MiB
|
|
145
144
|
pymtml.mtmlMemoryGetTotal(devmem),
|
|
146
145
|
)
|
|
147
146
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
148
147
|
pymtml.mtmlMemoryGetUsed(devmem),
|
|
149
148
|
)
|
|
150
|
-
finally:
|
|
151
|
-
pymtml.mtmlDeviceFreeMemory(devmem)
|
|
152
149
|
|
|
153
150
|
dev_cores_util = None
|
|
154
151
|
dev_temp = None
|
|
155
|
-
|
|
156
|
-
try:
|
|
152
|
+
with pymtml.mtmlGpuContext(dev) as devgpu:
|
|
157
153
|
dev_cores_util = pymtml.mtmlGpuGetUtilization(devgpu)
|
|
158
154
|
dev_temp = pymtml.mtmlGpuGetTemperature(devgpu)
|
|
159
|
-
|
|
160
|
-
pymtml.mtmlDeviceFreeGpu(devgpu)
|
|
155
|
+
|
|
161
156
|
if dev_cores_util is None:
|
|
162
157
|
debug_log_warning(
|
|
163
158
|
logger,
|
|
@@ -198,6 +193,7 @@ class MThreadsDetector(Detector):
|
|
|
198
193
|
debug_log_exception(logger, "Failed to process devices fetching")
|
|
199
194
|
raise
|
|
200
195
|
finally:
|
|
196
|
+
pymtml.mtmlLibraryFreeSystem(system)
|
|
201
197
|
pymtml.mtmlLibraryShutDown()
|
|
202
198
|
|
|
203
199
|
return ret
|