gpustack-runtime 0.1.39__py3-none-any.whl → 0.1.39.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
@@ -1,12 +1,14 @@
  from __future__ import annotations
 
  import contextlib
+ import io
  import json
  import logging
  import operator
  import os
  import socket
  import sys
+ import tarfile
  from dataclasses import dataclass, field
  from functools import lru_cache, reduce
  from math import ceil
@@ -26,7 +28,7 @@ from podman.domain.containers_create import CreateMixin
  from tqdm import tqdm
 
  from .. import envs
- from ..logging import debug_log_exception
+ from ..logging import debug_log_exception, debug_log_warning
  from .__patches__ import patch_render_payload
  from .__types__ import (
  Container,
@@ -35,7 +37,7 @@ from .__types__ import (
  ContainerMountModeEnum,
  ContainerProfileEnum,
  ContainerRestartPolicyEnum,
- Deployer,
+ EndoscopicDeployer,
  OperationError,
  UnsupportedError,
  WorkloadExecStream,
@@ -47,7 +49,13 @@ from .__types__ import (
  WorkloadStatusOperation,
  WorkloadStatusStateEnum,
  )
- from .__utils__ import _MiB, bytes_to_human_readable, replace_image_with, safe_json
+ from .__utils__ import (
+ _MiB,
+ bytes_to_human_readable,
+ replace_image_with,
+ safe_json,
+ sensitive_env_var,
+ )
 
  if TYPE_CHECKING:
  from collections.abc import Callable, Generator
@@ -142,7 +150,7 @@ class PodmanWorkloadPlan(WorkloadPlan):
  super().validate_and_default()
 
  # Adjust default image namespace if needed.
- if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_NAMESPACE:
+ if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_NAMESPACE:
  self.pause_image = replace_image_with(
  image=self.pause_image,
  namespace=namespace,
@@ -297,7 +305,7 @@ Name of the Podman deployer.
  """
 
 
- class PodmanDeployer(Deployer):
+ class PodmanDeployer(EndoscopicDeployer):
  """
  Deployer implementation for Podman containers.
  """
@@ -306,10 +314,6 @@ class PodmanDeployer(Deployer):
  """
  Client for interacting with the Podman daemon.
  """
- _container_ephemeral_files_dir: Path | None = None
- """
- Directory for ephemeral files inside containers, internal use only.
- """
  _mutate_create_options: Callable[[dict[str, Any]], dict[str, Any]] | None = None
  """
  Function to handle mirrored deployment, internal use only.
@@ -386,48 +390,6 @@ class PodmanDeployer(Deployer):
 
  return wrapper
 
- @staticmethod
- def _create_ephemeral_files(
- workload: PodmanWorkloadPlan,
- ) -> dict[tuple[int, str], str]:
- """
- Create ephemeral files as local file for the workload.
-
- Returns:
- A mapping from (container index, configured path) to actual filename.
-
- Raises:
- OperationError:
- If the ephemeral files fail to create.
-
- """
- # Map (container index, configured path) to actual filename.
- ephemeral_filename_mapping: dict[tuple[int, str], str] = {}
- ephemeral_files: list[tuple[str, str, int]] = []
- for ci, c in enumerate(workload.containers):
- for fi, f in enumerate(c.files or []):
- if f.content is not None:
- fn = f"{workload.name}-{ci}-{fi}"
- ephemeral_filename_mapping[(ci, f.path)] = fn
- ephemeral_files.append((fn, f.content, f.mode))
- if not ephemeral_filename_mapping:
- return ephemeral_filename_mapping
-
- # Create ephemeral files directory if not exists.
- try:
- for fn, fc, fm in ephemeral_files:
- fp = envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR.joinpath(fn)
- with fp.open("w", encoding="utf-8") as f:
- f.write(fc)
- f.flush()
- fp.chmod(fm)
- logger.debug("Created local file %s with mode %s", fp, oct(fm))
- except OSError as e:
- msg = "Failed to create ephemeral files"
- raise OperationError(msg) from e
-
- return ephemeral_filename_mapping
-
  def _create_ephemeral_volumes(self, workload: PodmanWorkloadPlan) -> dict[str, str]:
  """
  Create ephemeral volumes for the workload.
@@ -473,12 +435,12 @@ class PodmanDeployer(Deployer):
  tag = tag or "latest"
  auth_config = None
  if (
- envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME
- and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD
+ envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_USERNAME
+ and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_PASSWORD
  ):
  auth_config = {
- "username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME,
- "password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD,
+ "username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_USERNAME,
+ "password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_CONTAINER_REGISTRY_PASSWORD,
  }
 
  logs = self._client.api.pull(
@@ -715,12 +677,10 @@ class PodmanDeployer(Deployer):
  else:
  return d_container
 
+ @staticmethod
  def _append_container_mounts(
- self,
  create_options: dict[str, Any],
  c: Container,
- ci: int,
- ephemeral_filename_mapping: dict[tuple[int, str] : str],
  ephemeral_volume_name_mapping: dict[str, str],
  ):
  """
@@ -736,17 +696,7 @@ class PodmanDeployer(Deployer):
  "target": "",
  }
 
- if f.content is not None:
- # Ephemeral file, use from local ephemeral files directory.
- if (ci, f.path) not in ephemeral_filename_mapping:
- continue
- fn = ephemeral_filename_mapping[(ci, f.path)]
- path = str(
- self._container_ephemeral_files_dir.joinpath(fn),
- )
- binding["source"] = path
- binding["target"] = f"/{f.path.lstrip('/')}"
- elif f.path:
+ if f.content is None and f.path:
  # Host file, bind directly.
  binding["source"] = f.path
  binding["target"] = f.path
@@ -858,10 +808,39 @@ class PodmanDeployer(Deployer):
 
  return healthcheck
 
+ @staticmethod
+ def _upload_ephemeral_files(
+ c: Container,
+ container: podman.domain.containers.Container,
+ ):
+ if not c.files:
+ return
+
+ f_tar = io.BytesIO()
+ with tarfile.open(fileobj=f_tar, mode="w") as tar:
+ for f in c.files:
+ if f.content is None or not f.path:
+ continue
+ fc_bytes = f.content.encode("utf-8")
+ info = tarfile.TarInfo(name=f.path.lstrip("/"))
+ info.size = len(fc_bytes)
+ info.mode = f.mode
+ tar.addfile(tarinfo=info, fileobj=io.BytesIO(fc_bytes))
+ if f_tar.getbuffer().nbytes == 0:
+ return
+
+ f_tar.seek(0)
+ uploaded = container.put_archive(
+ path="/",
+ data=f_tar.getvalue(),
+ )
+ if not uploaded:
+ msg = f"Failed to upload ephemeral files to container {container.name}"
+ raise OperationError(msg)
+
  def _create_containers(
  self,
  workload: PodmanWorkloadPlan,
- ephemeral_filename_mapping: dict[tuple[int, str] : str],
  ephemeral_volume_name_mapping: dict[str, str],
  pause_container: podman.domain.containers.Container,
  ) -> (
@@ -1077,8 +1056,6 @@ class PodmanDeployer(Deployer):
  self._append_container_mounts(
  create_options,
  c,
- ci,
- ephemeral_filename_mapping,
  ephemeral_volume_name_mapping,
  )
 
@@ -1125,6 +1102,10 @@ class PodmanDeployer(Deployer):
  detach=detach,
  **create_options,
  )
+
+ # Upload ephemeral files into the container.
+ self._upload_ephemeral_files(c, d_container)
+
  except podman.errors.APIError as e:
  msg = f"Failed to create container {container_name}{_detail_api_call_error(e)}"
  raise OperationError(msg) from e
@@ -1174,43 +1155,30 @@ class PodmanDeployer(Deployer):
  def __init__(self):
  super().__init__(_NAME)
  self._client = self._get_client()
- self._container_ephemeral_files_dir = (
- envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR
- )
 
- def _prepare_create(self):
+ def _prepare_mirrored_deployment(self):
  """
- Prepare for creation.
+ Prepare for mirrored deployment.
 
  """
  # Prepare mirrored deployment if enabled.
  if self._mutate_create_options:
  return
  self._mutate_create_options = lambda o: o
- if not envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
- logger.debug("Mirrored deployment disabled")
- return
 
  # Retrieve self-container info.
- ## - Get Container name, default to hostname if not set.
- self_container_id = envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
- if not self_container_id:
- self_container_id = socket.gethostname()
- logger.warning(
- "Mirrored deployment enabled, but no Container name set, using hostname(%s) instead",
- self_container_id,
- )
  try:
- self_container = self._find_self_container(self_container_id)
+ self_container = self._find_self_container()
+ if not self_container:
+ return
  logger.info(
  "Mirrored deployment enabled, using self Container %s for options mirroring",
- self_container.id[:12],
+ self_container.short_id,
  )
  self_image = self_container.image
  except podman.errors.APIError:
  logger.exception(
- "Mirrored deployment enabled, but failed to get self Container %s, skipping",
- self_container_id,
+ "Mirrored deployment enabled, but failed to get self Container, skipping",
  )
  return
 
@@ -1370,36 +1338,10 @@ class PodmanDeployer(Deployer):
 
  self._mutate_create_options = mutate_create_options
 
- # Extract ephemeral files dir mutation if any.
- if mirrored_mounts:
- e_target = str(envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR)
- b_source = ""
- b_target = ""
- for m in mirrored_mounts:
- c_target = m.get("Destination", "///")
- if (
- e_target == c_target or e_target.startswith(f"{c_target}/")
- ) and len(c_target) >= len(b_target):
- b_source = m.get("Source")
- b_target = c_target
- if b_source:
- result = Path(b_source)
- if e_target != b_target:
- b_subpath = e_target.removeprefix(b_target)
- result = result.joinpath(b_subpath.lstrip("/"))
- self._container_ephemeral_files_dir = result
-
- def _find_self_container(
- self,
- self_container_id: str,
- ) -> podman.domain.containers.Container:
+ def _find_self_container(self) -> podman.domain.containers.Container | None:
  """
  Find the current container if running inside a Podman container.
 
- Args:
- self_container_id:
- The container name or ID to find.
-
  Returns:
  The Podman container if found, None otherwise.
 
@@ -1407,38 +1349,54 @@ class PodmanDeployer(Deployer):
  If failed to find itself.
 
  """
- if envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME:
- # Directly get container by name or ID.
- return self._client.containers.get(self_container_id)
-
- # Find containers that matches the hostname.
- containers: list[podman.domain.containers.Container] = []
- for c in self._client.containers.list(compatible=True):
- # Ignore workload containers with host network enabled.
- if _LABEL_WORKLOAD in c.labels:
- continue
- # Ignore containers that do not match the hostname.
- if c.attrs["Config"].get("Hostname", "") != self_container_id:
- continue
- # Ignore containers that do not match the filter labels.
- if envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS and any(
- c.labels.get(k) != v
- for k, v in envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS.items()
- ):
- continue
- containers.append(c)
+ if not envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
+ logger.debug("Mirrored deployment disabled")
+ return None
 
- # Validate found containers.
- if len(containers) != 1:
- msg = (
- f"Found multiple Containers with the same hostname {self_container_id}, "
- if len(containers) > 1
- else f"Not found Container with hostname {self_container_id}, "
- "please use `--env GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=...` to specify the exact container name"
+ # Get container ID or hostname.
+ self_container_id = envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
+ if not self_container_id:
+ self_container_id = socket.gethostname()
+ debug_log_warning(
+ logger,
+ "Mirrored deployment enabled, but no Container name set, using hostname(%s) instead",
+ self_container_id,
  )
- raise podman.errors.NotFound(msg)
 
- return containers[0]
+ if envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME:
+ # Directly get container.
+ self_container = self._client.containers.get(self_container_id)
+ else:
+ # Find containers that matches the hostname.
+ containers: list[podman.domain.containers.Container] = []
+ for c in self._client.containers.list(compatible=True):
+ # Ignore workload containers with host network enabled.
+ if _LABEL_WORKLOAD in c.labels:
+ continue
+ # Ignore containers that do not match the hostname.
+ if c.attrs["Config"].get("Hostname", "") != self_container_id:
+ continue
+ # Ignore containers that do not match the filter labels.
+ if envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS and any(
+ c.labels.get(k) != v
+ for k, v in envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS.items()
+ ):
+ continue
+ containers.append(c)
+
+ # Validate found containers.
+ if len(containers) != 1:
+ msg = (
+ f"Found multiple Containers with the same hostname {self_container_id}, "
+ if len(containers) > 1
+ else f"Not found Container with hostname {self_container_id}, "
+ "please use `--env GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=...` to specify the exact Container name"
+ )
+ raise podman.errors.NotFound(msg)
+
+ self_container = containers[0]
+
+ return self_container
 
  @_supported
  def _create(self, workload: WorkloadPlan):
@@ -1464,7 +1422,7 @@ class PodmanDeployer(Deployer):
  msg = f"Invalid workload type: {type(workload)}"
  raise TypeError(msg)
 
- self._prepare_create()
+ self._prepare_mirrored_deployment()
 
  if isinstance(workload, WorkloadPlan):
  workload = PodmanWorkloadPlan(**workload.__dict__)
@@ -1472,12 +1430,6 @@ class PodmanDeployer(Deployer):
  if logger.isEnabledFor(logging.DEBUG):
  logger.debug("Creating workload:\n%s", workload.to_yaml())
 
- # Create ephemeral file if needed,
- # (container index, configured path): <actual filename>
- ephemeral_filename_mapping: dict[tuple[int, str] : str] = (
- self._create_ephemeral_files(workload)
- )
-
  # Create ephemeral volumes if needed,
  # <configured volume name>: <actual volume name>
  ephemeral_volume_name_mapping: dict[str, str] = self._create_ephemeral_volumes(
@@ -1490,7 +1442,6 @@ class PodmanDeployer(Deployer):
  # Create init/run containers.
  init_containers, run_containers = self._create_containers(
  workload,
- ephemeral_filename_mapping,
  ephemeral_volume_name_mapping,
  pause_container,
  )
@@ -1631,17 +1582,6 @@ class PodmanDeployer(Deployer):
  msg = f"Failed to delete volumes for workload {name}{_detail_api_call_error(e)}"
  raise OperationError(msg) from e
 
- # Remove all ephemeral files for the workload.
- try:
- for fp in envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR.glob(
- f"{name}-*",
- ):
- if fp.is_file():
- fp.unlink(missing_ok=True)
- except OSError as e:
- msg = f"Failed to delete ephemeral files for workload {name}"
- raise OperationError(msg) from e
-
  return workload
 
  @_supported
@@ -1796,6 +1736,11 @@ class PodmanDeployer(Deployer):
  msg = f"Failed to fetch logs for container {container.name} of workload {name}{_detail_api_call_error(e)}"
  raise OperationError(msg) from e
  else:
+ if not follow:
+ result = bytearray()
+ for chunk in output:
+ result.extend(chunk)
+ return result.decode("utf-8")
  return output
 
  @_supported
@@ -1880,6 +1825,216 @@ class PodmanDeployer(Deployer):
  return output
  return PodmanWorkloadExecStream(output)
 
+ @_supported
+ def _inspect(
+ self,
+ name: WorkloadName,
+ namespace: WorkloadNamespace | None = None,
+ ) -> str | None:
+ """
+ Inspect a Podman workload.
+
+ Args:
+ name:
+ The name of the workload.
+ namespace:
+ The namespace of the workload.
+
+ Returns:
+ The inspection result as a JSON string. None if not found.
+
+ Raises:
+ UnsupportedError:
+ If Podman is not supported in the current environment.
+ OperationError:
+ If the Podman workload fails to inspect.
+
+ """
+ workload = self._get(name=name, namespace=namespace)
+ if not workload:
+ return None
+
+ d_containers = getattr(workload, "_d_containers", [])
+ if not d_containers:
+ return None
+
+ result = []
+ for c in d_containers:
+ c_attrs = c.attrs
+ # Mask sensitive environment variables
+ if "Env" in c_attrs["Config"]:
+ for i, env in enumerate(c_attrs["Config"]["Env"] or []):
+ env_name, _ = env.split("=", maxsplit=1)
+ if sensitive_env_var(env_name):
+ c_attrs["Config"]["Env"][i] = f"{env_name}=******"
+ result.append(c_attrs)
+ return safe_json(result, indent=2)
+
+ def _find_self_container_for_endoscopy(self) -> podman.domain.containers.Container:
+ """
+ Find the self container for endoscopy.
+ Only works in mirrored deployment mode.
+
+ Returns:
+ The self container object.
+
+ Raises:
+ UnsupportedError:
+ If endoscopy is not supported in the current environment.
+
+ """
+ try:
+ self_container = self._find_self_container()
+ except podman.errors.APIError as e:
+ msg = "Endoscopy is not supported in the current environment: Mirrored deployment enabled, but failed to get self Container"
+ raise UnsupportedError(msg) from e
+ except Exception as e:
+ msg = "Endoscopy is not supported in the current environment: Failed to get self Container"
+ raise UnsupportedError(msg) from e
+
+ if not self_container:
+ msg = "Endoscopy is not supported in the current environment: Mirrored deployment disabled"
+ raise UnsupportedError(msg)
+ return self_container
+
+ def _endoscopic_logs(
+ self,
+ timestamps: bool = False,
+ tail: int | None = None,
+ since: int | None = None,
+ follow: bool = False,
+ ) -> Generator[bytes | str, None, None] | bytes | str:
+ """
+ Get the logs of the deployer itself.
+ Only works in mirrored deployment mode.
+
+ Args:
+ timestamps:
+ Show timestamps in the logs.
+ tail:
+ Number of lines to show from the end of the logs.
+ since:
+ Show logs since the given epoch in seconds.
+ follow:
+ Whether to follow the logs.
+
+ Returns:
+ The logs as a byte string or a generator yielding byte strings if follow is True.
+
+ Raises:
+ UnsupportedError:
+ If endoscopy is not supported in the current environment.
+ OperationError:
+ If the deployer fails to get logs.
+
+ """
+ self_container = self._find_self_container_for_endoscopy()
+
+ logs_options = {
+ "timestamps": timestamps,
+ "tail": tail if tail >= 0 else None,
+ "since": since,
+ "follow": follow,
+ }
+
+ try:
+ output = self_container.logs(
+ stream=follow,
+ **logs_options,
+ )
+ except podman.errors.APIError as e:
+ msg = f"Failed to fetch logs for self Container {self_container.short_id}{_detail_api_call_error(e)}"
+ raise OperationError(msg) from e
+ else:
+ if not follow:
+ result = bytearray()
+ for chunk in output:
+ result.extend(chunk)
+ return result.decode("utf-8")
+ return output
+
+ def _endoscopic_exec(
+ self,
+ detach: bool = True,
+ command: list[str] | None = None,
+ args: list[str] | None = None,
+ ) -> WorkloadExecStream | bytes | str:
+ """
+ Execute a command in the deployer itself.
+ Only works in mirrored deployment mode.
+
+ Args:
+ detach:
+ Whether to detach from the command.
+ command:
+ The command to execute.
+ If not specified, use /bin/sh and implicitly attach.
+ args:
+ The arguments to pass to the command.
+
+ Returns:
+ If detach is False, return a WorkloadExecStream.
+ otherwise, return the output of the command as a byte string or string.
+
+ Raises:
+ UnsupportedError:
+ If endoscopy is not supported in the current environment.
+ OperationError:
+ If the deployer fails to execute the command.
+
+ """
+ self_container = self._find_self_container_for_endoscopy()
+
+ attach = not detach or not command
+ exec_options = {
+ "stdout": True,
+ "stderr": True,
+ "stdin": attach,
+ "socket": attach,
+ "tty": attach,
+ "cmd": [*command, *(args or [])] if command else ["/bin/sh"],
+ }
+
+ try:
+ _, output = self_container.exec_run(
+ detach=False,
+ **exec_options,
+ )
+ except podman.errors.APIError as e:
+ msg = f"Failed to exec command in self Container {self_container.short_id}{_detail_api_call_error(e)}"
+ raise OperationError(msg) from e
+ else:
+ if not attach:
+ return output
+ return PodmanWorkloadExecStream(output)
+
+ def _endoscopic_inspect(self) -> str:
+ """
+ Inspect the deployer itself.
+ Only works in mirrored deployment mode.
+
+ Returns:
+ The inspection result.
+
+ Raises:
+ UnsupportedError:
+ If endoscopy is not supported in the current environment.
+ OperationError:
+ If the deployer fails to execute the command.
+
+ """
+ self_container = self._find_self_container_for_endoscopy()
+
+ c_attrs = self_container.attrs
+ # Mask sensitive environment variables
+ if "Env" in c_attrs["Config"]:
+ for i, env in enumerate(c_attrs["Config"]["Env"] or []):
+ env_name, _ = env.split("=", maxsplit=1)
+ if sensitive_env_var(env_name):
+ c_attrs["Config"]["Env"][i] = f"{env_name}=******"
+
+ return safe_json(c_attrs, indent=2)
+
 
  def _has_restart_policy(
  container: podman.domain.containers.Container,
@@ -108,7 +108,7 @@ class AMDDetector(Detector):
  dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
  else:
  dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
- dev_hsa_agent = hsa_agents.get(dev_uuid)
+ dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
 
  dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
  dev_driver_ver = dev_gpu_driver_info.get("driver_version")
@@ -108,7 +108,7 @@ class HygonDetector(Detector):
  dev_index = dev_idx
 
  dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
- dev_hsa_agent = hsa_agents.get(dev_uuid)
+ dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
 
  dev_name = dev_hsa_agent.name
  if not dev_name: