gpustack-runtime 0.1.38.post4__py3-none-any.whl → 0.1.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2114 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import json
5
+ import logging
6
+ import operator
7
+ import os
8
+ import socket
9
+ import sys
10
+ from dataclasses import dataclass, field
11
+ from functools import lru_cache, reduce, wraps
12
+ from math import ceil
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING, Any
15
+ from unittest.mock import patch
16
+
17
+ import podman
18
+ import podman.domain
19
+ import podman.domain.containers
20
+ import podman.domain.images
21
+ import podman.domain.volumes
22
+ import podman.errors
23
+ from dataclasses_json import dataclass_json
24
+ from podman.api import parse_repository
25
+ from podman.domain.containers_create import CreateMixin
26
+ from tqdm import tqdm
27
+
28
+ from .. import envs
29
+ from ..logging import debug_log_exception
30
+ from .__patches__ import patch_render_payload
31
+ from .__types__ import (
32
+ Container,
33
+ ContainerCheck,
34
+ ContainerImagePullPolicyEnum,
35
+ ContainerMountModeEnum,
36
+ ContainerProfileEnum,
37
+ ContainerRestartPolicyEnum,
38
+ Deployer,
39
+ OperationError,
40
+ UnsupportedError,
41
+ WorkloadExecStream,
42
+ WorkloadName,
43
+ WorkloadNamespace,
44
+ WorkloadOperationToken,
45
+ WorkloadPlan,
46
+ WorkloadStatus,
47
+ WorkloadStatusOperation,
48
+ WorkloadStatusStateEnum,
49
+ )
50
+ from .__utils__ import _MiB, bytes_to_human_readable, replace_image_with, safe_json
51
+
52
+ if TYPE_CHECKING:
53
+ from collections.abc import Callable, Generator
54
+
55
+ logger = logging.getLogger(__name__)
56
+ clogger = logger.getChild("conversion")
57
+
58
+ _LABEL_WORKLOAD = f"{envs.GPUSTACK_RUNTIME_DEPLOY_LABEL_PREFIX}/workload"
59
+ _LABEL_COMPONENT = f"{envs.GPUSTACK_RUNTIME_DEPLOY_LABEL_PREFIX}/component"
60
+ _LABEL_COMPONENT_NAME = f"{_LABEL_COMPONENT}-name"
61
+ _LABEL_COMPONENT_INDEX = f"{_LABEL_COMPONENT}-index"
62
+ _LABEL_COMPONENT_HEAL_PREFIX = f"{_LABEL_COMPONENT}-heal"
63
+
64
+
65
+ @dataclass_json
66
+ @dataclass
67
+ class PodmanWorkloadPlan(WorkloadPlan):
68
+ """
69
+ Workload plan implementation for Podman containers.
70
+
71
+ Attributes:
72
+ pause_image (str):
73
+ Image used for the pause container.
74
+ unhealthy_restart_image (str):
75
+             Image used for the unhealthy-restart container.
76
+         resource_key_runtime_env_mapping (dict[str, str]):
77
+ Mapping from resource names to environment variable names for device allocation,
78
+ which is used to tell the Container Runtime which GPUs to mount into the container.
79
+ For example, {"nvidia.com/gpu": "NVIDIA_VISIBLE_DEVICES"},
80
+ which sets the "NVIDIA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
81
+             In privileged mode, the container can access all GPUs even if specific devices are specified.
82
+         resource_key_backend_env_mapping (dict[str, list[str]]):
83
+ Mapping from resource names to environment variable names for device runtime,
84
+ which is used to tell the Device Runtime (e.g., ROCm, CUDA, OneAPI) which GPUs to use inside the container.
85
+ For example, {"nvidia.com/gpu": ["CUDA_VISIBLE_DEVICES"]},
86
+ which sets the "CUDA_VISIBLE_DEVICES" environment variable to the allocated GPU device IDs.
87
+ namespace (str | None):
88
+ Namespace of the workload.
89
+ name (str):
90
+             Name of the workload;
91
+             it must be unique within the deployer.
92
+ labels (dict[str, str] | None):
93
+ Labels to attach to the workload.
94
+ host_network (bool):
95
+ Indicates if the containers of the workload use the host network.
96
+ host_ipc (bool):
97
+ Indicates if the containers of the workload use the host IPC.
98
+ pid_shared (bool):
99
+ Indicates if the containers of the workload share the PID namespace.
100
+ shm_size (int | str | None):
101
+ Configure shared memory size for the workload.
102
+ run_as_user (int | None):
103
+ The user ID to run the workload as.
104
+ run_as_group (int | None):
105
+ The group ID to run the workload as.
106
+ fs_group (int | None):
107
+ The group ID to own the filesystem of the workload.
108
+ sysctls (dict[str, str] | None):
109
+ Sysctls to set for the workload.
110
+         containers (list[Container] | None):
111
+ List of containers in the workload.
112
+ It must contain at least one "RUN" profile container.
113
+
114
+ """
115
+
116
+ pause_image: str = envs.GPUSTACK_RUNTIME_PODMAN_PAUSE_IMAGE
117
+ """
118
+ Image used for the pause container.
119
+ """
120
+ unhealthy_restart_image: str = envs.GPUSTACK_RUNTIME_PODMAN_UNHEALTHY_RESTART_IMAGE
121
+ """
122
+     Image used for the unhealthy-restart container.
123
+ """
124
+
125
+ def validate_and_default(self):
126
+ """
127
+ Validate and set defaults for the workload plan.
128
+
129
+ Raises:
130
+ ValueError:
131
+ If the workload plan is invalid.
132
+
133
+ """
134
+ if self.labels is None:
135
+ self.labels = {}
136
+ if self.containers is None:
137
+ self.containers = []
138
+
139
+ self.labels[_LABEL_WORKLOAD] = self.name
140
+
141
+ # Default and validate in the base class.
142
+ super().validate_and_default()
143
+
144
+ # Adjust default image namespace if needed.
145
+ if namespace := envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_NAMESPACE:
146
+ self.pause_image = replace_image_with(
147
+ image=self.pause_image,
148
+ namespace=namespace,
149
+ )
150
+ self.unhealthy_restart_image = replace_image_with(
151
+ image=self.unhealthy_restart_image,
152
+ namespace=namespace,
153
+ )
154
+
155
+
156
+ @dataclass_json
157
+ @dataclass
158
+ class PodmanWorkloadStatus(WorkloadStatus):
159
+ """
160
+ Workload status implementation for Podman containers.
161
+ """
162
+
163
+ _d_containers: list[podman.domain.containers.Container] | None = field(
164
+ default=None,
165
+ repr=False,
166
+ metadata={
167
+ "dataclasses_json": {
168
+ "exclude": lambda _: True,
169
+ "encoder": lambda _: None,
170
+ "decoder": lambda _: [],
171
+ },
172
+ },
173
+ )
174
+ """
175
+ List of Podman containers in the workload,
176
+ internal use only.
177
+ """
178
+
179
+ @staticmethod
180
+ def parse_state(
181
+ d_containers: list[podman.domain.containers.Container],
182
+ ) -> WorkloadStatusStateEnum:
183
+ """
184
+ Parse the state of the workload based on the status of its containers.
185
+
186
+ Args:
187
+ d_containers:
188
+ List of Podman containers in the workload.
189
+
190
+ Returns:
191
+ The state of the workload.
192
+
193
+ """
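+         # Aggregation order: a dead container, or a failed one without a
+         # restart policy, wins immediately (FAILED), then UNHEALTHY; an init
+         # container still INITIALIZING overrides the aggregated run state.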
194
+ d_init_containers: list[podman.domain.containers.Container] = []
195
+ d_run_containers: list[podman.domain.containers.Container] = []
196
+ for c in d_containers:
197
+ if c.labels.get(_LABEL_COMPONENT) == "init":
198
+ d_init_containers.append(c)
199
+ elif c.labels.get(_LABEL_COMPONENT) == "run":
200
+ d_run_containers.append(c)
201
+
202
+ if not d_run_containers:
203
+ if not d_init_containers:
204
+ return WorkloadStatusStateEnum.UNKNOWN
205
+ return WorkloadStatusStateEnum.INACTIVE
206
+
207
+ d_run_state = WorkloadStatusStateEnum.RUNNING
208
+ for cr in d_run_containers:
209
+ if cr.status == "dead":
210
+ return WorkloadStatusStateEnum.FAILED
211
+ if cr.status == "exited":
212
+ if cr.attrs["State"].get("ExitCode", 1) != 0:
213
+ return (
214
+ WorkloadStatusStateEnum.FAILED
215
+ if not _has_restart_policy(cr)
216
+ else WorkloadStatusStateEnum.UNHEALTHY
217
+ )
218
+ return WorkloadStatusStateEnum.INACTIVE
219
+ if cr.status == "paused":
220
+ return WorkloadStatusStateEnum.INACTIVE
221
+ if cr.status in ["restarting", "removing"]:
222
+ return WorkloadStatusStateEnum.UNHEALTHY
223
+ if cr.status == "created":
224
+ d_run_state = WorkloadStatusStateEnum.PENDING
225
+ else:
226
+ health = cr.attrs["State"].get("Health", {})
227
+ if health and health.get("Status", "healthy") not in ["healthy", ""]:
228
+ return WorkloadStatusStateEnum.UNHEALTHY
229
+
230
+ d_init_state = None
231
+ for ci in d_init_containers or []:
232
+ if ci.status == "dead":
233
+ return WorkloadStatusStateEnum.FAILED
234
+ if ci.status == "exited":
235
+ if ci.attrs["State"].get("ExitCode", 1) != 0:
236
+ return (
237
+ WorkloadStatusStateEnum.FAILED
238
+ if not _has_restart_policy(ci)
239
+ else WorkloadStatusStateEnum.UNHEALTHY
240
+ )
241
+ elif ci.status in ["paused", "removing"]:
242
+ if _has_restart_policy(ci):
243
+ return WorkloadStatusStateEnum.UNHEALTHY
244
+ elif ci.status == "restarting":
245
+ if _has_restart_policy(ci):
246
+ return WorkloadStatusStateEnum.UNHEALTHY
247
+ d_init_state = WorkloadStatusStateEnum.INITIALIZING
248
+ elif ci.status == "created":
249
+ return WorkloadStatusStateEnum.PENDING
250
+ elif not _has_restart_policy(ci):
251
+ d_init_state = WorkloadStatusStateEnum.INITIALIZING
252
+
253
+ return d_init_state if d_init_state else d_run_state
254
+
255
+ def __init__(
256
+ self,
257
+ name: WorkloadName,
258
+ d_containers: list[podman.domain.containers.Container],
259
+ **kwargs,
260
+ ):
261
+ created_at = d_containers[0].attrs["Created"]
262
+ labels = {
263
+ k: v
264
+ for k, v in d_containers[0].labels.items()
265
+ if not k.startswith(f"{envs.GPUSTACK_RUNTIME_DEPLOY_LABEL_PREFIX}/")
266
+ }
267
+
268
+ super().__init__(
269
+ name=name,
270
+ created_at=created_at,
271
+ labels=labels,
272
+ **kwargs,
273
+ )
274
+
275
+ self._d_containers = d_containers
276
+
277
+ for c in d_containers:
278
+ op = WorkloadStatusOperation(
279
+ name=c.labels.get(_LABEL_COMPONENT_NAME, "") or c.name,
280
+ token=c.attrs.get("Id", "") or c.name,
281
+ )
282
+ match c.labels.get(_LABEL_COMPONENT):
283
+ case "init":
284
+ if c.status == "running" and _has_restart_policy(c):
285
+ self.executable.append(op)
286
+ self.loggable.append(op)
287
+ case "run":
288
+ self.executable.append(op)
289
+ self.loggable.append(op)
290
+
291
+ self.state = self.parse_state(d_containers)
292
+
293
+
294
+ _NAME = "podman"
295
+ """
296
+ Name of the Podman deployer.
297
+ """
298
+
299
+
300
+ class PodmanDeployer(Deployer):
301
+ """
302
+ Deployer implementation for Podman containers.
303
+ """
304
+
305
+ _client: podman.PodmanClient | None = None
306
+ """
307
+ Client for interacting with the Podman daemon.
308
+ """
309
+ _container_ephemeral_files_dir: Path | None = None
310
+ """
311
+ Directory for ephemeral files inside containers, internal use only.
312
+ """
313
+ _mutate_create_options: Callable[[dict[str, Any]], dict[str, Any]] | None = None
314
+ """
315
+ Function to handle mirrored deployment, internal use only.
316
+ """
317
+
318
+ @staticmethod
319
+ @lru_cache
320
+ def is_supported() -> bool:
321
+ """
322
+ Check if Podman is supported in the current environment.
323
+
324
+ Returns:
325
+ True if supported, False otherwise.
326
+
327
+ """
328
+ supported = False
329
+ if envs.GPUSTACK_RUNTIME_DEPLOY.lower() not in ("auto", _NAME):
330
+ return supported
331
+
332
+ client = PodmanDeployer._get_client()
333
+ if client:
334
+ try:
335
+ supported = client.ping()
336
+ if envs.GPUSTACK_RUNTIME_LOG_EXCEPTION:
337
+ version_info = client.version()
338
+ logger.debug(
339
+ "Connected to Podman API server: %s",
340
+ version_info,
341
+ )
342
+ except podman.errors.APIError:
343
+ debug_log_exception(logger, "Failed to connect to Podman API server")
344
+
345
+ return supported
346
+
347
+ @staticmethod
348
+ def _get_client() -> podman.PodmanClient | None:
349
+ """
350
+ Return a Podman client.
351
+
352
+ Returns:
353
+ A Podman client if available, None otherwise.
354
+
355
+ """
356
+ client = None
357
+
358
+ try:
359
+ with (
360
+ Path(os.devnull).open("w") as dev_null,
361
+ contextlib.redirect_stdout(dev_null),
362
+ contextlib.redirect_stderr(dev_null),
363
+ ):
364
+ os_env = os.environ.copy()
365
+ if envs.GPUSTACK_RUNTIME_PODMAN_HOST:
366
+ os_env["CONTAINER_HOST"] = envs.GPUSTACK_RUNTIME_PODMAN_HOST
367
+ client = podman.from_env(environment=os_env)
368
+ except podman.errors.DockerException as e:
369
+ if "FileNotFoundError" not in str(e):
370
+ debug_log_exception(logger, "Failed to get Podman client")
371
+
372
+ return client
373
+
374
+ @staticmethod
375
+ def _supported(func):
376
+ """
377
+ Decorator to check if Podman is supported in the current environment.
378
+
379
+ """
380
+
381
+         @wraps(func)
+         def wrapper(self, *args, **kwargs):
382
+ if not self.is_supported():
383
+ msg = "Podman is not supported in the current environment."
384
+ raise UnsupportedError(msg)
385
+ return func(self, *args, **kwargs)
386
+
387
+ return wrapper
388
+
389
+ @staticmethod
390
+ def _create_ephemeral_files(
391
+ workload: PodmanWorkloadPlan,
392
+ ) -> dict[tuple[int, str], str]:
393
+ """
394
+         Create ephemeral files as local files for the workload.
395
+
396
+ Returns:
397
+ A mapping from (container index, configured path) to actual filename.
398
+
399
+ Raises:
400
+ OperationError:
401
+                 If the ephemeral files cannot be created.
402
+
403
+ """
404
+ # Map (container index, configured path) to actual filename.
405
+ ephemeral_filename_mapping: dict[tuple[int, str], str] = {}
406
+ ephemeral_files: list[tuple[str, str, int]] = []
407
+ for ci, c in enumerate(workload.containers):
408
+ for fi, f in enumerate(c.files or []):
409
+ if f.content is not None:
410
+ fn = f"{workload.name}-{ci}-{fi}"
411
+ ephemeral_filename_mapping[(ci, f.path)] = fn
412
+ ephemeral_files.append((fn, f.content, f.mode))
413
+ if not ephemeral_filename_mapping:
414
+ return ephemeral_filename_mapping
415
+
416
+         # Write the ephemeral files into the local ephemeral files directory.
417
+ try:
418
+ for fn, fc, fm in ephemeral_files:
419
+ fp = envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR.joinpath(fn)
420
+ with fp.open("w", encoding="utf-8") as f:
421
+ f.write(fc)
422
+ f.flush()
423
+ fp.chmod(fm)
424
+ logger.debug("Created local file %s with mode %s", fp, oct(fm))
425
+ except OSError as e:
426
+ msg = "Failed to create ephemeral files"
427
+ raise OperationError(msg) from e
428
+
429
+ return ephemeral_filename_mapping
430
+
431
+ def _create_ephemeral_volumes(self, workload: PodmanWorkloadPlan) -> dict[str, str]:
432
+ """
433
+ Create ephemeral volumes for the workload.
434
+
435
+ Returns:
436
+ A mapping from configured volume name to actual volume name.
437
+
438
+ Raises:
439
+ OperationError:
440
+                 If the ephemeral volumes cannot be created.
441
+
442
+ """
443
+ # Map configured volume name to actual volume name.
444
+ ephemeral_volume_name_mapping: dict[str, str] = {
445
+ m.volume: f"{workload.name}-{m.volume}"
446
+ for c in workload.containers
447
+ for m in c.mounts or []
448
+ if m.volume
449
+ }
450
+ if not ephemeral_volume_name_mapping:
451
+ return ephemeral_volume_name_mapping
452
+
453
+ # Create volumes.
454
+ try:
455
+ for n in ephemeral_volume_name_mapping.values():
456
+ self._client.volumes.create(
457
+ name=n,
458
+ driver="local",
459
+ labels=workload.labels,
460
+ )
461
+ logger.debug("Created volume %s", n)
462
+ except podman.errors.APIError as e:
463
+ msg = f"Failed to create ephemeral volumes{_detail_api_call_error(e)}"
464
+ raise OperationError(msg) from e
465
+
466
+ return ephemeral_volume_name_mapping
467
+
468
+ def _pull_image(self, image: str) -> podman.domain.images.Image:
469
+ try:
470
+ logger.info("Pulling image %s", image)
471
+
472
+ repo, tag = parse_repository(image)
473
+ tag = tag or "latest"
474
+ auth_config = None
475
+ if (
476
+ envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME
477
+ and envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD
478
+ ):
479
+ auth_config = {
480
+ "username": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_USERNAME,
481
+ "password": envs.GPUSTACK_RUNTIME_DEPLOY_DEFAULT_IMAGE_REGISTRY_PASSWORD,
482
+ }
483
+
484
+ logs = self._client.api.pull(
485
+ repo,
486
+ tag=tag,
487
+ stream=True,
488
+ decode=True,
489
+ auth_config=auth_config,
490
+ )
491
+ _print_pull_logs(logs, image, tag)
492
+
493
+ logger.info("Pulled image %s", image)
494
+
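+             # Digest references ("sha256:...") attach with "@", plain tags with ":".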
495
+ sep = "@" if tag.startswith("sha256:") else ":"
496
+ return self._client.images.get(f"{repo}{sep}{tag}")
497
+ except json.decoder.JSONDecodeError as e:
498
+ msg = f"Failed to pull image {image}, invalid response"
499
+ raise OperationError(msg) from e
500
+ except podman.errors.APIError as e:
501
+ msg = f"Failed to pull image {image}{_detail_api_call_error(e)}"
502
+ raise OperationError(msg) from e
503
+
504
+ def _get_image(
505
+ self,
506
+ image: str,
507
+ policy: ContainerImagePullPolicyEnum | None = None,
508
+ ) -> podman.domain.images.Image:
509
+ """
510
+ Get image.
511
+
512
+ Args:
513
+ image:
514
+ The image to get.
515
+ policy:
516
+ The image pull policy.
517
+ If not specified, defaults to IF_NOT_PRESENT.
518
+
519
+ Returns:
520
+ The image object.
521
+
522
+ Raises:
523
+ OperationError:
524
+                 If the image cannot be retrieved.
525
+
526
+ """
527
+ if not policy:
528
+ policy = ContainerImagePullPolicyEnum.IF_NOT_PRESENT
529
+
530
+ try:
531
+ if policy == ContainerImagePullPolicyEnum.ALWAYS:
532
+ return self._pull_image(image)
533
+ except podman.errors.APIError as e:
534
+ msg = f"Failed to get image {image}{_detail_api_call_error(e)}"
535
+ raise OperationError(msg) from e
536
+
537
+ try:
538
+ return self._client.images.get(image)
539
+ except podman.errors.ImageNotFound:
540
+ if policy == ContainerImagePullPolicyEnum.NEVER:
541
+ raise
542
+ return self._pull_image(image)
543
+ except podman.errors.APIError as e:
544
+ msg = f"Failed to get image {image}{_detail_api_call_error(e)}"
545
+ raise OperationError(msg) from e
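+     # Pull-policy semantics in brief: ALWAYS pulls before returning, NEVER
+     # surfaces podman.errors.ImageNotFound on a local miss, and the default
+     # IF_NOT_PRESENT pulls only when the image is absent locally.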
546
+
547
+ def _create_pause_container(
548
+ self,
549
+ workload: PodmanWorkloadPlan,
550
+ ) -> podman.domain.containers.Container:
551
+ """
552
+ Create the pause container for the workload.
553
+
554
+ Returns:
555
+ The pause container object.
556
+
557
+ Raises:
558
+ OperationError:
559
+                 If the pause container cannot be created.
560
+
561
+ """
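+         # The pause container plays the Kubernetes sandbox role: it owns the
+         # shared network/IPC (and optionally PID) namespaces that the init and
+         # run containers join, so those namespaces survive container restarts.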
562
+ container_name = f"{workload.name}-pause"
563
+ try:
564
+ container = self._client.containers.get(container_name)
565
+ except podman.errors.NotFound:
566
+ pass
567
+ except podman.errors.APIError as e:
568
+ msg = f"Failed to confirm whether container {container_name} exists{_detail_api_call_error(e)}"
569
+ raise OperationError(msg) from e
570
+ else:
571
+ # TODO(thxCode): check if the container matches the spec
572
+ return container
573
+
574
+ privileged = any(
575
+ c.execution.privileged
576
+ for c in workload.containers
577
+ if c.profile == ContainerProfileEnum.RUN and c.execution
578
+ )
579
+
580
+ create_options: dict[str, Any] = {
581
+ "name": container_name,
582
+ "restart_policy": {"Name": "no"},
583
+ "network_mode": "bridge",
584
+ "ipc_mode": "shareable",
585
+ "oom_score_adj": -998,
586
+ "privileged": privileged,
587
+ "no_new_privileges": True,
588
+ "labels": {
589
+ **workload.labels,
590
+ _LABEL_COMPONENT: "pause",
591
+ },
592
+ }
593
+
594
+ if workload.host_network:
595
+ create_options["network_mode"] = "host"
596
+ else:
597
+ create_options["hostname"] = workload.name
598
+ port_mapping: dict[str, int] = {
599
+ # <internal port>/<protocol>: <external port>
600
+ f"{p.internal}/{p.protocol.lower()}": p.external or p.internal
601
+ for c in workload.containers
602
+ if c.profile == ContainerProfileEnum.RUN
603
+ for p in c.ports or []
604
+ }
605
+ if port_mapping:
606
+ create_options["ports"] = port_mapping
607
+
608
+ if workload.host_ipc:
609
+ create_options["ipc_mode"] = "host"
610
+ elif workload.shm_size:
611
+ create_options["shm_size"] = workload.shm_size
612
+
613
+ if envs.GPUSTACK_RUNTIME_DEPLOY_PRINT_CONVERSION:
614
+ clogger.info(
615
+ f"Creating pause container %s with options:{os.linesep}%s",
616
+ container_name,
617
+ safe_json(create_options, indent=2),
618
+ )
619
+
620
+ try:
621
+ d_container = self._client.containers.create(
622
+ image=self._get_image(workload.pause_image),
623
+ detach=True,
624
+ **create_options,
625
+ )
626
+ except podman.errors.APIError as e:
627
+ msg = f"Failed to create container {container_name}{_detail_api_call_error(e)}"
628
+ raise OperationError(msg) from e
629
+ else:
630
+ return d_container
631
+
632
+ def _create_unhealthy_restart_container(
633
+ self,
634
+ workload: PodmanWorkloadPlan,
635
+ ) -> podman.domain.containers.Container | None:
636
+ """
637
+ Create the unhealthy restart container for the workload if needed.
638
+
639
+ Returns:
640
+ The unhealthy restart container object if created, None otherwise.
641
+
642
+ Raises:
643
+ OperationError:
644
+                 If the unhealthy-restart container cannot be created.
645
+
646
+ """
647
+ # Check if the first check of any RUN container has teardown enabled.
648
+ enabled = any(
649
+ c.checks[0].teardown
650
+ for c in workload.containers
651
+ if c.profile == ContainerProfileEnum.RUN and c.checks
652
+ )
653
+ if not enabled:
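+     # A minimal usage sketch (assuming a Container configured with the "RUN"
+     # profile, as the attribute docs above require; Container fields live in
+     # .__types__ and are not restated here):
+     #
+     #   plan = PodmanWorkloadPlan(name="demo", containers=[run_container])
+     #   plan.validate_and_default()  # stamps labels, rewrites image namespaces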
654
+ return None
655
+
656
+ container_name = f"{workload.name}-unhealthy-restart"
657
+ try:
658
+ d_container = self._client.containers.get(container_name)
659
+ except podman.errors.NotFound:
660
+ pass
661
+ except podman.errors.APIError as e:
662
+ msg = f"Failed to confirm whether container {container_name} exists{_detail_api_call_error(e)}"
663
+ raise OperationError(msg) from e
664
+ else:
665
+ # TODO(thxCode): check if the container matches the spec
666
+ return d_container
667
+
668
+ host_socket_path = None
669
+ if envs.GPUSTACK_RUNTIME_PODMAN_HOST.startswith("http+unix://"):
670
+ host_socket_path = envs.GPUSTACK_RUNTIME_PODMAN_HOST[len("http+unix://") :]
671
+ elif envs.GPUSTACK_RUNTIME_PODMAN_HOST.startswith("unix://"):
672
+ host_socket_path = envs.GPUSTACK_RUNTIME_PODMAN_HOST[len("unix://") :]
673
+ if host_socket_path and not host_socket_path.startswith("/"):
674
+ host_socket_path = f"/{host_socket_path}"
675
+
676
+ create_options: dict[str, Any] = {
677
+ "name": container_name,
678
+ "restart_policy": {"Name": "always"},
679
+ "network_mode": "none",
680
+ "labels": {
681
+ **workload.labels,
682
+ _LABEL_COMPONENT: "unhealthy-restart",
683
+ },
684
+ "environment": [
685
+ f"AUTOHEAL_CONTAINER_LABEL={_LABEL_COMPONENT_HEAL_PREFIX}-{workload.name}",
686
+ ],
687
+ }
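+         # The helper follows the docker-autoheal convention: it restarts any
+         # container labeled with the AUTOHEAL_CONTAINER_LABEL value set above
+         # once that container's healthcheck reports unhealthy.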
688
+ if host_socket_path:
689
+             create_options["volumes"] = [
690
+                 f"{host_socket_path}:/var/run/docker.sock",
691
+             ]
694
+ elif envs.GPUSTACK_RUNTIME_PODMAN_HOST:
695
+ create_options["environment"].append(
696
+ f"DOCKER_SOCK={envs.GPUSTACK_RUNTIME_PODMAN_HOST}",
697
+ )
698
+
699
+ if envs.GPUSTACK_RUNTIME_DEPLOY_PRINT_CONVERSION:
700
+ clogger.info(
701
+ f"Creating unhealthy restart container %s with options:{os.linesep}%s",
702
+ container_name,
703
+ safe_json(create_options, indent=2),
704
+ )
705
+
706
+ try:
707
+ d_container = self._client.containers.create(
708
+ image=self._get_image(workload.unhealthy_restart_image),
709
+ detach=True,
710
+ **create_options,
711
+ )
712
+ except podman.errors.APIError as e:
713
+ msg = f"Failed to create container {container_name}{_detail_api_call_error(e)}"
714
+ raise OperationError(msg) from e
715
+ else:
716
+ return d_container
717
+
718
+ def _append_container_mounts(
719
+ self,
720
+ create_options: dict[str, Any],
721
+ c: Container,
722
+ ci: int,
723
+         ephemeral_filename_mapping: dict[tuple[int, str], str],
724
+ ephemeral_volume_name_mapping: dict[str, str],
725
+ ):
726
+ """
727
+ Append (bind) mounts into the create_options.
728
+ """
729
+ mount_binding: list[dict] = []
730
+
731
+ if files := c.files:
732
+ for f in files:
733
+ binding = {
734
+ "type": "bind",
735
+ "source": "",
736
+ "target": "",
737
+ }
738
+
739
+ if f.content is not None:
740
+                     # Ephemeral file, bind it from the local ephemeral files directory.
741
+ if (ci, f.path) not in ephemeral_filename_mapping:
742
+ continue
743
+ fn = ephemeral_filename_mapping[(ci, f.path)]
744
+ path = str(
745
+ self._container_ephemeral_files_dir.joinpath(fn),
746
+ )
747
+ binding["source"] = path
748
+ binding["target"] = f"/{f.path.lstrip('/')}"
749
+ elif f.path:
750
+ # Host file, bind directly.
751
+ binding["source"] = f.path
752
+ binding["target"] = f.path
753
+ else:
754
+ continue
755
+
756
+ if f.mode < 0o600:
757
+ binding["read_only"] = True
758
+
759
+ mount_binding.append(binding)
760
+
761
+ if mounts := c.mounts:
762
+ for m in mounts:
763
+ binding = {
764
+ "type": "volume",
765
+ "source": "",
766
+ "target": "",
767
+ }
768
+
769
+ if m.volume:
770
+ # Ephemeral volume, use the created volume.
771
+ binding["source"] = ephemeral_volume_name_mapping.get(
772
+ m.volume,
773
+ m.volume,
774
+ )
775
+ binding["target"] = f"/{m.path.lstrip('/')}"
776
+ # TODO(thxCode): support subpath.
777
+ elif m.path:
778
+ # Host path, bind directly.
779
+ binding["type"] = "bind"
780
+ binding["source"] = m.path
781
+ binding["target"] = m.path
782
+ else:
783
+ continue
784
+
785
+ if m.mode != ContainerMountModeEnum.RWX:
786
+ binding["read_only"] = True
787
+
788
+ mount_binding.append(binding)
789
+
790
+ if mount_binding:
791
+ create_options["mounts"] = mount_binding
792
+
793
+ @staticmethod
794
+ def _parameterize_healthcheck(
795
+ chk: ContainerCheck,
796
+ ) -> dict[str, Any]:
797
+ """
798
+ Parameterize health check for a container.
799
+
800
+ Returns:
801
+ A dictionary representing the health check configuration.
802
+
803
+ Raises:
804
+ ValueError:
805
+ If the health check configuration is invalid.
806
+
807
+ """
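+         # Docker-compatible healthcheck durations are expressed in nanoseconds,
+         # hence the 1000000000 multipliers.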
808
+ healthcheck: dict[str, Any] = {
809
+ "start_period": chk.delay * 1000000000,
810
+ "interval": chk.interval * 1000000000,
811
+ "timeout": chk.timeout * 1000000000,
812
+ "retries": chk.retries,
813
+ }
814
+
815
+ configured = False
816
+ for attr_k in ["execution", "tcp", "http", "https"]:
817
+ attr_v = getattr(chk, attr_k, None)
818
+ if not attr_v:
819
+ continue
820
+ configured = True
821
+ match attr_k:
822
+ case "execution":
823
+ if attr_v.command:
824
+ healthcheck["test"] = [
825
+ "CMD",
826
+ *attr_v.command,
827
+ ]
828
+ case "tcp":
829
+ host = attr_v.host or "127.0.0.1"
830
+ port = attr_v.port or 80
831
+ healthcheck["test"] = [
832
+ "CMD",
833
+ "sh",
834
+ "-c",
835
+                         f"if [ `command -v netstat` ]; then netstat -an | grep -w {port} >/dev/null || exit 1; elif [ `command -v nc` ]; then nc -z {host} {port} >/dev/null || exit 1; else cat /etc/services | grep -w {port}/tcp >/dev/null || exit 1; fi",
836
+ ]
837
+ case "http" | "https":
838
+ curl_options = "-fsSL -o /dev/null"
839
+ wget_options = "-q -O /dev/null"
840
+ if attr_k == "https":
841
+ curl_options += " -k"
842
+ wget_options += " --no-check-certificate"
843
+ if attr_v.headers:
844
+ for hk, hv in attr_v.headers.items():
845
+ curl_options += f" -H '{hk}: {hv}'"
846
+ wget_options += f" --header='{hk}: {hv}'"
847
+ url = f"{attr_k}://{attr_v.host or '127.0.0.1'}:{attr_v.port or 80}{attr_v.path or '/'}"
848
+ healthcheck["test"] = [
849
+ "CMD",
850
+ "sh",
851
+ "-c",
852
+ f"if [ `command -v curl` ]; then curl {curl_options} {url}; else wget {wget_options} {url}; fi",
853
+ ]
854
+ break
855
+ if not configured:
856
+ msg = "Invalid health check configuration"
857
+ raise ValueError(msg)
858
+
859
+ return healthcheck
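+     # For illustration, a hypothetical HTTP probe (delay=5, interval=10,
+     # timeout=3, retries=3, port 8080) yields roughly:
+     #
+     #   {
+     #       "start_period": 5_000_000_000,
+     #       "interval": 10_000_000_000,
+     #       "timeout": 3_000_000_000,
+     #       "retries": 3,
+     #       "test": ["CMD", "sh", "-c", "if [ `command -v curl` ]; then curl ...; fi"],
+     #   }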
860
+
861
+ def _create_containers(
862
+ self,
863
+ workload: PodmanWorkloadPlan,
864
+         ephemeral_filename_mapping: dict[tuple[int, str], str],
865
+ ephemeral_volume_name_mapping: dict[str, str],
866
+ pause_container: podman.domain.containers.Container,
867
+     ) -> tuple[
868
+         list[podman.domain.containers.Container],
869
+         list[podman.domain.containers.Container],
870
+     ]:
871
+ """
872
+ Create init and run containers for the workload.
873
+
875
+ Returns:
876
+ A tuple of two lists: (init containers, run containers).
877
+
878
+ Raises:
879
+ OperationError:
880
+                 If the containers cannot be created.
881
+
882
+ """
883
+ d_init_containers: list[podman.domain.containers.Container] = []
884
+ d_run_containers: list[podman.domain.containers.Container] = []
885
+
886
+ pause_container_namespace = {
887
+ "nsmode": "container",
888
+ "value": pause_container.id,
889
+ }
890
+ for ci, c in enumerate(workload.containers):
891
+ container_name = f"{workload.name}-{c.profile.lower()}-{ci}"
892
+ try:
893
+ d_container = self._client.containers.get(container_name)
894
+ except podman.errors.NotFound:
895
+ pass
896
+ except podman.errors.APIError as e:
897
+ msg = f"Failed to confirm whether container {container_name} exists{_detail_api_call_error(e)}"
898
+ raise OperationError(msg) from e
899
+ else:
900
+ # TODO(thxCode): check if the container matches the spec
901
+ if c.profile == ContainerProfileEnum.INIT:
902
+ d_init_containers.append(d_container)
903
+ else:
904
+ d_run_containers.append(d_container)
905
+ continue
906
+
907
+ detach = c.profile == ContainerProfileEnum.RUN
908
+
909
+ create_options: dict[str, Any] = {
910
+ "name": container_name,
911
+ "network_mode": pause_container_namespace,
912
+ "ipc_mode": pause_container_namespace,
913
+ "labels": {
914
+ **workload.labels,
915
+ _LABEL_COMPONENT: f"{c.profile.lower()}",
916
+ _LABEL_COMPONENT_NAME: c.name,
917
+ _LABEL_COMPONENT_INDEX: str(ci),
918
+ },
919
+ }
920
+
921
+ if not workload.host_ipc and workload.shm_size:
922
+ create_options["shm_size"] = workload.shm_size
923
+
924
+ if workload.pid_shared:
925
+ create_options["pid_mode"] = pause_container_namespace
926
+
927
+ # Parameterize restart policy.
928
+ match c.restart_policy:
929
+ case ContainerRestartPolicyEnum.ON_FAILURE:
930
+ create_options["restart_policy"] = {
931
+ "Name": "on-failure",
932
+ }
933
+ case ContainerRestartPolicyEnum.ALWAYS:
934
+ create_options["restart_policy"] = {
935
+ "Name": "always",
936
+ }
937
+
938
+ # Parameterize execution.
939
+ if c.execution:
940
+ create_options["working_dir"] = c.execution.working_dir
941
+ create_options["entrypoint"] = c.execution.command
942
+ create_options["command"] = c.execution.args
943
+ run_as_user = c.execution.run_as_user or workload.run_as_user
944
+ run_as_group = c.execution.run_as_group or workload.run_as_group
945
+ if run_as_user is not None:
946
+ create_options["user"] = run_as_user
947
+ if run_as_group is not None:
948
+ create_options["user"] = f"{run_as_user}:{run_as_group}"
949
+ if run_as_group is not None:
950
+ create_options["group_add"] = [run_as_group]
951
+ if workload.fs_group is not None:
952
+ create_options["group_add"] = [run_as_group, workload.fs_group]
953
+ elif workload.fs_group is not None:
954
+ create_options["group_add"] = [workload.fs_group]
955
+ create_options["sysctls"] = (
956
+ {sysctl.name: sysctl.value for sysctl in workload.sysctls or []}
957
+ if workload.sysctls
958
+ else None
959
+ )
960
+ create_options["read_only"] = c.execution.readonly_rootfs
961
+ create_options["privileged"] = c.execution.privileged
962
+ if cap := c.execution.capabilities:
963
+ create_options["cap_add"] = cap.add
964
+ create_options["cap_drop"] = cap.drop
965
+
966
+ # Parameterize environment variables.
967
+ create_options["environment"] = {e.name: e.value for e in c.envs or []}
968
+
969
+ # Parameterize resources.
970
+ if c.resources:
971
+ r_k_runtime_env = workload.resource_key_runtime_env_mapping or {}
972
+ r_k_backend_env = workload.resource_key_backend_env_mapping or {}
973
+ vd_env, vd_cdis, vd_values = self.get_visible_devices_values()
974
+ for r_k, r_v in c.resources.items():
975
+ match r_k:
976
+ case "cpu":
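+                             # One requested core maps to 1024 cgroup cpu.shares
+                             # (the Docker-compatible relative weight); only
+                             # integral strings pass the isdigit() guard below.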
977
+ if isinstance(r_v, int | float):
978
+ create_options["cpu_shares"] = ceil(r_v * 1024)
979
+ elif isinstance(r_v, str) and r_v.isdigit():
980
+ create_options["cpu_shares"] = ceil(float(r_v) * 1024)
981
+ case "memory":
982
+ if isinstance(r_v, int):
983
+ create_options["mem_limit"] = r_v
984
+ create_options["mem_reservation"] = r_v
985
+ create_options["memswap_limit"] = r_v
986
+ elif isinstance(r_v, str):
987
+ v = r_v.lower().removesuffix("i")
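+                                 # "512Mi" becomes "512m", a suffix the
+                                 # Docker-compatible API accepts.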
988
+ create_options["mem_limit"] = v
989
+ create_options["mem_reservation"] = v
990
+ create_options["memswap_limit"] = v
991
+ case _:
992
+ if r_k in r_k_runtime_env:
993
+ # Set env if resource key is mapped.
994
+ runtime_env = [r_k_runtime_env[r_k]]
995
+ elif (
996
+ r_k == envs.GPUSTACK_RUNTIME_DEPLOY_AUTOMAP_RESOURCE_KEY
997
+ ):
998
+ # Set env if auto-mapping key is matched.
999
+ runtime_env = list(vd_env.keys())
1000
+ else:
1001
+ continue
1002
+
1003
+ if r_k in r_k_backend_env:
1004
+ # Set env if resource key is mapped.
1005
+ backend_env = r_k_backend_env[r_k]
1006
+ else:
1007
+ # Otherwise, use the default backend env names.
1008
+                                 backend_env = reduce(
1009
+                                     operator.add,
1010
+                                     list(vd_env.values()),
1011
+                                     [],
+                                 )
1012
+
1013
+ privileged = create_options.get("privileged", False)
1014
+
1015
+ # Configure device access environment variable.
1016
+ if r_v == "all" and backend_env:
1017
+ # Configure privileged if requested all devices.
1018
+ create_options["privileged"] = True
1019
+ # Then, set container backend visible devices env to all devices,
1020
+ # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
1021
+ # and mount corresponding libs if needed.
1022
+ for re in runtime_env:
1023
+ # Request device via CDI.
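+                             # CDI device names take the form
+                             # "vendor.com/class=id", e.g. "nvidia.com/gpu=0".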
1024
+ rv = [
1025
+ f"{vd_cdis[re]}={v}"
1026
+ for v in (vd_values.get(re) or ["all"])
1027
+ ]
1028
+ if "devices" not in create_options:
1029
+ create_options["devices"] = []
1030
+ create_options["devices"].extend(rv)
1031
+ else:
1032
+ # Set env to the allocated device IDs if no privileged,
1033
+ # otherwise, set container backend visible devices env to all devices,
1034
+ # so that the container backend (e.g., NVIDIA Container Toolkit) can handle it,
1035
+ # and mount corresponding libs if needed.
1036
+ for re in runtime_env:
1037
+ # Request device via CDI.
1038
+ if not privileged:
1039
+ rv = [
1040
+ f"{vd_cdis[re]}={v.strip()}"
1041
+ for v in r_v.split(",")
1042
+ ]
1043
+ else:
1044
+ rv = [
1045
+ f"{vd_cdis[re]}={v}"
1046
+ for v in (vd_values.get(re) or ["all"])
1047
+ ]
1048
+ if "devices" not in create_options:
1049
+ create_options["devices"] = []
1050
+ create_options["devices"].extend(rv)
1051
+
1052
+ # Configure runtime device access environment variables.
1053
+ if r_v != "all" and privileged:
1054
+ for be in backend_env:
1055
+ create_options["environment"][be] = (
1056
+ self.align_backend_visible_devices_env_values(
1057
+ be,
1058
+ str(r_v),
1059
+ )
1060
+ )
1061
+
1062
+ # Configure affinity if applicable.
1063
+ if (
1064
+ envs.GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1065
+ or envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1066
+ ):
1067
+ cpus, numas = self.get_visible_devices_affinities(
1068
+ runtime_env,
1069
+ r_v,
1070
+ )
1071
+ if cpus:
1072
+ create_options["cpuset_cpus"] = cpus
1073
+ if numas and envs.GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY:
1074
+ create_options["cpuset_mems"] = numas
1075
+
1076
+ # Parameterize mounts.
1077
+ self._append_container_mounts(
1078
+ create_options,
1079
+ c,
1080
+ ci,
1081
+ ephemeral_filename_mapping,
1082
+ ephemeral_volume_name_mapping,
1083
+ )
1084
+
1085
+ if envs.GPUSTACK_RUNTIME_PODMAN_MUTE_ORIGINAL_HEALTHCHECK:
1086
+ create_options["healthcheck"] = {
1087
+ "test": [
1088
+ "NONE",
1089
+ ],
1090
+ }
1091
+
1092
+ # Parameterize health checks.
1093
+             # Since Podman only supports one complete check,
1094
+             # we always pick the first check as the target.
1095
+ if c.profile == ContainerProfileEnum.RUN and c.checks:
1096
+ # If the first check is teardown-enabled,
1097
+ # enable auto-heal for the container.
1098
+ if c.checks[0].teardown:
1099
+ create_options["labels"][
1100
+ f"{_LABEL_COMPONENT_HEAL_PREFIX}-{workload.name}"
1101
+ ] = "true"
1102
+
1103
+ create_options["healthcheck"] = self._parameterize_healthcheck(
1104
+ c.checks[0],
1105
+ )
1106
+
1107
+ # Create the container.
1108
+ try:
1109
+ if c.profile == ContainerProfileEnum.RUN:
1110
+ create_options = self._mutate_create_options(create_options)
1111
+ if envs.GPUSTACK_RUNTIME_DEPLOY_PRINT_CONVERSION:
1112
+ clogger.info(
1113
+ f"Creating container %s with options:{os.linesep}%s",
1114
+ container_name,
1115
+ safe_json(create_options, indent=2),
1116
+ )
1117
+
1118
+ with patch.object(
1119
+ CreateMixin,
1120
+ "_render_payload",
1121
+ staticmethod(patch_render_payload),
1122
+ ):
1123
+ d_container = self._client.containers.create(
1124
+ image=self._get_image(c.image, c.image_pull_policy),
1125
+ detach=detach,
1126
+ **create_options,
1127
+ )
1128
+ except podman.errors.APIError as e:
1129
+ msg = f"Failed to create container {container_name}{_detail_api_call_error(e)}"
1130
+ raise OperationError(msg) from e
1131
+ else:
1132
+ if c.profile == ContainerProfileEnum.INIT:
1133
+ d_init_containers.append(d_container)
1134
+ else:
1135
+ d_run_containers.append(d_container)
1136
+
1137
+ return d_init_containers, d_run_containers
1138
+
1139
+ @staticmethod
1140
+ def _start_containers(
1141
+ container: podman.domain.containers.Container
1142
+ | list[podman.domain.containers.Container],
1143
+ force: bool = True,
1144
+ ):
1145
+ """
1146
+ Start or restart the container(s) based on their current status.
1147
+
1148
+ Args:
1149
+ container:
1150
+ A Podman container or a list of Podman containers to start or restart.
1151
+ force:
1152
+ To force restart or unpause the container if it's in exited or paused status.
1153
+
1154
+ Raises:
1155
+ podman.errors.APIError:
1156
+ If the container fails to start or restart.
1157
+
1158
+ """
1159
+ if isinstance(container, list):
1160
+ for c in container:
1161
+                 PodmanDeployer._start_containers(c, force)
1162
+ return
1163
+
1164
+ match container.status:
1165
+ case "created":
1166
+ container.start()
1167
+ case "exited" | "stopped":
1168
+ if force:
1169
+ container.restart()
1170
+ case "paused":
1171
+ if force:
1172
+ container.unpause()
1173
+
1174
+ def __init__(self):
1175
+ super().__init__(_NAME)
1176
+ self._client = self._get_client()
1177
+ self._container_ephemeral_files_dir = (
1178
+ envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR
1179
+ )
1180
+
1181
+ def _prepare_create(self):
1182
+ """
1183
+ Prepare for creation.
1184
+
1185
+ """
1186
+ # Prepare mirrored deployment if enabled.
1187
+ if self._mutate_create_options:
1188
+ return
1189
+ self._mutate_create_options = lambda o: o
1190
+ if not envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
1191
+ logger.debug("Mirrored deployment disabled")
1192
+ return
1193
+
1194
+ # Retrieve self-container info.
1195
+ ## - Get Container name, default to hostname if not set.
1196
+ self_container_id = envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
1197
+ if not self_container_id:
1198
+ self_container_id = socket.gethostname()
1199
+ logger.warning(
1200
+                 "Mirrored deployment enabled, but no Container name is set, using hostname (%s) instead",
1201
+ self_container_id,
1202
+ )
1203
+ try:
1204
+ self_container = self._find_self_container(self_container_id)
1205
+ logger.info(
1206
+ "Mirrored deployment enabled, using self Container %s for options mirroring",
1207
+ self_container.id[:12],
1208
+ )
1209
+ self_image = self_container.image
1210
+ except podman.errors.APIError:
1211
+ logger.exception(
1212
+ "Mirrored deployment enabled, but failed to get self Container %s, skipping",
1213
+ self_container_id,
1214
+ )
1215
+ return
1216
+
1217
+ # Process mirrored deployment options.
1218
+ ## - Container runtime
1219
+ mirrored_runtime: str = self_container.attrs["HostConfig"].get("Runtime", "")
1220
+ ## - Container customized envs
1221
+ self_container_envs: dict[str, str] = dict(
1222
+ item.split("=", 1) for item in self_container.attrs["Config"].get("Env", [])
1223
+ )
1224
+ self_image_envs: dict[str, str] = dict(
1225
+ item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
1226
+ )
1227
+ mirrored_envs: dict[str, str] = {
1228
+ # Filter out gpustack-internal envs and same-as-image envs.
1229
+ k: v
1230
+ for k, v in self_container_envs.items()
1231
+ if (
1232
+ not k.startswith("GPUSTACK_")
1233
+ and (k not in self_image_envs or v != self_image_envs[k])
1234
+ )
1235
+ }
1236
+ if igs := envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_ENVIRONMENTS:
1237
+ mirrored_envs = {
1238
+ # Filter out ignored envs.
1239
+ k: v
1240
+ for k, v in mirrored_envs.items()
1241
+ if k not in igs
1242
+ }
1243
+ ## - Container customized mounts
1244
+ mirrored_mounts: list[dict[str, Any]] = [
1245
+ # Always filter out Podman Socket mount.
1246
+ m
1247
+ for m in (self_container.attrs["Mounts"] or [])
1248
+             if not m.get("Destination", "").endswith("/podman.sock")
1249
+ ]
1250
+ if igs := envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES:
1251
+ mirrored_mounts = [
1252
+ # Filter out ignored volume mounts.
1253
+ m
1254
+ for m in mirrored_mounts
1255
+ if m.get("Destination") not in igs
1256
+ ]
1257
+ ## - Container customized devices
1258
+ mirrored_devices: list[dict[str, Any]] = (
1259
+ self_container.attrs["HostConfig"].get("Devices") or []
1260
+ )
1261
+ if igs := envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES:
1262
+ mirrored_devices = [
1263
+ # Filter out ignored device mounts.
1264
+ d
1265
+ for d in mirrored_devices
1266
+ if d.get("PathInContainer") not in igs
1267
+ ]
1268
+         ## - Container customized device requests are not mirrored.
1270
+ ## - Container capabilities
1271
+ mirrored_capabilities: dict[str, list[str]] = {}
1272
+ if cap := self_container.attrs["HostConfig"].get("CapAdd"):
1273
+ mirrored_capabilities["add"] = cap
1274
+ if cap := self_container.attrs["HostConfig"].get("CapDrop"):
1275
+ mirrored_capabilities["drop"] = cap
1276
+ ## - Container group_adds
1277
+ mirrored_group_adds: list[str] = (
1278
+ self_container.attrs["HostConfig"].get("GroupAdd") or []
1279
+ )
1280
+
1281
+ # Construct mutation function.
1282
+ def mutate_create_options(create_options: dict[str, Any]) -> dict[str, Any]:
1283
+ if create_options.get("name", "").endswith("-pause"):
1284
+ return create_options
1285
+
1286
+ if mirrored_runtime and "runtime" not in create_options:
1287
+ create_options["runtime"] = mirrored_runtime
1288
+
1289
+ if mirrored_envs:
1290
+ c_envs: dict[str, str] = create_options.get("environment", {})
1291
+ for k, v in mirrored_envs.items():
1292
+ if k not in c_envs:
1293
+ c_envs[k] = v
1294
+ create_options["environment"] = c_envs
1295
+
1296
+ if mirrored_mounts:
1297
+ c_mounts: list[dict[str, Any]] = create_options.get("mounts") or []
1298
+                 c_mounts_paths = {m.get("target") for m in c_mounts}
1299
+ for m in mirrored_mounts:
1300
+ if m.get("Destination") in c_mounts_paths:
1301
+ continue
1302
+ type_ = m.get("Type", "volume")
1303
+ source = m.get("Source")
1304
+ if type_ == "volume":
1305
+ source = m.get("Name")
1306
+ target = m.get("Destination")
1307
+ read_only = (
1308
+ m.get("Mode", "") in ("ro", "readonly")
1309
+ or m.get("RW", True) is False
1310
+ )
1311
+ propagation = (
1312
+ m.get("Propagation") if m.get("Propagation", "") else None
1313
+ )
1314
+ c_mounts.append(
1315
+ {
1316
+ "type": type_,
1317
+ "source": source,
1318
+ "target": target,
1319
+ "read_only": read_only,
1320
+ "propagation": propagation,
1321
+ },
1322
+ )
1323
+ c_mounts_paths.add(target)
1324
+ create_options["mounts"] = c_mounts
1325
+
1326
+ if mirrored_devices:
1327
+ c_devices: list[dict[str, Any]] = []
1328
+ for c_device in create_options.get("devices") or []:
1329
+ sp = c_device.split(":")
1330
+                 c_devices.append(
1331
+ {
1332
+ "PathOnHost": sp[0],
1333
+ "PathInContainer": sp[1] if len(sp) > 1 else sp[0],
1334
+ "CgroupPermissions": sp[2] if len(sp) > 2 else "rwm",
1335
+ },
1336
+ )
1337
+ c_devices_paths = {d.get("PathInContainer") for d in c_devices}
1338
+ for d in mirrored_devices:
1339
+ if d.get("PathInContainer") in c_devices_paths:
1340
+ continue
1341
+ c_devices.append(d)
1342
+ c_devices_paths.add(d.get("PathInContainer"))
1343
+ create_options["devices"] = [
1344
+ f"{d['PathOnHost']}:{d['PathInContainer']}:{d['CgroupPermissions']}"
1345
+ for d in c_devices
1346
+ ]
1347
+
1348
+ if mirrored_capabilities:
1349
+                 if "add" in mirrored_capabilities:
1350
+ c_cap_add: list[str] = create_options.get("cap_add", [])
1351
+ for c_cap in mirrored_capabilities["add"]:
1352
+ if c_cap not in c_cap_add:
1353
+ c_cap_add.append(c_cap)
1354
+ create_options["cap_add"] = c_cap_add
1355
+                 if "drop" in mirrored_capabilities:
1356
+ c_cap_drop: list[str] = create_options.get("cap_drop", [])
1357
+ for c_cap in mirrored_capabilities["drop"]:
1358
+ if c_cap not in c_cap_drop:
1359
+ c_cap_drop.append(c_cap)
1360
+ create_options["cap_drop"] = c_cap_drop
1361
+
1362
+ if mirrored_group_adds:
1363
+ c_group_adds: list[str] = create_options.get("group_add", [])
1364
+ for c_ga in mirrored_group_adds:
1365
+ if c_ga not in c_group_adds:
1366
+ c_group_adds.append(c_ga)
1367
+ create_options["group_add"] = c_group_adds
1368
+
1369
+ return create_options
1370
+
1371
+ self._mutate_create_options = mutate_create_options
1372
+
1373
+ # Extract ephemeral files dir mutation if any.
1374
+ if mirrored_mounts:
1375
+ e_target = str(envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR)
1376
+ b_source = ""
1377
+ b_target = ""
1378
+ for m in mirrored_mounts:
1379
+ c_target = m.get("Destination", "///")
1380
+ if (
1381
+ e_target == c_target or e_target.startswith(f"{c_target}/")
1382
+ ) and len(c_target) >= len(b_target):
1383
+ b_source = m.get("Source")
1384
+ b_target = c_target
1385
+ if b_source:
1386
+ result = Path(b_source)
1387
+ if e_target != b_target:
1388
+ b_subpath = e_target.removeprefix(b_target)
1389
+ result = result.joinpath(b_subpath.lstrip("/"))
1390
+ self._container_ephemeral_files_dir = result
1391
+
1392
+ def _find_self_container(
1393
+ self,
1394
+ self_container_id: str,
1395
+ ) -> podman.domain.containers.Container:
1396
+ """
1397
+ Find the current container if running inside a Podman container.
1398
+
1399
+ Args:
1400
+ self_container_id:
1401
+ The container name or ID to find.
1402
+
1403
+ Returns:
1404
+             The matching Podman container.
1405
+
1406
+ Raises:
1407
+             podman.errors.NotFound:
+                 If the container cannot be found or uniquely identified.
1408
+
1409
+ """
1410
+ if envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME:
1411
+ # Directly get container by name or ID.
1412
+ return self._client.containers.get(self_container_id)
1413
+
1414
+         # Find containers that match the hostname.
1415
+ containers: list[podman.domain.containers.Container] = []
1416
+ for c in self._client.containers.list(compatible=True):
1417
+             # Ignore workload containers (with host network they may share the hostname).
1418
+ if _LABEL_WORKLOAD in c.labels:
1419
+ continue
1420
+ # Ignore containers that do not match the hostname.
1421
+ if c.attrs["Config"].get("Hostname", "") != self_container_id:
1422
+ continue
1423
+ # Ignore containers that do not match the filter labels.
1424
+ if envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS and any(
1425
+ c.labels.get(k) != v
1426
+ for k, v in envs.GPUSTACK_RUNTIME_PODMAN_MIRRORED_NAME_FILTER_LABELS.items()
1427
+ ):
1428
+ continue
1429
+ containers.append(c)
1430
+
1431
+ # Validate found containers.
1432
+ if len(containers) != 1:
1433
+             msg = (
1434
+                 f"Found multiple Containers with the same hostname {self_container_id}, "
1435
+                 if len(containers) > 1
1436
+                 else f"No Container found with hostname {self_container_id}, "
1437
+             )
1438
+             msg += "please use `--env GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=...` to specify the exact container name"
1439
+ raise podman.errors.NotFound(msg)
1440
+
1441
+ return containers[0]
1442
+
1443
+ @_supported
1444
+ def _create(self, workload: WorkloadPlan):
1445
+ """
1446
+ Deploy a Podman workload.
1447
+
1448
+ Args:
1449
+ workload:
1450
+ The workload to deploy.
1451
+
1452
+ Raises:
1453
+ TypeError:
1454
+ If the Podman workload type is invalid.
1455
+ ValueError:
1456
+ If the Podman workload fails to validate.
1457
+ UnsupportedError:
1458
+ If Podman is not supported in the current environment.
1459
+ OperationError:
1460
+ If the Podman workload fails to deploy.
1461
+
1462
+ """
1463
+ if not isinstance(workload, PodmanWorkloadPlan | WorkloadPlan):
1464
+ msg = f"Invalid workload type: {type(workload)}"
1465
+ raise TypeError(msg)
1466
+
1467
+ self._prepare_create()
1468
+
1469
+         if not isinstance(workload, PodmanWorkloadPlan):
1470
+ workload = PodmanWorkloadPlan(**workload.__dict__)
1471
+ workload.validate_and_default()
1472
+ if logger.isEnabledFor(logging.DEBUG):
1473
+ logger.debug("Creating workload:\n%s", workload.to_yaml())
1474
+
1475
+ # Create ephemeral file if needed,
1476
+ # (container index, configured path): <actual filename>
1477
+         ephemeral_filename_mapping: dict[tuple[int, str], str] = (
1478
+ self._create_ephemeral_files(workload)
1479
+ )
1480
+
1481
+ # Create ephemeral volumes if needed,
1482
+ # <configured volume name>: <actual volume name>
1483
+ ephemeral_volume_name_mapping: dict[str, str] = self._create_ephemeral_volumes(
1484
+ workload,
1485
+ )
1486
+
1487
+ # Create pause container.
1488
+ pause_container = self._create_pause_container(workload)
1489
+
1490
+ # Create init/run containers.
1491
+ init_containers, run_containers = self._create_containers(
1492
+ workload,
1493
+ ephemeral_filename_mapping,
1494
+ ephemeral_volume_name_mapping,
1495
+ pause_container,
1496
+ )
1497
+
1498
+ # Create unhealthy restart container if needed.
1499
+ unhealthy_restart_container = self._create_unhealthy_restart_container(workload)
1500
+
1501
+ # Start containers in order: pause -> init(s) -> run(s) -> unhealthy restart
1502
+ try:
1503
+ self._start_containers(pause_container)
1504
+ self._start_containers(init_containers, force=False)
1505
+ self._start_containers(run_containers)
1506
+ if unhealthy_restart_container:
1507
+ self._start_containers(unhealthy_restart_container)
1508
+ except podman.errors.APIError as e:
1509
+ msg = (
1510
+ f"Failed to create workload {workload.name}{_detail_api_call_error(e)}"
1511
+ )
1512
+ raise OperationError(msg) from e
1513
+
1514
+ @_supported
1515
+ def _get(
1516
+ self,
1517
+ name: WorkloadName,
1518
+ namespace: WorkloadNamespace | None = None, # noqa: ARG002
1519
+ ) -> WorkloadStatus | None:
1520
+ """
1521
+ Get the status of a Podman workload.
1522
+
1523
+ Args:
1524
+ name:
1525
+ The name of the workload.
1526
+ namespace:
1527
+ The namespace of the workload.
1528
+
1529
+ Returns:
1530
+ The status if found, None otherwise.
1531
+
1532
+ Raises:
1533
+ UnsupportedError:
1534
+ If Podman is not supported in the current environment.
1535
+ OperationError:
1536
+                 If the Podman workload cannot be retrieved.
1537
+
1538
+ """
1539
+ list_options = {
1540
+ "filters": [
1541
+ f"label={_LABEL_WORKLOAD}={name}",
1542
+ f"label={_LABEL_COMPONENT}",
1543
+ ],
1544
+ }
1545
+
1546
+ try:
1547
+ d_containers = self._client.containers.list(
1548
+ compatible=True,
1549
+ all=True,
1550
+ **list_options,
1551
+ )
1552
+ except podman.errors.APIError as e:
1553
+ msg = f"Failed to list containers for workload {name}{_detail_api_call_error(e)}"
1554
+ raise OperationError(msg) from e
1555
+
1556
+ if not d_containers:
1557
+ return None
1558
+
1559
+ return PodmanWorkloadStatus(
1560
+ name=name,
1561
+ d_containers=d_containers,
1562
+ )
1563
+
1564
+ @_supported
1565
+ def _delete(
1566
+ self,
1567
+ name: WorkloadName,
1568
+ namespace: WorkloadNamespace | None = None,
1569
+ ) -> WorkloadStatus | None:
1570
+ """
1571
+ Delete a Podman workload.
1572
+
1573
+ Args:
1574
+ name:
1575
+ The name of the workload.
1576
+ namespace:
1577
+ The namespace of the workload.
1578
+
1579
+         Returns:
1580
+ The status if found, None otherwise.
1581
+
1582
+ Raises:
1583
+ UnsupportedError:
1584
+ If Podman is not supported in the current environment.
1585
+ OperationError:
1586
+                 If the Podman workload cannot be deleted.
1587
+
1588
+ """
1589
+ # Check if the workload exists.
1590
+ workload = self.get(name=name, namespace=namespace)
1591
+ if not workload:
1592
+ return None
1593
+
1594
+ # Remove all containers with the workload label.
1595
+ try:
1596
+ d_containers = getattr(workload, "_d_containers", [])
1597
+ # Remove non-pause containers first.
1598
+ for c in d_containers:
1599
+                 if c.labels.get(_LABEL_COMPONENT) != "pause":
1600
+ c.remove(
1601
+ force=True,
1602
+ )
1603
+ # Then remove pause containers.
1604
+ for c in d_containers:
1605
+                 if c.labels.get(_LABEL_COMPONENT) == "pause":
1606
+ c.remove(
1607
+ force=True,
1608
+ )
1609
+ except podman.errors.APIError as e:
1610
+ msg = f"Failed to delete containers for workload {name}{_detail_api_call_error(e)}"
1611
+ raise OperationError(msg) from e
1612
+
1613
+ # Remove all ephemeral volumes with the workload label.
1614
+ try:
1615
+ list_options = {
1616
+ "filters": [
1617
+ f"label={_LABEL_WORKLOAD}={name}",
1618
+ ],
1619
+ }
1620
+
1621
+ d_volumes = self._client.volumes.list(
1622
+ compatible=True,
1623
+ **list_options,
1624
+ )
1625
+
1626
+ for v in d_volumes:
1627
+ v.remove(
1628
+ force=True,
1629
+ )
1630
+ except podman.errors.APIError as e:
1631
+ msg = f"Failed to delete volumes for workload {name}{_detail_api_call_error(e)}"
1632
+ raise OperationError(msg) from e
1633
+
1634
+ # Remove all ephemeral files for the workload.
1635
+ try:
1636
+ for fp in envs.GPUSTACK_RUNTIME_PODMAN_EPHEMERAL_FILES_DIR.glob(
1637
+ f"{name}-*",
1638
+ ):
1639
+ if fp.is_file():
1640
+ fp.unlink(missing_ok=True)
1641
+ except OSError as e:
1642
+ msg = f"Failed to delete ephemeral files for workload {name}"
1643
+ raise OperationError(msg) from e
1644
+
1645
+ return workload
1646
+
1647
+ @_supported
1648
+ def _list(
1649
+ self,
1650
+ namespace: WorkloadNamespace | None = None, # noqa: ARG002
1651
+ labels: dict[str, str] | None = None,
1652
+ ) -> list[WorkloadStatus]:
1653
+ """
1654
+ List all Podman workloads.
1655
+
1656
+ Args:
1657
+ namespace:
1658
+ The namespace of the workloads.
1659
+ labels:
1660
+ Labels to filter workloads.
1661
+
1662
+ Returns:
1663
+ A list of workload statuses.
1664
+
1665
+ Raises:
1666
+ UnsupportedError:
1667
+ If Podman is not supported in the current environment.
1668
+ OperationError:
1669
+                 If the Podman workloads cannot be listed.
1670
+
1671
+ """
1672
+ list_options = {
1673
+ "filters": [
1674
+ *[
1675
+ f"label={k}={v}"
1676
+ for k, v in (labels or {}).items()
1677
+ if k
1678
+ not in (
1679
+ _LABEL_WORKLOAD,
1680
+ _LABEL_COMPONENT,
1681
+ _LABEL_COMPONENT_INDEX,
1682
+ )
1683
+ ],
1684
+ f"label={_LABEL_WORKLOAD}",
1685
+ f"label={_LABEL_COMPONENT}",
1686
+ ],
1687
+ }
1688
+
1689
+ try:
1690
+ d_containers = self._client.containers.list(
1691
+ compatible=True,
1692
+ all=True,
1693
+ **list_options,
1694
+ )
1695
+ except podman.errors.APIError as e:
1696
+ msg = f"Failed to list workloads' containers{_detail_api_call_error(e)}"
1697
+ raise OperationError(msg) from e
1698
+
1699
+ # Group containers by workload name,
1700
+ # <workload name>: [podman.domain.containers.Container, ...]
1701
+ workload_mapping: dict[str, list[podman.domain.containers.Container]] = {}
1702
+ for c in d_containers:
1703
+ n = c.labels.get(_LABEL_WORKLOAD, None)
1704
+ if not n:
1705
+ continue
1706
+ if n not in workload_mapping:
1707
+ workload_mapping[n] = []
1708
+ workload_mapping[n].append(c)
1709
+
1710
+ return [
1711
+ PodmanWorkloadStatus(
1712
+ name=name,
1713
+ d_containers=d_containers,
1714
+ )
1715
+ for name, d_containers in workload_mapping.items()
1716
+ ]
1717
+
1718
+ @_supported
1719
+ def _logs(
1720
+ self,
1721
+ name: WorkloadName,
1722
+ namespace: WorkloadNamespace | None = None,
1723
+ token: WorkloadOperationToken | None = None,
1724
+ timestamps: bool = False,
1725
+ tail: int | None = None,
1726
+ since: int | None = None,
1727
+ follow: bool = False,
1728
+ ) -> Generator[bytes | str, None, None] | bytes | str:
1729
+ """
1730
+ Get logs of a Podman workload or a specific container.
1731
+
1732
+ Args:
1733
+ name:
1734
+ The name of the workload.
1735
+ namespace:
1736
+ The namespace of the workload.
1737
+ token:
1738
+ The operation token representing a specific container ID.
1739
+ If None, fetch logs from the main RUN container of the workload.
1740
+ timestamps:
1741
+ Whether to include timestamps in the logs.
1742
+ tail:
1743
+ Number of lines from the end of the logs to show. If None, show all logs.
1744
+ since:
1745
+ Show logs since this time (in seconds since epoch). If None, show all logs.
1746
+ follow:
1747
+ Whether to stream the logs in real-time.
1748
+
1749
+ Returns:
1750
+             The logs as bytes or a string, or, if follow is True, a generator yielding bytes or strings.
1751
+
1752
+ Raises:
1753
+ UnsupportedError:
1754
+ If Podman is not supported in the current environment.
1755
+ OperationError:
1756
+ If the Podman workload fails to fetch logs.
1757
+
1758
+ """
1759
+ workload = self.get(name=name, namespace=namespace)
1760
+ if not workload:
1761
+ msg = f"Workload {name} not found"
1762
+ raise OperationError(msg)
1763
+
1764
+ d_containers = getattr(workload, "_d_containers", [])
1765
+ container = next(
1766
+ (
1767
+ c
1768
+ for c in d_containers
1769
+ if (
1770
+ c.id == token
1771
+ if token
1772
+ else c.labels.get(_LABEL_COMPONENT_INDEX) == "0"
1773
+ )
1774
+ ),
1775
+ None,
1776
+ )
1777
+ if not container:
1778
+ msg = f"Loggable container of workload {name} not found"
1779
+ if token:
1780
+ msg += f" with token {token}"
1781
+ raise OperationError(msg)
1782
+
1783
+ logs_options = {
1784
+ "timestamps": timestamps,
1785
+ "tail": tail,
1786
+ "since": since,
1787
+ "follow": follow,
1788
+ }
1789
+
1790
+ try:
1791
+ output = container.logs(
1792
+ stream=follow,
1793
+ **logs_options,
1794
+ )
1795
+ except podman.errors.APIError as e:
1796
+ msg = f"Failed to fetch logs for container {container.name} of workload {name}{_detail_api_call_error(e)}"
1797
+ raise OperationError(msg) from e
1798
+ else:
1799
+ return output
1800
+
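A minimal consumption sketch for the logs path; the deployer class name and
the direct call to the underscore-prefixed method are assumptions for
illustration, not API taken from this file:

    deployer = PodmanDeployer()  # assumed class name
    out = deployer._logs(name="demo", tail=100, follow=True)
    if isinstance(out, (bytes, str)):
        print(out)
    else:
        # With follow=True the client yields chunks as they arrive.
        for chunk in out:
            text = chunk.decode() if isinstance(chunk, bytes) else chunk
            print(text, end="", flush=True)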
1801
+ @_supported
1802
+ def _exec(
1803
+ self,
1804
+ name: WorkloadName,
1805
+ namespace: WorkloadNamespace | None = None,
1806
+ token: WorkloadOperationToken | None = None,
1807
+ detach: bool = True,
1808
+ command: list[str] | None = None,
1809
+ args: list[str] | None = None,
1810
+ ) -> WorkloadExecStream | bytes | str:
1811
+ """
1812
+ Execute a command in a Podman workload or a specific container.
1813
+
1814
+ Args:
1815
+ name:
1816
+ The name of the workload.
1817
+ namespace:
1818
+ The namespace of the workload.
1819
+ token:
1820
+ The operation token representing a specific container ID.
1821
+ If None, execute in the main RUN container of the workload.
1822
+ detach:
1823
+ Whether to run the command detached (no interactive stream); ignored when no command is given.
1824
+ command:
1825
+ The command to execute. If None, defaults to "/bin/sh".
1826
+ args:
1827
+ Additional arguments for the command.
1828
+
1829
+ Returns:
1830
+ If detach is False or no command is given, a WorkloadExecStream;
1831
+ otherwise, the output of the command as a byte string or string.
1832
+
1833
+ Raises:
1834
+ UnsupportedError:
1835
+ If Podman is not supported in the current environment.
1836
+ OperationError:
1837
+ If the Podman workload fails to execute the command.
1838
+
1839
+ """
1840
+ workload = self.get(name=name, namespace=namespace)
1841
+ if not workload:
1842
+ msg = f"Workload {name} not found"
1843
+ raise OperationError(msg)
1844
+
1845
+ d_containers = getattr(workload, "_d_containers", [])
1846
+ container = next(
1847
+ (
1848
+ c
1849
+ for c in d_containers
1850
+ if (c.id == token if token else c.labels.get(_LABEL_COMPONENT) == "run")
1851
+ ),
1852
+ None,
1853
+ )
1854
+ if not container:
1855
+ msg = f"Executable container of workload {name} not found"
1856
+ if token:
1857
+ msg += f" with token {token}"
1858
+ raise OperationError(msg)
1859
+
1860
+ attach = not detach or not command
1861
+ exec_options = {
1862
+ "stdout": True,
1863
+ "stderr": True,
1864
+ "stdin": attach,
1865
+ "socket": attach,
1866
+ "tty": attach,
1867
+ "cmd": [*command, *(args or [])] if command else ["/bin/sh"],
1868
+ }
1869
+
1870
+ try:
1871
+ _status_code, output = container.exec_run(
1872
+ detach=False,
1873
+ **exec_options,
1874
+ )
1875
+ except podman.errors.APIError as e:
1876
+ msg = f"Failed to exec command in container {container.name} of workload {name}{_detail_api_call_error(e)}"
1877
+ raise OperationError(msg) from e
1878
+ else:
1879
+ if not attach:
1880
+ return output
1881
+ return PodmanWorkloadExecStream(output)
1882
+
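The attach flag in _exec folds two cases together: an interactive stream is
implied whenever no command is given. Summarized with illustrative commands:

    # detach=True,  command=["nvidia-smi"] -> attach=False -> raw output
    # detach=True,  command=None           -> attach=True  -> stream (/bin/sh)
    # detach=False, command=["cat", "f"]   -> attach=True  -> stream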
1883
+
1884
+ def _has_restart_policy(
1885
+ container: podman.domain.containers.Container,
1886
+ ) -> bool:
1887
+ return (
1888
+ container.attrs["HostConfig"].get("RestartPolicy", {}).get("Name", "no") != "no"
1889
+ )
1890
+
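For reference, _has_restart_policy reads the standard container-inspect
layout; a trimmed sketch with illustrative values:

    attrs = {
        "HostConfig": {
            "RestartPolicy": {"Name": "on-failure", "MaximumRetryCount": 3},
        },
    }
    # A missing key or the name "no" means no restart policy; any other
    # name (e.g. "always", "on-failure") counts as one.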
1891
+
1892
+ class PodmanWorkloadExecStream(WorkloadExecStream):
1893
+ """
1894
+ A WorkloadExecStream implementation for Podman exec socket streams.
1895
+ """
1896
+
1897
+ _sock: socket.SocketIO | None = None
1898
+
1899
+ def __init__(self, sock: socket.SocketIO):
1900
+ super().__init__()
1901
+ self._sock = sock
1902
+
1903
+ @property
1904
+ def closed(self) -> bool:
1905
+ return self._sock is None or self._sock.closed
1906
+
1907
+ def fileno(self) -> int:
1908
+ return self._sock.fileno()
1909
+
1910
+ def read(self, size=-1) -> bytes | None:
1911
+ if self.closed:
1912
+ return None
1913
+ return self._sock.read(size)
1914
+
1915
+ def write(self, data: bytes) -> int:
1916
+ if self.closed:
1917
+ return 0
1918
+ return self._sock.write(data)
1919
+
1920
+ def close(self):
1921
+ if self.closed:
1922
+ return
1923
+ self._sock.close()
1924
+
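Because the stream exposes fileno(), it can be driven by a selector; a
minimal sketch assuming an already-open PodmanWorkloadExecStream named
stream:

    import select
    import sys

    while not stream.closed:
        readable, _, _ = select.select([stream.fileno()], [], [], 1.0)
        if readable:
            data = stream.read(4096)
            if not data:  # None or b"" once the socket is closed
                break
            sys.stdout.buffer.write(data)
    stream.close()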
1925
+
1926
+ def _detail_api_call_error(err: podman.errors.APIError) -> str:
1927
+ """
1928
+ Explain a Podman API error in a concise way,
1929
+ if envs.GPUSTACK_RUNTIME_DEPLOY_API_CALL_ERROR_DETAIL is enabled.
1930
+
1931
+ Args:
1932
+ err:
1933
+ The Podman API error.
1934
+
1935
+ Returns:
1936
+ A concise explanation of the error.
1937
+
1938
+ """
1939
+ if not envs.GPUSTACK_RUNTIME_DEPLOY_API_CALL_ERROR_DETAIL:
1940
+ return ""
1941
+
1942
+ msg = f": Podman {'Client' if err.is_client_error() else 'Server'} Error"
1943
+ if err.explanation:
1944
+ msg += f": {err.explanation}"
1945
+ elif err.response.reason:
1946
+ msg += f": {err.response.reason}"
1947
+ else:
1948
+ msg += f": status code {err.response.status_code}"
1949
+
1950
+ return msg
1951
+
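The suffix produced here takes one of three shapes, depending on which error
fields are populated (examples illustrative):

    # ": Podman Client Error: no such container"      (explanation set)
    # ": Podman Server Error: Internal Server Error"  (response reason only)
    # ": Podman Client Error: status code 404"        (neither available)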
1952
+
1953
+ def _print_pull_logs(logs, image, tag):
1954
+ """
1955
+ Display Podman image pull logs.
1956
+
1957
+ Args:
1958
+ logs:
1959
+ The logs from Podman image pull.
1960
+ image:
1961
+ The image being pulled.
1962
+ tag:
1963
+ The image tag being pulled.
1964
+
1965
+ """
1966
+ if (
1967
+ not envs.GPUSTACK_RUNTIME_PODMAN_IMAGE_NO_PULL_VISUALIZATION
1968
+ and sys.stderr.isatty()
1969
+ ):
1970
+ _visualize_pull_logs(logs, tag)
1971
+ else:
1972
+ _textualize_pull_logs(logs, image, tag)
1973
+
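Both renderers consume docker-compatible pull progress entries; a
representative (illustrative) sequence looks like:

    # {"status": "Pulling from library/busybox", "id": "latest"}
    # {"status": "Downloading", "id": "a1b2c3d4",
    #  "progressDetail": {"current": 539648, "total": 2151424}}
    # {"status": "Pull complete", "id": "a1b2c3d4"}
    # {"status": "Digest: sha256:..."}   (no "id", so it is deferred to dmsgs)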
1974
+
1975
+ def _visualize_pull_logs(logs, tag):
1976
+ """
1977
+ Display Podman image pull logs as progress bars.
1978
+
1979
+ Args:
1980
+ logs:
1981
+ The logs from Podman image pull.
1982
+ tag:
1983
+ The image tag being pulled.
1984
+
1985
+ """
1986
+ pbars: dict[str, tqdm] = {}
1987
+ dmsgs: list[str] = []
1988
+
1989
+ try:
1990
+ for log in logs:
1991
+ id_ = log.get("id", None)
1992
+ status = log.get("status", "")
1993
+ if not id_:
1994
+ dmsgs.append(status)
1995
+ continue
1996
+ if id_ == tag:
1997
+ continue
1998
+
1999
+ progress = log.get("progressDetail", {})
2000
+ progress_total = progress.get("total", None)
2001
+ progress_current = progress.get("current", None)
2002
+
2003
+ if id_ not in pbars:
2004
+ pbars[id_] = tqdm(
2005
+ unit="B",
2006
+ unit_scale=True,
2007
+ desc=f"{id_}: {status}",
2008
+ bar_format="{desc}",
2009
+ )
2010
+ continue
2011
+
2012
+ pbars[id_].desc = f"{id_}: {status}"
2013
+ if progress_total is not None:
2014
+ pbars[id_].total = progress_total
2015
+ bf = "{desc} |{bar}| {n_fmt}/{total_fmt} [{rate_fmt}{postfix}]"
2016
+ pbars[id_].bar_format = bf
2017
+ elif progress_current is not None:
2018
+ pbars[id_].bar_format = "{desc} {n_fmt} [{rate_fmt}{postfix}]"
2019
+ else:
2020
+ pbars[id_].bar_format = "{desc}"
2021
+
2022
+ if progress_current:
2023
+ pbars[id_].n = progress_current
2024
+
2025
+ pbars[id_].refresh()
2026
+ finally:
2027
+ for pbar in pbars.values():
2028
+ pbar.close()
2029
+ pbars.clear()
2030
+
2031
+ for msg in dmsgs:
2032
+ print(msg, flush=True)
2033
+
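The per-layer bars above switch bar_format as detail arrives; the same
pattern in isolation, with assumed values:

    from tqdm import tqdm

    bar = tqdm(unit="B", unit_scale=True,
               desc="a1b2c3d4: Downloading", bar_format="{desc}")
    bar.total = 2151424  # once a total is known, render a real bar
    bar.bar_format = "{desc} |{bar}| {n_fmt}/{total_fmt} [{rate_fmt}{postfix}]"
    bar.n = 539648
    bar.refresh()
    bar.close()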
2034
+
2035
+ def _textualize_pull_logs(logs, image, tag):
2036
+ """
2037
+ Display Podman image pull logs as plain text.
2038
+
2039
+ Args:
2040
+ logs:
2041
+ The logs from Podman image pull.
2042
+ image:
2043
+ The image being pulled.
2044
+ tag:
2045
+ The image tag being pulled.
2046
+
2047
+ """
2048
+ pstats: dict[str, tuple[int, int, int]] = {}
2049
+ dmsgs: list[str] = []
2050
+
2051
+ p_c: int = 0 # bytes cursor
2052
+ p_c_m: int = 1 # bytes cursor move
2053
+ p_c_p: int = 0 # progress cursor
2054
+ p_c_p_m: int = 1 # progress cursor move
2055
+
2056
+ for log in logs:
2057
+ id_ = log.get("id", None)
2058
+ status = log.get("status", "")
2059
+ if not id_:
2060
+ dmsgs.append(status)
2061
+ continue
2062
+ if id_ == tag:
2063
+ continue
2064
+
2065
+ if id_ not in pstats:
2066
+ pstats[id_] = (0, 0, 0)
2067
+ if status in ["Pull complete", "Already exists"]:
2068
+ pstats[id_] = (0, 0, 100)
2069
+ continue
2070
+
2071
+ progress = log.get("progressDetail", {})
2072
+ progress_total = progress.get("total", None)
2073
+ progress_current = progress.get("current", None)
2074
+
2075
+ if progress_total is not None or progress_current is not None:
2076
+ pstats[id_] = (
2077
+ progress_total or 0,
2078
+ progress_current or 0,
2079
+ 0 if not progress_total else (progress_current or 0) * 100 // progress_total,
2080
+ )
2081
+
2082
+ pstats_total, pstats_current, pstats_progress = 0, 0, 0
2083
+ for t, c, p in pstats.values():
2084
+ pstats_total += t
2085
+ pstats_current += c
2086
+ pstats_progress += p
2087
+
2088
+ p_c_d = pstats_current - p_c # bytes cursor delta
2089
+
2090
+ if pstats_total:
2091
+ p_c_p_d = pstats_progress // len(pstats) - p_c_p # progress cursor delta
2092
+ # Update textual progress when:
2093
+ # 1. Progress is not complete yet, and
2094
+ # 2. Progress cursor delta >= progress cursor move, or
2095
+ # 3. Bytes cursor delta >= bytes cursor move.
2096
+ if p_c_p < 100 and (p_c_p_d >= p_c_p_m or p_c_d >= p_c_m):
2097
+ p_c += p_c_d
2098
+ p_c_m = min(200 * _MiB, p_c_m + 2 * _MiB)
2099
+ p_c_p_n = min(p_c_p + p_c_p_d, 100) # progress cursor new
2100
+ # Update progress cursor if it has advanced.
2101
+ if p_c_p_n > p_c_p:
2102
+ p_c_p = p_c_p_n
2103
+ p_c_p_m = min(5, p_c_p_m + 1, 100 - p_c_p)
2104
+ print(f"Pulling image {image}: {p_c_p}%", flush=True)
2105
+ elif pstats_current:
2106
+ # Update textual progress when bytes cursor delta >= bytes cursor move.
2107
+ if p_c_d >= p_c_m:
2108
+ p_c += p_c_d
2109
+ p_c_m = min(200 * _MiB, p_c_m + 2 * _MiB)
2110
+ p_c_h = bytes_to_human_readable(p_c)
2111
+ print(f"Pulling image {image}: {p_c_h}", flush=True)
2112
+
2113
+ for msg in dmsgs:
2114
+ print(msg, flush=True)
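A worked pass through the byte-cursor throttling above, under assumed
download sizes: p_c_m starts at 1 byte, so the first line prints almost
immediately, and each print raises the threshold by 2 MiB (capped at
200 MiB), thinning output as the pull proceeds:

    # 1st line: delta >= 1 B    -> threshold becomes ~2 MiB
    # 2nd line: delta >= ~2 MiB -> threshold becomes ~4 MiB
    # ...
    # steady state: at most one printed line per 200 MiB downloaded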