gpustack-runtime 0.1.40.post1__py3-none-any.whl → 0.1.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/__init__.py +1 -1
- gpustack_runtime/__main__.py +5 -3
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/__init__.py +5 -3
- gpustack_runtime/cmds/__types__.py +1 -1
- gpustack_runtime/cmds/deployer.py +140 -18
- gpustack_runtime/cmds/detector.py +1 -1
- gpustack_runtime/cmds/images.py +1 -1
- gpustack_runtime/deployer/__init__.py +28 -2
- gpustack_runtime/deployer/__patches__.py +1 -1
- gpustack_runtime/deployer/__types__.py +2 -1
- gpustack_runtime/deployer/__utils__.py +2 -2
- gpustack_runtime/deployer/cdi/__init__.py +85 -5
- gpustack_runtime/deployer/cdi/__types__.py +92 -29
- gpustack_runtime/deployer/cdi/__utils__.py +178 -0
- gpustack_runtime/deployer/cdi/amd.py +146 -0
- gpustack_runtime/deployer/cdi/ascend.py +164 -0
- gpustack_runtime/deployer/cdi/hygon.py +147 -0
- gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
- gpustack_runtime/deployer/cdi/metax.py +148 -0
- gpustack_runtime/deployer/cdi/thead.py +57 -23
- gpustack_runtime/deployer/docker.py +9 -8
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +240 -0
- gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +586 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
- gpustack_runtime/deployer/kuberentes.py +37 -4
- gpustack_runtime/deployer/podman.py +9 -8
- gpustack_runtime/detector/__init__.py +42 -5
- gpustack_runtime/detector/__types__.py +8 -24
- gpustack_runtime/detector/__utils__.py +46 -39
- gpustack_runtime/detector/amd.py +55 -66
- gpustack_runtime/detector/ascend.py +29 -41
- gpustack_runtime/detector/cambricon.py +3 -3
- gpustack_runtime/detector/hygon.py +21 -49
- gpustack_runtime/detector/iluvatar.py +44 -60
- gpustack_runtime/detector/metax.py +54 -37
- gpustack_runtime/detector/mthreads.py +74 -36
- gpustack_runtime/detector/nvidia.py +130 -93
- gpustack_runtime/detector/pyacl/__init__.py +1 -1
- gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
- gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
- gpustack_runtime/detector/pycuda/__init__.py +1 -1
- gpustack_runtime/detector/pydcmi/__init__.py +1 -1
- gpustack_runtime/detector/pyhsa/__init__.py +1 -1
- gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
- gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
- gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
- gpustack_runtime/detector/thead.py +41 -60
- gpustack_runtime/envs.py +104 -12
- gpustack_runtime/logging.py +6 -2
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/METADATA +6 -1
- gpustack_runtime-0.1.41.dist-info/RECORD +67 -0
- gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
- gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
- gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
- gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,8 +11,10 @@ from .__types__ import (
|
|
|
11
11
|
ConfigContainerEdits,
|
|
12
12
|
ConfigDevice,
|
|
13
13
|
Generator,
|
|
14
|
-
|
|
14
|
+
manufacturer_to_cdi_kind,
|
|
15
|
+
manufacturer_to_runtime_env,
|
|
15
16
|
)
|
|
17
|
+
from .__utils__ import device_to_cdi_device_node
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
class THeadGenerator(Generator):
|
|
@@ -23,13 +25,20 @@ class THeadGenerator(Generator):
|
|
|
23
25
|
def __init__(self):
|
|
24
26
|
super().__init__(ManufacturerEnum.THEAD)
|
|
25
27
|
|
|
26
|
-
def generate(
|
|
28
|
+
def generate(
|
|
29
|
+
self,
|
|
30
|
+
devices: Devices | None = None,
|
|
31
|
+
include_all_devices: bool = True,
|
|
32
|
+
) -> Config | None:
|
|
27
33
|
"""
|
|
28
34
|
Generate the CDI configuration for T-Head devices.
|
|
29
35
|
|
|
30
36
|
Args:
|
|
31
|
-
devices:
|
|
32
|
-
|
|
37
|
+
devices:
|
|
38
|
+
The detected devices.
|
|
39
|
+
If None, all available devices are considered.
|
|
40
|
+
include_all_devices:
|
|
41
|
+
Whether to include a device entry that represents all T-Head devices.
|
|
33
42
|
|
|
34
43
|
Returns:
|
|
35
44
|
The Config object, or None if not supported.
|
|
@@ -46,30 +55,44 @@ class THeadGenerator(Generator):
|
|
|
46
55
|
if not devices:
|
|
47
56
|
return None
|
|
48
57
|
|
|
49
|
-
kind =
|
|
58
|
+
kind = manufacturer_to_cdi_kind(self.manufacturer)
|
|
50
59
|
if not kind:
|
|
51
60
|
return None
|
|
52
61
|
|
|
53
62
|
cdi_devices: list[ConfigDevice] = []
|
|
54
63
|
|
|
55
|
-
|
|
64
|
+
common_device_nodes = []
|
|
65
|
+
for p in [
|
|
56
66
|
"/dev/alixpu",
|
|
57
67
|
"/dev/alixpu_ctl",
|
|
58
|
-
]
|
|
68
|
+
]:
|
|
69
|
+
cdn = device_to_cdi_device_node(
|
|
70
|
+
path=p,
|
|
71
|
+
)
|
|
72
|
+
if cdn:
|
|
73
|
+
common_device_nodes.append(cdn)
|
|
74
|
+
if not common_device_nodes:
|
|
75
|
+
return None
|
|
76
|
+
|
|
77
|
+
all_device_nodes = []
|
|
78
|
+
|
|
59
79
|
for dev in devices:
|
|
60
80
|
if not dev:
|
|
61
81
|
continue
|
|
62
|
-
|
|
63
|
-
|
|
82
|
+
|
|
83
|
+
container_device_nodes = []
|
|
84
|
+
|
|
85
|
+
cdn = device_to_cdi_device_node(
|
|
86
|
+
path=f"/dev/alixpu_ppu{dev.index}",
|
|
64
87
|
)
|
|
88
|
+
if not cdn:
|
|
89
|
+
continue
|
|
90
|
+
all_device_nodes.append(cdn)
|
|
91
|
+
container_device_nodes.append(cdn)
|
|
65
92
|
|
|
66
|
-
# Add specific container edits for each device
|
|
93
|
+
# Add specific container edits for each device.
|
|
67
94
|
cdi_container_edits = ConfigContainerEdits(
|
|
68
|
-
device_nodes=
|
|
69
|
-
"/dev/alixpu",
|
|
70
|
-
"/dev/alixpu_ctl",
|
|
71
|
-
f"/dev/alixpu_ppu{dev.index}",
|
|
72
|
-
],
|
|
95
|
+
device_nodes=container_device_nodes,
|
|
73
96
|
)
|
|
74
97
|
cdi_devices.append(
|
|
75
98
|
ConfigDevice(
|
|
@@ -87,17 +110,28 @@ class THeadGenerator(Generator):
|
|
|
87
110
|
if not cdi_devices:
|
|
88
111
|
return None
|
|
89
112
|
|
|
90
|
-
# Add common container edits for all devices
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
113
|
+
# Add common container edits for all devices.
|
|
114
|
+
if include_all_devices:
|
|
115
|
+
cdi_devices.append(
|
|
116
|
+
ConfigDevice(
|
|
117
|
+
name="all",
|
|
118
|
+
container_edits=ConfigContainerEdits(
|
|
119
|
+
device_nodes=all_device_nodes,
|
|
120
|
+
),
|
|
96
121
|
),
|
|
97
|
-
)
|
|
98
|
-
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
runtime_env = manufacturer_to_runtime_env(self.manufacturer)
|
|
99
125
|
|
|
100
126
|
return Config(
|
|
101
127
|
kind=kind,
|
|
102
128
|
devices=cdi_devices,
|
|
129
|
+
container_edits=[
|
|
130
|
+
ConfigContainerEdits(
|
|
131
|
+
env=[
|
|
132
|
+
f"{runtime_env}=void",
|
|
133
|
+
],
|
|
134
|
+
device_nodes=common_device_nodes,
|
|
135
|
+
),
|
|
136
|
+
],
|
|
103
137
|
)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
1
|
+
from __future__ import annotations as __future_annotations__
|
|
2
2
|
|
|
3
3
|
import contextlib
|
|
4
4
|
import io
|
|
@@ -10,7 +10,7 @@ import socket
|
|
|
10
10
|
import sys
|
|
11
11
|
import tarfile
|
|
12
12
|
from dataclasses import dataclass, field
|
|
13
|
-
from functools import
|
|
13
|
+
from functools import reduce
|
|
14
14
|
from math import ceil
|
|
15
15
|
from pathlib import Path
|
|
16
16
|
from typing import TYPE_CHECKING, Any
|
|
@@ -21,6 +21,7 @@ import docker.models.containers
|
|
|
21
21
|
import docker.models.images
|
|
22
22
|
import docker.models.volumes
|
|
23
23
|
import docker.types
|
|
24
|
+
from cachetools.func import ttl_cache
|
|
24
25
|
from dataclasses_json import dataclass_json
|
|
25
26
|
from gpustack_runner import split_image
|
|
26
27
|
from tqdm import tqdm
|
|
@@ -54,7 +55,7 @@ from .__utils__ import (
|
|
|
54
55
|
safe_json,
|
|
55
56
|
sensitive_env_var,
|
|
56
57
|
)
|
|
57
|
-
from .cdi import
|
|
58
|
+
from .cdi import dump_config as cdi_dump_config
|
|
58
59
|
|
|
59
60
|
if TYPE_CHECKING:
|
|
60
61
|
from collections.abc import Callable, Generator
|
|
@@ -314,7 +315,7 @@ class DockerDeployer(EndoscopicDeployer):
|
|
|
314
315
|
"""
|
|
315
316
|
|
|
316
317
|
@staticmethod
|
|
317
|
-
@
|
|
318
|
+
@ttl_cache(maxsize=1, ttl=60)
|
|
318
319
|
def is_supported() -> bool:
|
|
319
320
|
"""
|
|
320
321
|
Check if Docker is supported in the current environment.
|
|
@@ -1002,9 +1003,9 @@ class DockerDeployer(EndoscopicDeployer):
|
|
|
1002
1003
|
privileged = create_options.get("privileged", False)
|
|
1003
1004
|
|
|
1004
1005
|
# Generate CDI config if not yet.
|
|
1005
|
-
if cdi and envs.
|
|
1006
|
+
if cdi and envs.GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE:
|
|
1006
1007
|
for re in runtime_env:
|
|
1007
|
-
|
|
1008
|
+
cdi_dump_config(
|
|
1008
1009
|
manufacturer=vd_manus[re],
|
|
1009
1010
|
output=envs.GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY,
|
|
1010
1011
|
)
|
|
@@ -1548,7 +1549,7 @@ class DockerDeployer(EndoscopicDeployer):
|
|
|
1548
1549
|
def _get(
|
|
1549
1550
|
self,
|
|
1550
1551
|
name: WorkloadName,
|
|
1551
|
-
namespace: WorkloadNamespace | None = None,
|
|
1552
|
+
namespace: WorkloadNamespace | None = None,
|
|
1552
1553
|
) -> WorkloadStatus | None:
|
|
1553
1554
|
"""
|
|
1554
1555
|
Get the status of a Docker workload.
|
|
@@ -1670,7 +1671,7 @@ class DockerDeployer(EndoscopicDeployer):
|
|
|
1670
1671
|
@_supported
|
|
1671
1672
|
def _list(
|
|
1672
1673
|
self,
|
|
1673
|
-
namespace: WorkloadNamespace | None = None,
|
|
1674
|
+
namespace: WorkloadNamespace | None = None,
|
|
1674
1675
|
labels: dict[str, str] | None = None,
|
|
1675
1676
|
) -> list[WorkloadStatus]:
|
|
1676
1677
|
"""
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
from __future__ import annotations as __future_annotations__
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
import logging
|
|
6
|
+
import signal
|
|
7
|
+
import threading
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
from .... import envs
|
|
11
|
+
from ....deployer.cdi import dump_config as cdi_dump_config
|
|
12
|
+
from ....detector import ManufacturerEnum, detect_devices, supported_manufacturers
|
|
13
|
+
from .__types__ import GroupedError, PluginServer
|
|
14
|
+
from .plugin import SharableDevicePlugin, cdi_kind_to_kdp_resource
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__package__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
async def serve_async(
    stop_event: asyncio.Event,
    manufacturer: ManufacturerEnum | None = None,
    cdi_generation_output: Path | None = None,
    kubelet_endpoint: Path | None = None,
    start_timeout: int = 5,
    register_timeout: int = 5,
):
    """
    Serve device plugins for all detected devices asynchronously.

    One SharableDevicePlugin is created per detected device and all of them
    are served concurrently, until either the stop event is set or one of
    the servers finishes.

    Args:
        stop_event:
            An asyncio event to signal stopping the server.
        manufacturer:
            Manufacturer for serving.
            If None, detected from current environment.
        cdi_generation_output:
            A directory to store generated CDI configuration files.
            If None, CDI configuration is not stored.
        kubelet_endpoint:
            The path to the kubelet endpoint.
        start_timeout:
            The timeout in seconds for starting the device plugin.
        register_timeout:
            The timeout in seconds for registering the device plugin.

    Raises:
        GroupedError:
            If any device plugin fails during serving,
            wrapping every exception collected from the finished servers.

    """
    manufacturers = [manufacturer] if manufacturer else supported_manufacturers()

    # Create servers for all detected devices.
    servers: list[PluginServer] = []
    for manu in manufacturers:
        devices = detect_devices(
            manufacturer=manu,
        )
        if not devices:
            continue

        # Also works if the manufacturer does not have a CDI generator,
        # which means we are relying on other tools to generate CDI specs.
        if envs.GPUSTACK_RUNTIME_KUBERNETES_KDP_CDI_SPECS_GENERATE:
            generated_content, generated_path = cdi_dump_config(
                manufacturer=manu,
                output=cdi_generation_output,
            )
            if generated_content:
                # Only dump the full content when debugging.
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(
                        "Generated CDI configuration for '%s' at '%s':\n%s",
                        manu,
                        generated_path,
                        generated_content,
                    )
                else:
                    logger.info(
                        "Generated CDI configuration for '%s' at '%s'",
                        manu,
                        generated_path,
                    )
            else:
                logger.warning(
                    "Delegated CDI configuration by other tools for manufacturer '%s', "
                    "e.g. NVIDIA Container Toolkit Manual CDI Specification Generation, "
                    "see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html#manual-cdi-specification-generation",
                    manu,
                )

        for dev in devices:
            servers.append(
                SharableDevicePlugin(
                    device=dev,
                    id_by="index" if manu == ManufacturerEnum.ASCEND else "uuid",
                ),
            )

    if not servers:
        logger.warning("No supported devices found. Waiting for stop event...")
        await stop_event.wait()
        logger.info("Stop event triggered, shutting down...")
        return

    # Create tasks to start all servers.
    serve_tasks = [
        asyncio.create_task(
            server.serve(
                stop_event=stop_event,
                kubelet_endpoint=kubelet_endpoint,
                start_timeout=start_timeout,
                register_timeout=register_timeout,
            ),
            name=f"serve-{server.name}",
        )
        for server in servers
    ]

    # Create a task to wait for the stop event.
    stop_task = asyncio.create_task(
        stop_event.wait(),
        name="stop",
    )

    try:
        # Wait for stop event or any server to fail.
        done, pending = await asyncio.wait(
            [*serve_tasks, stop_task],
            return_when=asyncio.FIRST_COMPLETED,
        )

        # Check if stop event was triggered.
        if stop_task in done:
            logger.info("Stop event triggered, shutting down servers...")
            for task in serve_tasks:
                if not task.done():
                    task.cancel()
            # Let the cancelled servers run their cleanup before returning.
            await asyncio.gather(*serve_tasks, return_exceptions=True)
            logger.info("All servers shut down gracefully")
            return

        # Otherwise, one or more servers have finished or failed.
        errs = []
        for task in done:
            # Task.exception() raises CancelledError on a cancelled task,
            # so cancelled tasks must be skipped explicitly.
            if task.cancelled():
                continue
            if err := task.exception():
                errs.append(err)
        if errs:
            logger.error(
                "One or more servers have failed, shutting down remaining servers...",
            )

        # Cancel what is still running and wait for it to unwind,
        # so that no task is left pending when the caller's loop stops.
        for task in pending:
            task.cancel()
        await asyncio.gather(*pending, return_exceptions=True)

        if errs:
            raise GroupedError(errs)
    finally:
        # Ensure all tasks are cleaned up, including the stop waiter.
        for task in (*serve_tasks, stop_task):
            if not task.done():
                task.cancel()
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def serve(
    manufacturer: ManufacturerEnum | None = None,
    cdi_generation_output: Path | None = None,
    kubelet_endpoint: Path | None = None,
    start_timeout: int = 5,
    register_timeout: int = 5,
):
    """
    Serve device plugins for all detected devices.

    Blocking wrapper around serve_async: it picks (or creates) an event loop,
    installs SIGINT/SIGTERM handlers that set the stop event, runs the
    asynchronous server to completion, and finally undoes the signal handling
    it installed.

    Args:
        manufacturer:
            Manufacturer for serving.
            If None, detected from current environment.
        cdi_generation_output:
            A directory to store generated CDI configuration files.
            If None, CDI configuration is not stored.
        kubelet_endpoint:
            The path to the kubelet endpoint.
        start_timeout:
            The timeout in seconds for starting the device plugin.
        register_timeout:
            The timeout in seconds for registering the device plugin.

    Raises:
        If any device plugin fails during serving, the exception is raised.

    """
    # Ensure we're running in the main thread.
    if threading.current_thread() != threading.main_thread():
        logger.warning("Serve should be called from main thread")

    # Reuse the current event loop when there is a usable one,
    # otherwise create a private loop: get_event_loop() raises RuntimeError
    # when no loop is set for this thread, and a closed loop cannot run again.
    owns_loop = False
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        owns_loop = True

    # Create a stop event and register signal handlers.
    stop_event = asyncio.Event()

    def handle_signal(_s, _f):
        logger.info("Received termination signal, stopping servers...")
        stop_event.set()

    def handle_signal_fallback(_s, _f):
        # Plain signal handlers run outside of the loop's callbacks,
        # and asyncio.Event is not thread-safe, so hop onto the loop.
        # RuntimeError means the loop is already closed: nothing to stop.
        with contextlib.suppress(RuntimeError):
            loop.call_soon_threadsafe(handle_signal, _s, _f)

    # Register signal handlers for graceful shutdown.
    signals = [signal.SIGINT, signal.SIGTERM]
    # Remember the current handlers, to restore them if the fallback is used.
    previous_handlers = {sig: signal.getsignal(sig) for sig in signals}
    use_fallback = False
    try:
        for sig in signals:
            loop.add_signal_handler(sig, handle_signal, sig, None)
    except (NotImplementedError, RuntimeError):
        use_fallback = True
        for sig in signals:
            try:
                signal.signal(sig, handle_signal_fallback)
            except ValueError:
                # signal.signal() only works in the main thread.
                logger.warning(
                    "Failed to register handler for signal %s outside of main thread",
                    sig,
                )

    # Run the asynchronous server.
    try:
        loop.run_until_complete(
            serve_async(
                stop_event=stop_event,
                manufacturer=manufacturer,
                cdi_generation_output=cdi_generation_output,
                kubelet_endpoint=kubelet_endpoint,
                start_timeout=start_timeout,
                register_timeout=register_timeout,
            ),
        )
    finally:
        # Remove signal handlers to avoid side effects.
        try:
            for sig in signals:
                loop.remove_signal_handler(sig)
        except (NotImplementedError, RuntimeError):
            pass
        if use_fallback:
            for sig, previous in previous_handlers.items():
                # None means the handler was not installed from Python,
                # so it cannot be re-installed through signal.signal().
                if previous is None:
                    continue
                with contextlib.suppress(ValueError):
                    signal.signal(sig, previous)
        # Only tear down the loop that was created here.
        if owns_loop:
            asyncio.set_event_loop(None)
            loop.close()
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
__all__ = [
|
|
237
|
+
"cdi_kind_to_kdp_resource",
|
|
238
|
+
"serve",
|
|
239
|
+
"serve_async",
|
|
240
|
+
]
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from __future__ import annotations as __future_annotations__
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
import asyncio
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class GroupedError(Exception):
    """
    Exception to encapsulate multiple errors.

    The collected errors are rendered as a numbered list by ``str()``.

    Args:
        errors:
            A list of exceptions that occurred.

    """

    _errors: list[BaseException | Exception]
    """
    Errors that occurred.
    """

    def __init__(self, errors: list[BaseException | Exception] | None = None):
        """
        Initialize the GroupedError exception.

        Args:
            errors:
                A list of exceptions that occurred.
                The list is copied, so later `append`/`extend` calls
                never mutate the caller's list.

        """
        # Copy instead of aliasing: previously a non-empty list was shared
        # with the caller while an empty one was silently replaced.
        self._errors = list(errors) if errors else []
        # Hand the internal list to the base class, so that `args`
        # always reflects the errors collected so far.
        super().__init__(self._errors)

    def append(self, error: BaseException | Exception):
        """
        Append an error to the GroupedError.

        Args:
            error:
                The exception to append.

        """
        self._errors.append(error)

    def extend(self, errors: list[BaseException | Exception]):
        """
        Extend the GroupedError with multiple errors.

        Args:
            errors:
                A list of exceptions to extend with.

        """
        self._errors.extend(errors)

    def __str__(self) -> str:
        """
        Get the string representation of the GroupedError exception.

        Returns:
            A header line with the number of errors,
            followed by one numbered line per error.

        """
        error_messages = "\n".join(
            f"{i}. {error!s}" for i, error in enumerate(self._errors, start=1)
        )
        return f"{len(self._errors)} errors occurred:\n{error_messages}"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class PluginServer(ABC):
    """
    Abstract server interface for Kubernetes device plugins.

    Concrete plugins provide the `serve` coroutine;
    this base only stores and exposes the plugin name.

    """

    # Name of the device plugin, exposed read-only through `name`.
    _name: str

    def __init__(self, name: str):
        """
        Create the device plugin.

        Args:
            name:
                The name to give the device plugin.

        """
        self._name = name

    @property
    def name(self) -> str:
        """
        Name of the device plugin.

        Returns:
            The name given at construction time.

        """
        return self._name

    @abstractmethod
    async def serve(
        self,
        stop_event: asyncio.Event,
        kubelet_endpoint: Path | None = None,
        start_timeout: int = 5,
        register_timeout: int = 5,
    ):
        """
        Run the device plugin server asynchronously.

        Args:
            stop_event:
                An asyncio event used to signal the server to stop.
            kubelet_endpoint:
                The path to the kubelet endpoint.
                Defaults to None, meaning the default kubelet socket path.
            start_timeout:
                Seconds to wait for the device plugin server to start.
                Defaults to 5 seconds.
            register_timeout:
                Seconds to wait for the device plugin to register.
                Defaults to 5 seconds.

        Raises:
            NotImplementedError:
                Always, unless overridden by a subclass.

        """
        raise NotImplementedError
|