gpustack-runtime 0.1.40.post1__py3-none-any.whl → 0.1.41.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/__init__.py +1 -1
- gpustack_runtime/__main__.py +5 -3
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/__init__.py +5 -3
- gpustack_runtime/cmds/__types__.py +1 -1
- gpustack_runtime/cmds/deployer.py +140 -18
- gpustack_runtime/cmds/detector.py +1 -1
- gpustack_runtime/cmds/images.py +1 -1
- gpustack_runtime/deployer/__init__.py +28 -2
- gpustack_runtime/deployer/__patches__.py +1 -1
- gpustack_runtime/deployer/__types__.py +2 -1
- gpustack_runtime/deployer/__utils__.py +2 -2
- gpustack_runtime/deployer/cdi/__init__.py +86 -5
- gpustack_runtime/deployer/cdi/__types__.py +92 -29
- gpustack_runtime/deployer/cdi/__utils__.py +180 -0
- gpustack_runtime/deployer/cdi/amd.py +146 -0
- gpustack_runtime/deployer/cdi/ascend.py +164 -0
- gpustack_runtime/deployer/cdi/hygon.py +147 -0
- gpustack_runtime/deployer/cdi/iluvatar.py +136 -0
- gpustack_runtime/deployer/cdi/metax.py +148 -0
- gpustack_runtime/deployer/cdi/thead.py +57 -23
- gpustack_runtime/deployer/docker.py +9 -8
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +325 -0
- gpustack_runtime/deployer/k8s/deviceplugin/__types__.py +131 -0
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +590 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py +3 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto +212 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py +86 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi +168 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py +358 -0
- gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py +34 -0
- gpustack_runtime/deployer/kuberentes.py +50 -4
- gpustack_runtime/deployer/podman.py +9 -8
- gpustack_runtime/detector/__init__.py +42 -5
- gpustack_runtime/detector/__types__.py +8 -24
- gpustack_runtime/detector/__utils__.py +46 -39
- gpustack_runtime/detector/amd.py +55 -66
- gpustack_runtime/detector/ascend.py +29 -41
- gpustack_runtime/detector/cambricon.py +3 -3
- gpustack_runtime/detector/hygon.py +21 -49
- gpustack_runtime/detector/iluvatar.py +44 -60
- gpustack_runtime/detector/metax.py +54 -37
- gpustack_runtime/detector/mthreads.py +74 -36
- gpustack_runtime/detector/nvidia.py +130 -93
- gpustack_runtime/detector/pyacl/__init__.py +1 -1
- gpustack_runtime/detector/pyamdgpu/__init__.py +1 -1
- gpustack_runtime/detector/pyamdsmi/__init__.py +1 -1
- gpustack_runtime/detector/pycuda/__init__.py +1 -1
- gpustack_runtime/detector/pydcmi/__init__.py +1 -1
- gpustack_runtime/detector/pyhsa/__init__.py +1 -1
- gpustack_runtime/detector/pymxsml/__init__.py +1553 -1
- gpustack_runtime/detector/pyrocmcore/__init__.py +1 -1
- gpustack_runtime/detector/pyrocmsmi/__init__.py +1 -1
- gpustack_runtime/detector/thead.py +41 -60
- gpustack_runtime/envs.py +106 -12
- gpustack_runtime/logging.py +6 -2
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/METADATA +6 -1
- gpustack_runtime-0.1.41.post1.dist-info/RECORD +67 -0
- gpustack_runtime/detector/pymxsml/mxsml.py +0 -1580
- gpustack_runtime/detector/pymxsml/mxsml_extension.py +0 -816
- gpustack_runtime/detector/pymxsml/mxsml_mcm.py +0 -476
- gpustack_runtime-0.1.40.post1.dist-info/RECORD +0 -55
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.40.post1.dist-info → gpustack_runtime-0.1.41.post1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,590 @@
|
|
|
1
|
+
from __future__ import annotations as __future_annotations__
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
import logging
|
|
6
|
+
from functools import lru_cache
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
9
|
+
|
|
10
|
+
import grpc
|
|
11
|
+
from grpc_interceptor import AsyncServerInterceptor
|
|
12
|
+
from grpc_interceptor.exceptions import GrpcException
|
|
13
|
+
|
|
14
|
+
from .... import envs
|
|
15
|
+
from ....detector import Device, str_range_to_list
|
|
16
|
+
from ...cdi import (
|
|
17
|
+
generate_config,
|
|
18
|
+
manufacturer_to_cdi_kind,
|
|
19
|
+
manufacturer_to_runtime_env,
|
|
20
|
+
)
|
|
21
|
+
from ..types.kubelet.deviceplugin.v1beta1 import (
|
|
22
|
+
AllocateRequest,
|
|
23
|
+
AllocateResponse,
|
|
24
|
+
CDIDevice,
|
|
25
|
+
ContainerAllocateRequest,
|
|
26
|
+
ContainerAllocateResponse,
|
|
27
|
+
ContainerPreferredAllocationRequest,
|
|
28
|
+
ContainerPreferredAllocationResponse,
|
|
29
|
+
DevicePluginOptions,
|
|
30
|
+
DevicePluginServicer,
|
|
31
|
+
DeviceSpec,
|
|
32
|
+
Empty,
|
|
33
|
+
Healthy,
|
|
34
|
+
ListAndWatchResponse,
|
|
35
|
+
Mount,
|
|
36
|
+
NUMANode,
|
|
37
|
+
PreferredAllocationRequest,
|
|
38
|
+
PreferredAllocationResponse,
|
|
39
|
+
PreStartContainerRequest,
|
|
40
|
+
PreStartContainerResponse,
|
|
41
|
+
RegisterRequest,
|
|
42
|
+
RegistrationStub,
|
|
43
|
+
TopologyInfo,
|
|
44
|
+
Version,
|
|
45
|
+
add_DevicePluginServicer_to_server,
|
|
46
|
+
)
|
|
47
|
+
from ..types.kubelet.deviceplugin.v1beta1 import (
|
|
48
|
+
Device as DevicePluginDevice,
|
|
49
|
+
)
|
|
50
|
+
from .__types__ import PluginServer
|
|
51
|
+
|
|
52
|
+
if TYPE_CHECKING:
|
|
53
|
+
from collections.abc import AsyncIterator, Callable
|
|
54
|
+
|
|
55
|
+
logger = logging.getLogger(__name__)
|
|
56
|
+
|
|
57
|
+
_ID_SPLIT = "::"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _to_device_plugin_device_id(device_id: str, shard: int) -> str:
|
|
61
|
+
"""
|
|
62
|
+
Converts a device ID and shard number to a Kubernetes Device Plugin device ID.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
device_id:
|
|
66
|
+
The base device ID.
|
|
67
|
+
shard:
|
|
68
|
+
The shard number.
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
The combined device plugin device ID.
|
|
72
|
+
|
|
73
|
+
"""
|
|
74
|
+
return f"{device_id}{_ID_SPLIT}{shard}"
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _from_device_plugin_device_id(device_plugin_device_id: str) -> tuple[str, int]:
|
|
78
|
+
"""
|
|
79
|
+
Converts a Kubernetes Device Plugin device ID to its base device ID and shard number.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
device_plugin_device_id:
|
|
83
|
+
The combined device plugin device ID.
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
A tuple containing the base device ID and shard number.
|
|
87
|
+
|
|
88
|
+
"""
|
|
89
|
+
parts = device_plugin_device_id.split(_ID_SPLIT)
|
|
90
|
+
device_id = parts[0]
|
|
91
|
+
shard = int(parts[1])
|
|
92
|
+
return device_id, shard
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class SharableDevicePlugin(PluginServer, DevicePluginServicer):
    """
    SharableDevicePlugin is a Kubernetes Device Plugin that supports device sharing.

    It allows multiple containers to share the same underlying device by
    creating multiple device IDs (shards) for each physical device.

    """

    _device: Device
    """
    The underlying device to be managed by this device plugin.
    """
    _id_by: Literal["uuid", "index"]
    """
    Controls how the device IDs of the Kubernetes Device Plugin are generated.
    """
    _allocation_policy: Literal["env", "cdi", "opaque"]
    """
    Controls the device allocation policy.
    """
    _max_allocations: int
    """
    Controls the maximum shards per underlying device.
    """
    _cdi_kind: str
    """
    The CDI kind associated with the `_device`.
    """
    _runtime_env: str
    """
    The runtime environment associated with the `_device`.
    """
    _kdp_resource: str
    """
    The device plugin resource name associated with the `_device`.
    """

    def __init__(
        self,
        device: Device,
        id_by: Literal["uuid", "index"] = "uuid",
        allocation_policy: Literal["env", "cdi", "opaque"] = "cdi",
        max_allocations: int | None = None,
    ):
        """
        Initializes the SharableDevicePlugin.

        Args:
            device:
                The underlying device to be managed by this device plugin.
            id_by:
                Controls how the device IDs of the Kubernetes Device Plugin are generated.
                Either "uuid" or "index". Default is "uuid".
            allocation_policy:
                Controls the device allocation policy.
            max_allocations:
                Controls the maximum allocations per underlying device.
                If None, uses the environment variable `GPUSTACK_RUNTIME_KUBERNETES_KDP_PER_DEVICE_MAX_ALLOCATIONS`.

        """
        self._device = device
        self._id_by = id_by
        # NOTE(review): `_allocation_policy` is stored here but `_allocate`
        # reads the policy from GPUSTACK_RUNTIME_KUBERNETES_KDP_DEVICE_ALLOCATION_POLICY
        # instead — confirm which of the two is intended to win.
        self._allocation_policy = allocation_policy
        self._max_allocations = max_allocations
        # Fall back to the environment default when unset or zero,
        # then clamp so each device always advertises at least one shard.
        if not self._max_allocations:
            self._max_allocations = (
                envs.GPUSTACK_RUNTIME_KUBERNETES_KDP_PER_DEVICE_MAX_ALLOCATIONS
            )
        self._max_allocations = max(self._max_allocations, 1)
        self._cdi_kind = manufacturer_to_cdi_kind(device.manufacturer)
        self._runtime_env = manufacturer_to_runtime_env(device.manufacturer)
        # One kubelet resource name per physical device (kind + index),
        # so each device is advertised as its own resource.
        self._kdp_resource = cdi_kind_to_kdp_resource(
            cdi_kind=self._cdi_kind,
            device_index=device.index,
        )

        super().__init__(self._kdp_resource)

    @contextlib.asynccontextmanager
    async def _serve_device_plugin(
        self,
        endpoint: Path,
        timeout: int = 5,
    ) -> AsyncIterator[None]:
        """
        Serve the device plugin asynchronously.

        Args:
            endpoint:
                The path to the device plugin server endpoint.
            timeout:
                The timeout in seconds for starting the device plugin.

        """
        # Remove any stale socket left over from a previous run.
        endpoint.unlink(missing_ok=True)

        # Create the device plugin server.
        server = grpc.aio.server(
            interceptors=[
                _LoggingInterceptor(
                    name=self._kdp_resource,
                ),
            ],
        )
        server.add_insecure_port(
            address=f"unix://{endpoint}",
        )

        # Start the device plugin server.
        add_DevicePluginServicer_to_server(self, server)
        server_task = asyncio.create_task(server.start())

        # Wait for the server to be ready,
        # then yield for use within the context,
        # and ensure proper shutdown afterward.
        try:
            loop = asyncio.get_event_loop()
            start_time = loop.time()
            # Readiness probe: poll until the unix socket exists and accepts
            # a connection, or give up once `timeout` elapses.
            while (loop.time() - start_time) < timeout:
                if endpoint.exists():
                    with contextlib.suppress(ConnectionError, asyncio.TimeoutError):
                        _, writer = await asyncio.wait_for(
                            asyncio.open_unix_connection(endpoint),
                            timeout=0.1,
                        )
                        writer.close()
                        await writer.wait_closed()
                        break

                await asyncio.sleep(0.05)
            else:
                # while-else: loop exhausted without a successful probe.
                msg = f"Failed to start device plugin server within {timeout} seconds"
                raise TimeoutError(msg)

            yield
        finally:
            # Graceful stop, then clean up the socket file.
            await server.stop(grace=3.0)
            await server_task
            endpoint.unlink(missing_ok=True)

    async def serve(
        self,
        stop_event: asyncio.Event,
        kubelet_endpoint: Path,
        start_timeout: int = 5,
        register_timeout: int = 5,
    ):
        """
        Serve the device plugin asynchronously.

        Args:
            stop_event:
                An asyncio event to signal stopping the server.
            kubelet_endpoint:
                The path to the kubelet endpoint.
            start_timeout:
                The timeout in seconds for starting the device plugin.
            register_timeout:
                The timeout in seconds for registering the device plugin.

        """
        resource_name = self._kdp_resource
        # The plugin socket lives next to the kubelet socket; '/' in the
        # resource name is not valid in a filename, so it is replaced by '.'.
        endpoint = kubelet_endpoint.parent / f"{resource_name.replace('/', '.')}.sock"

        async with self._serve_device_plugin(
            endpoint=endpoint,
            timeout=start_timeout,
        ):
            # Register this plugin with the kubelet over its unix socket.
            request = RegisterRequest(
                version=Version,
                endpoint=str(endpoint.name),
                resource_name=resource_name,
                options=self._get_device_plugin_options(),
            )
            async with grpc.aio.insecure_channel(
                target=f"unix://{kubelet_endpoint}",
            ) as channel:
                stub = RegistrationStub(channel)
                try:
                    await stub.Register(
                        request=request,
                        timeout=register_timeout,
                    )
                except Exception:
                    logger.exception(
                        f"Failed to register device plugin for resource '{resource_name}' "
                        f"at endpoint '{endpoint}'",
                    )
                    raise

            logger.info(
                f"Serving device plugin for resource '{resource_name}' "
                f"at endpoint '{endpoint}'",
            )
            # Block until the caller signals shutdown.
            await stop_event.wait()

    @staticmethod
    def _get_device_plugin_options() -> DevicePluginOptions:
        """
        Returns the device plugin options.

        Returns:
            The device plugin options.

        """
        return DevicePluginOptions(
            pre_start_required=False,
            get_preferred_allocation_available=True,
        )

    async def GetDevicePluginOptions(  # noqa: N802
        self,
        req: Empty,
        ctx: grpc.aio.ServicerContext,
    ) -> DevicePluginOptions:
        """
        Returns the device plugin options.

        Args:
            req:
                An empty request message.
            ctx:
                The request context.

        Returns:
            The device plugin options.

        """
        return self._get_device_plugin_options()

    async def ListAndWatch(  # noqa: N802
        self,
        req: Empty,
        ctx: grpc.aio.ServicerContext,
    ) -> AsyncIterator[ListAndWatchResponse]:
        """
        List and watch for device changes.

        Args:
            req:
                An empty request message.
            ctx:
                The request context.

        Yields:
            The response containing the list of devices.

        """
        device_id = (
            self._device.uuid if self._id_by == "uuid" else str(self._device.index)
        )

        dp_devices: list[DevicePluginDevice] = []
        # All shards are advertised as Healthy until the health check lands.
        dp_device_health = Healthy
        # NUMA topology is taken from the device appendix, defaulting to node 0;
        # the "numa" value may be a range string (e.g. "0-1").
        dp_device_topo = TopologyInfo(
            nodes=[
                NUMANode(
                    ID=node_id,
                )
                for node_id in str_range_to_list(
                    self._device.appendix.get("numa", "0"),
                )
            ],
        )

        # Advertise `_max_allocations` shard IDs (1-based) for this one
        # physical device so it can be allocated to multiple containers.
        for device_replica in range(1, self._max_allocations + 1):
            dp_device_id = _to_device_plugin_device_id(device_id, device_replica)
            dp_devices.append(
                DevicePluginDevice(
                    ID=dp_device_id,
                    health=dp_device_health,
                    topology=dp_device_topo,
                ),
            )

        yield ListAndWatchResponse(
            devices=dp_devices,
        )

        # TODO(thxCode): implement health check

        # Keep the stream open until the kubelet cancels it.
        while not ctx.done():  # noqa: ASYNC110
            await asyncio.sleep(15)

    @staticmethod
    def _get_preferred_allocation(
        req: ContainerPreferredAllocationRequest,
    ) -> ContainerPreferredAllocationResponse:
        """
        Compute the preferred device IDs for one container request.

        Prefers, in order: the whole available set when it exactly matches the
        requested size, the must-include set when it exactly matches, otherwise
        the must-include set topped up from the available set.
        """
        available_dp_device_ids = req.available_deviceIDs
        required_dp_device_ids = req.must_include_deviceIDs
        allocation_size = req.allocation_size

        if len(available_dp_device_ids) == allocation_size:
            return ContainerPreferredAllocationResponse(
                deviceIDs=available_dp_device_ids,
            )

        if len(required_dp_device_ids) == allocation_size:
            return ContainerPreferredAllocationResponse(
                deviceIDs=required_dp_device_ids,
            )

        # Start from the required IDs and fill the remainder from the
        # available ones until the requested size is reached.
        selected_dp_device_ids = list(required_dp_device_ids)
        for dp_device_id in available_dp_device_ids:
            if dp_device_id not in selected_dp_device_ids:
                selected_dp_device_ids.append(dp_device_id)
                if len(selected_dp_device_ids) == allocation_size:
                    break

        return ContainerPreferredAllocationResponse(
            deviceIDs=selected_dp_device_ids,
        )

    async def GetPreferredAllocation(  # noqa: N802
        self,
        req: PreferredAllocationRequest,
        ctx: grpc.aio.ServicerContext,
    ) -> PreferredAllocationResponse:
        """
        Return preferred allocations for each container request in `req`.
        """
        allocation_responses = []
        for request in req.container_requests:
            allocation_responses.append(
                self._get_preferred_allocation(request),
            )

        return PreferredAllocationResponse(
            container_responses=allocation_responses,
        )

    def _allocate(
        self,
        req: ContainerAllocateRequest,
    ) -> ContainerAllocateResponse:
        """
        Build the allocate response for a single container request, according
        to the allocation policy ("cdi", "env", or opaque fallback).
        """
        # NOTE(review): the policy is read from the environment here, while the
        # `allocation_policy` constructor argument is stored but never consulted
        # — confirm which source is authoritative.
        policy = envs.GPUSTACK_RUNTIME_KUBERNETES_KDP_DEVICE_ALLOCATION_POLICY.lower()

        request_dp_device_ids = req.devices_ids

        # CDI device allocation.
        if policy == "cdi":
            cdi_devices: list[CDIDevice] = []
            for dp_device_id in request_dp_device_ids:
                # Drop the shard suffix; shards of the same device map to the
                # same CDI device name.
                device_id, _ = _from_device_plugin_device_id(dp_device_id)
                cdi_devices.append(
                    CDIDevice(
                        name=f"{self._cdi_kind}={device_id}",
                    ),
                )

            return ContainerAllocateResponse(
                cdi_devices=cdi_devices,
            )

        # Environment variable device allocation.
        if policy == "env":
            # NOTE(review): this joins the sharded IDs ("<id>::<shard>")
            # verbatim — confirm the runtime env var consumer accepts the
            # shard suffix, or whether base IDs should be deduplicated first.
            return ContainerAllocateResponse(
                envs={
                    self._runtime_env: ",".join(request_dp_device_ids),
                },
            )

        # Opaque device allocation.
        if cdi_cfg := generate_config(self._device):
            dev_envs: dict[str, str] = {}
            dev_mounts: list[Mount] = []
            dev_devices: list[DeviceSpec] = []

            # Flatten the config-level and per-device container edits into
            # one list before translating them.
            # NOTE(review): when `cdi_cfg.container_edits` is non-empty this
            # appends into it in place — fine only if `generate_config`
            # returns a fresh config per call; verify it is not cached.
            container_edits = cdi_cfg.container_edits or []
            for cdi_dev in cdi_cfg.devices:
                container_edits.append(cdi_dev.container_edits)

            # Translate CDI container edits into device-plugin response fields.
            for edit in container_edits:
                for e in edit.env or []:
                    # CDI env entries are "KEY=VALUE" strings.
                    k, v = e.split("=", 1)
                    dev_envs[k] = v
                for m in edit.mounts or []:
                    dev_mounts.append(
                        Mount(
                            container_path=m.container_path,
                            host_path=m.host_path,
                            read_only="ro" in (m.options or []),
                        ),
                    )
                for d in edit.device_nodes or []:
                    dev_devices.append(
                        DeviceSpec(
                            container_path=d.path,
                            host_path=d.host_path or d.path,
                            permissions=d.permissions,
                        ),
                    )

            return ContainerAllocateResponse(
                envs=dev_envs,
                mounts=dev_mounts,
                devices=dev_devices,
            )

        # No CDI config could be generated: nothing to inject.
        return ContainerAllocateResponse()

    async def Allocate(  # noqa: N802
        self,
        req: AllocateRequest,
        ctx: grpc.aio.ServicerContext,
    ) -> AllocateResponse:
        """
        Allocate devices for each container request in `req`.
        """
        allocation_response = []
        for request in req.container_requests:
            allocation_response.append(
                self._allocate(request),
            )

        return AllocateResponse(
            container_responses=allocation_response,
        )

    async def PreStartContainer(  # noqa: N802
        self,
        req: PreStartContainerRequest,
        ctx: grpc.aio.ServicerContext,
    ) -> PreStartContainerResponse:
        """
        No-op: `pre_start_required` is False in the plugin options.
        """
        return PreStartContainerResponse()
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
@lru_cache
def cdi_kind_to_kdp_resource(
    cdi_kind: str,
    device_index: int,
) -> str:
    """
    Map CDI kind and device index to a Kubernetes Device Plugin resource name.

    The resource name is simply the CDI kind with the device index appended,
    giving each physical device its own resource. Results are memoized.

    Args:
        cdi_kind:
            The CDI kind.
        device_index:
            The index of the device.

    Returns:
        The resource name for the device plugin.

    """
    return cdi_kind + str(device_index)
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
class _LoggingInterceptor(AsyncServerInterceptor):
    """
    Async gRPC server interceptor that debug-logs every request/response pair,
    prefixing each log line with the server name.
    """

    _name: str
    """
    Name of the server.
    """

    def __init__(self, name: str):
        self._name = name

    def log_debug(self, msg, *args, **kwargs):
        # Prefix with the server name so logs from multiple plugins are distinguishable.
        logger.debug(f"[{self._name}] {msg}", *args, **kwargs)

    def log_exception(self, msg, *args, **kwargs):
        logger.exception(f"[{self._name}] {msg}", *args, **kwargs)

    @staticmethod
    @lru_cache
    def simplify_method_name(method_name: str) -> str:
        # Full gRPC method names look like "/package.Service/Method";
        # keep only the trailing "Method" component.
        return Path(method_name).name

    async def intercept(
        self,
        method: Callable,
        request: Any,
        context: grpc.aio.ServicerContext,
        method_name: str,
    ) -> object:
        """
        Log the incoming request, invoke `method`, and log the outcome.

        Streaming requests/responses are detected via `__aiter__` and logged
        without being consumed; unary responses are awaited before logging.
        Exceptions are logged and re-raised unchanged.
        """
        method_name = self.simplify_method_name(method_name)

        if hasattr(request, "__aiter__"):
            self.log_debug(f"{method_name} received streaming request")
        elif isinstance(request, Empty):
            self.log_debug(f"{method_name} received empty request")
        else:
            self.log_debug(f"{method_name} received request:\n{request}")
        try:
            response = method(request, context)
            if hasattr(response, "__aiter__"):
                # Server-streaming handler: return the async iterator as-is
                # (awaiting it would break the stream).
                self.log_debug(f"{method_name} returning streaming response")
            else:
                response = await response
                if isinstance(response, Empty):
                    self.log_debug(f"{method_name} returned empty response")
                else:
                    self.log_debug(f"{method_name} returned response:\n{response}")
        except GrpcException:
            self.log_exception(f"{method_name} raised grpc exception")
            raise
        except Exception:
            self.log_exception(f"{method_name} raised unexpected exception")
            raise
        else:
            return response