gpustack-runtime 0.1.41.post2__py3-none-any.whl → 0.1.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +3 -1
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/docker.py +109 -148
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +21 -3
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
- gpustack_runtime/deployer/kuberentes.py +91 -126
- gpustack_runtime/deployer/podman.py +89 -122
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/amd.py +28 -8
- gpustack_runtime/detector/ascend.py +49 -4
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +16 -1
- gpustack_runtime/detector/iluvatar.py +6 -0
- gpustack_runtime/detector/metax.py +8 -0
- gpustack_runtime/detector/mthreads.py +11 -0
- gpustack_runtime/detector/nvidia.py +139 -134
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +135 -127
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/pyixml/__init__.py
CHANGED

@@ -960,6 +960,14 @@ NVML_HOST_VGPU_MODE_SRIOV = 1
 # GSP firmware
 NVML_GSP_FIRMWARE_VERSION_BUF_SIZE = 0x40

+# Health
+IXML_HEALTH_SYSHUB_ERROR = 0x0000000000000001
+IXML_HEALTH_MC_ERROR = 0x0000000000000002
+IXML_HEALTH_ECC_ERROR = 0x0000000000000010
+IXML_HEALTH_MEMORY_ERROR = 0x0000000000000020
+IXML_HEALTH_PCIE_ERROR = 0x0000000000000040
+IXML_HEALTH_OK = 0x0000000000000000
+

 ## Error Checking ##
 class NVMLError(Exception):

@@ -5267,3 +5275,11 @@ def nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo):
     ret = fn(device, gpuFabricInfo)
     _nvmlCheckReturn(ret)
     return ret
+
+
+def ixmlDeviceGetHealth(device):
+    c_health = c_longlong()
+    fn = _nvmlGetFunctionPointer("ixmlDeviceGetHealth")
+    ret = fn(device, byref(c_health))
+    _nvmlCheckReturn(ret)
+    return c_health.value
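The new ixmlDeviceGetHealth wrapper returns a bitmask that can be tested against the IXML_HEALTH_* constants added above. A minimal usage sketch follows; it assumes pyixml also exposes the usual pynvml-style nvmlInit/nvmlDeviceGetCount/nvmlDeviceGetHandleByIndex/nvmlShutdown helpers, which this diff does not show.

# Hedged sketch: iterate devices and decode the health bitmask.
from gpustack_runtime.detector import pyixml

pyixml.nvmlInit()  # assumed pynvml-style initializer
try:
    for idx in range(pyixml.nvmlDeviceGetCount()):
        handle = pyixml.nvmlDeviceGetHandleByIndex(idx)
        health = pyixml.ixmlDeviceGetHealth(handle)  # added in 0.1.42
        if health == pyixml.IXML_HEALTH_OK:
            print(f"device {idx}: healthy")
            continue
        # The value is a bitmask; test the individual error bits.
        if health & pyixml.IXML_HEALTH_ECC_ERROR:
            print(f"device {idx}: ECC error reported")
        if health & pyixml.IXML_HEALTH_MEMORY_ERROR:
            print(f"device {idx}: memory error reported")
        if health & pyixml.IXML_HEALTH_PCIE_ERROR:
            print(f"device {idx}: PCIe error reported")
finally:
    pyixml.nvmlShutdown()  # assumed pynvml-style teardown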
gpustack_runtime/detector/pyrocmsmi/__init__.py
CHANGED

@@ -393,3 +393,17 @@ def rsmi_is_p2p_accessible(device_a=0, device_b=0):
     )
     _rocmsmiCheckReturn(ret)
     return c_accessible.value
+
+
+def rsmi_dev_ecc_count_get(device=0, gpu_block=None):
+    if gpu_block is None:
+        gpu_block = rsmi_gpu_block_t.RSMI_GPU_BLOCK_UMC
+    c_error_count = rsmi_error_count_t()
+    fn = _rocmsmiGetFunctionPointer("rsmi_dev_ecc_count_get")
+    ret = fn(
+        device,
+        gpu_block,
+        byref(c_error_count),
+    )
+    _rocmsmiCheckReturn(ret)
+    return c_error_count
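The new rsmi_dev_ecc_count_get wrapper defaults to the UMC (memory-controller) block and returns an rsmi_error_count_t structure. A hedged usage sketch follows; it assumes pyrocmsmi also wraps rsmi_init/rsmi_shut_down and rsmi_num_monitor_devices, and that the returned structure carries the ROCm SMI correctable_err/uncorrectable_err fields — none of which appear in this hunk.

# Hedged sketch: report ECC counters for every visible AMD GPU.
from gpustack_runtime.detector import pyrocmsmi

pyrocmsmi.rsmi_init()  # assumed wrapper around rsmi_init(0)
try:
    for dev in range(pyrocmsmi.rsmi_num_monitor_devices()):  # assumed helper
        ecc = pyrocmsmi.rsmi_dev_ecc_count_get(dev)  # UMC block by default
        print(
            f"device {dev}: correctable={ecc.correctable_err} "
            f"uncorrectable={ecc.uncorrectable_err}"
        )
finally:
    pyrocmsmi.rsmi_shut_down()  # assumed teardown wrapper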
gpustack_runtime/detector/thead.py
CHANGED

@@ -12,6 +12,7 @@ from . import pyhgml
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,

@@ -138,17 +139,33 @@ class THeadDetector(Detector):
     )
     dev_numa = bitmask_to_str(list(dev_node_affinity))

+    dev_temp = None
+    with contextlib.suppress(pyhgml.HGMLError):
+        dev_temp = pyhgml.hgmlDeviceGetTemperature(
+            dev,
+            pyhgml.HGML_TEMPERATURE_GPU,
+        )
+
+    dev_power = None
+    dev_power_used = None
+    with contextlib.suppress(pyhgml.HGMLError):
+        dev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(dev)
+        dev_power = dev_power // 1000  # mW to W
+        dev_power_used = (
+            pyhgml.hgmlDeviceGetPowerUsage(dev) // 1000
+        )  # mW to W
+
     dev_mig_mode = pyhgml.HGML_DEVICE_MIG_DISABLE
     with contextlib.suppress(pyhgml.HGMLError):
         dev_mig_mode, _ = pyhgml.hgmlDeviceGetMigMode(dev)

+    dev_index = dev_idx
+    if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+        dev_index = pyhgml.hgmlDeviceGetMinorNumber(dev)
+
     # With MIG disabled, treat as a single device.

     if dev_mig_mode == pyhgml.HGML_DEVICE_MIG_DISABLE:
-        dev_index = dev_idx
-        if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
-            dev_index = pyhgml.hgmlDeviceGetMinorNumber(dev)
-
         dev_name = pyhgml.hgmlDeviceGetName(dev)

         dev_uuid = pyhgml.hgmlDeviceGetUUID(dev)

@@ -171,6 +188,7 @@ class THeadDetector(Detector):

         dev_mem = 0
         dev_mem_used = 0
+        dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
         with contextlib.suppress(pyhgml.HGMLError):
             dev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(dev)
             dev_mem = byte_to_mebibyte(  # byte to MiB

@@ -179,22 +197,14 @@ class THeadDetector(Detector):
             dev_mem_used = byte_to_mebibyte(  # byte to MiB
                 dev_mem_info.used,
             )
-
-        dev_temp = None
-        with contextlib.suppress(pyhgml.HGMLError):
-            dev_temp = pyhgml.hgmlDeviceGetTemperature(
+            dev_mem_ecc_errors = pyhgml.hgmlDeviceGetMemoryErrorCounter(
                 dev,
-                pyhgml.HGML_TEMPERATURE_GPU,
+                pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                pyhgml.HGML_VOLATILE_ECC,
+                pyhgml.HGML_MEMORY_LOCATION_DRAM,
             )
-
-        dev_power = None
-        dev_power_used = None
-        with contextlib.suppress(pyhgml.HGMLError):
-            dev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(dev)
-            dev_power = dev_power // 1000  # mW to W
-            dev_power_used = (
-                pyhgml.hgmlDeviceGetPowerUsage(dev) // 1000
-            )  # mW to W
+            if dev_mem_ecc_errors > 0:
+                dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY

         dev_is_vgpu = False
         if dev_bdf:

@@ -221,6 +231,7 @@ class THeadDetector(Detector):
                 memory=dev_mem,
                 memory_used=dev_mem_used,
                 memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                memory_status=dev_mem_status,
                 temperature=dev_temp,
                 power=dev_power,
                 power_used=dev_power_used,

@@ -236,35 +247,34 @@ class THeadDetector(Detector):
         mdev_cores = None
         mdev_count = pyhgml.hgmlDeviceGetMaxMigDeviceCount(dev)
         for mdev_idx in range(mdev_count):
-            mdev =
+            mdev = None
+            with contextlib.suppress(pyhgml.HGMLError):
+                mdev = pyhgml.hgmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+            if not mdev:
+                continue

-            mdev_index = mdev_idx
+            mdev_index = mdev_idx + dev_count * (dev_idx + 1)
             mdev_uuid = pyhgml.hgmlDeviceGetUUID(mdev)

-            mdev_mem
+            mdev_mem = 0
+            mdev_mem_used = 0
+            mdev_mem_status = DeviceMemoryStatusEnum.HEALTHY
             with contextlib.suppress(pyhgml.HGMLError):
                 mdev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(mdev)
-                byte_to_mebibyte(  # byte to MiB
+                mdev_mem = byte_to_mebibyte(  # byte to MiB
                     mdev_mem_info.total,
                 )
-                byte_to_mebibyte(  # byte to MiB
+                mdev_mem_used = byte_to_mebibyte(  # byte to MiB
                     mdev_mem_info.used,
                 )
-
-                mdev_temp = pyhgml.hgmlDeviceGetTemperature(
-                    mdev,
-                    pyhgml.HGML_TEMPERATURE_GPU,
-                )
-
-                mdev_power = None
-                with contextlib.suppress(pyhgml.HGMLError):
-                    mdev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(
+                mdev_mem_ecc_errors = pyhgml.hgmlDeviceGetMemoryErrorCounter(
                     mdev,
+                    pyhgml.HGML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                    pyhgml.HGML_AGGREGATE_ECC,
+                    pyhgml.HGML_MEMORY_LOCATION_SRAM,
                 )
-
-
-                    pyhgml.hgmlDeviceGetPowerUsage(mdev) // 1000
-                )  # mW to W
+                if mdev_mem_ecc_errors > 0:
+                    mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY

             mdev_appendix = {
                 "vgpu": True,

@@ -279,63 +289,64 @@ class THeadDetector(Detector):

             mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if dev_gi_prf.id != mdev_gi_info.profileId:
-                continue
-            except pyhgml.HGMLError:
+            mdev_gi = pyhgml.hgmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
+            mdev_ci = pyhgml.hgmlGpuInstanceGetComputeInstanceById(
+                mdev_gi,
+                mdev_ci_id,
+            )
+            mdev_gi_info = pyhgml.hgmlGpuInstanceGetInfo(mdev_gi)
+            mdev_ci_info = pyhgml.hgmlComputeInstanceGetInfo(mdev_ci)
+            for dev_gi_prf_id in range(
+                pyhgml.HGML_GPU_INSTANCE_PROFILE_COUNT,
+            ):
+                try:
+                    dev_gi_prf = pyhgml.hgmlDeviceGetGpuInstanceProfileInfo(
+                        dev,
+                        dev_gi_prf_id,
+                    )
+                    if dev_gi_prf.id != mdev_gi_info.profileId:
                         continue
+                except pyhgml.HGMLError:
+                    continue

-
-
+                for dev_ci_prf_id in range(
+                    pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_COUNT,
+                ):
+                    for dev_cig_prf_id in range(
+                        pyhgml.HGML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
                     ):
-
-                        pyhgml.
-
-                        try:
-                            mdev_ci_prf = pyhgml.hgmlGpuInstanceGetComputeInstanceProfileInfo(
-                                mdev_gi,
-                                dev_ci_prf_id,
-                                dev_cig_prf_id,
-                            )
-                            if mdev_ci_prf.id != mdev_ci_info.profileId:
-                                continue
-                        except pyhgml.HGMLError:
-                            continue
-
-                        ci_slice = _get_compute_instance_slice(
+                        try:
+                            mdev_ci_prf = pyhgml.hgmlGpuInstanceGetComputeInstanceProfileInfo(
+                                mdev_gi,
                                 dev_ci_prf_id,
+                                dev_cig_prf_id,
                             )
-
-
-
-
-                            )
+                            if mdev_ci_prf.id != mdev_ci_info.profileId:
+                                continue
+                        except pyhgml.HGMLError:
+                            continue

-
-
+                        ci_slice = _get_compute_instance_slice(dev_ci_prf_id)
+                        gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+                        if ci_slice == gi_slice:
+                            if hasattr(dev_gi_prf, "name"):
+                                mdev_name = dev_gi_prf.name
                             else:
-
-
+                                gi_mem = round(
+                                    math.ceil(dev_gi_prf.memorySizeMB >> 10),
                                 )
+                                mdev_name = f"{gi_slice}g.{gi_mem}gb"
+                        elif hasattr(mdev_ci_prf, "name"):
+                            mdev_name = mdev_ci_prf.name
+                        else:
+                            gi_mem = round(
+                                math.ceil(dev_gi_prf.memorySizeMB >> 10),
+                            )
+                            mdev_name = f"{ci_slice}u.{gi_slice}g.{gi_mem}gb"

-
+                        mdev_cores = mdev_ci_prf.multiprocessorCount

-
+                        break

             ret.append(
                 Device(

@@ -352,9 +363,10 @@ class THeadDetector(Detector):
                     memory=mdev_mem,
                     memory_used=mdev_mem_used,
                     memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
-
-
-
+                    memory_status=mdev_mem_status,
+                    temperature=dev_temp,
+                    power=dev_power,
+                    power_used=dev_power_used,
                     appendix=mdev_appendix,
                 ),
             )

@@ -392,11 +404,17 @@ class THeadDetector(Detector):
            devices_count=len(devices),
        )

+        get_links_cache = {}
+
        try:
            pyhgml.hgmlInit()

            for i, dev_i in enumerate(devices):
-
+                dev_i_bdf = dev_i.appendix.get("bdf")
+                if dev_i.appendix.get("vgpu", False):
+                    dev_i_handle = pyhgml.hgmlDeviceGetHandleByPciBusId(dev_i_bdf)
+                else:
+                    dev_i_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_i.uuid)

                # Get NUMA and CPU affinities.
                ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")

@@ -405,7 +423,12 @@ class THeadDetector(Detector):
                )

                # Get links state if applicable.
-                if
+                if dev_i_bdf in get_links_cache:
+                    dev_i_links_state = get_links_cache[dev_i_bdf]
+                else:
+                    dev_i_links_state = _get_links_state(dev_i_handle)
+                    get_links_cache[dev_i_bdf] = dev_i_links_state
+                if dev_i_links_state:
                    ret.appendices[i].update(dev_i_links_state)
                    # In practice, if a card has an active *Link,
                    # then other cards in the same machine should be interconnected with it through the *Link.

@@ -422,21 +445,30 @@ class THeadDetector(Detector):
                    if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                        continue

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    dev_j_bdf = dev_j.appendix.get("bdf")
+                    if dev_i_bdf == dev_j_bdf:
+                        distance = TopologyDistanceEnum.SELF
+                    else:
+                        if dev_j.appendix.get("vgpu", False):
+                            dev_j_handle = pyhgml.hgmlDeviceGetHandleByPciBusId(
+                                dev_j_bdf,
+                            )
+                        else:
+                            dev_j_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_j.uuid)
+
+                        distance = TopologyDistanceEnum.UNK
+                        try:
+                            distance = pyhgml.hgmlDeviceGetTopologyCommonAncestor(
+                                dev_i_handle,
+                                dev_j_handle,
+                            )
+                        except pyhgml.HGMLError:
+                            debug_log_exception(
+                                logger,
+                                "Failed to get distance between device %d and %d",
+                                dev_i.index,
+                                dev_j.index,
+                            )

                    ret.devices_distances[i][j] = distance
                    ret.devices_distances[j][i] = distance

@@ -655,30 +687,6 @@ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
    raise AttributeError(msg)


-def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
-    """
-    Compute the memory size of a MIG compute instance in GiB.
-
-    Args:
-        dev_mem:
-            The total memory info of the parent GPU device.
-        dev_gi_prf:
-            The profile info of the GPU instance.
-
-    Returns:
-        The memory size in GiB.
-
-    """
-    mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte
-
-    gib = round(
-        math.ceil(mem / dev_mem.total * 8)
-        / 8
-        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
-    )
-    return gib
-
-
 def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
    """
    Get the number of slice for a given Compute Instance Profile ID.
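The recurring pattern in the thead.py hunks above is: query the uncorrected ECC error counter for a device (or MIG device) and flip its memory status from HEALTHY to UNHEALTHY when the count is non-zero. Below is a standalone sketch of that rule; the enum here is a stand-in for the real DeviceMemoryStatusEnum imported from gpustack_runtime/detector/__types__.py.

# Standalone sketch of the ECC-to-memory-status mapping added in 0.1.42.
from enum import Enum


class DeviceMemoryStatusEnum(str, Enum):  # stand-in for the real enum
    HEALTHY = "healthy"
    UNHEALTHY = "unhealthy"


def memory_status_from_ecc(uncorrected_ecc_errors: int) -> DeviceMemoryStatusEnum:
    """Any uncorrected ECC error marks the device memory as unhealthy."""
    if uncorrected_ecc_errors > 0:
        return DeviceMemoryStatusEnum.UNHEALTHY
    return DeviceMemoryStatusEnum.HEALTHY


assert memory_status_from_ecc(0) is DeviceMemoryStatusEnum.HEALTHY
assert memory_status_from_ecc(3) is DeviceMemoryStatusEnum.UNHEALTHY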
gpustack_runtime/envs.py
CHANGED

@@ -246,7 +246,7 @@ if TYPE_CHECKING:
     GPUSTACK_RUNTIME_DOCKER_CDI_SPECS_GENERATE: bool = True
     """
     Generate CDI specifications during deployment when using CDI resource injection policy,
-    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to
+    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
     Works only when `GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY` is set to `CDI`.
     Using internal knowledge to generate the CDI specifications for deployer,
     if the output file conflicts with other tools generating CDI specifications(e.g., NVIDIA Container Toolkit),

@@ -283,7 +283,7 @@
     Resource injection policy for the Kubernetes deployer (e.g., Auto, Env, KDP).
     `Auto`: Automatically choose the resource injection policy based on the environment.
     `Env`: Injects resources using standard environment variable, depends on underlying Container Toolkit, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES`.
-    `KDP`: Injects resources using Kubernetes Device Plugin
+    `KDP`: Injects resources using Kubernetes Device Plugin.
     """
     GPUSTACK_RUNTIME_KUBERNETES_KDP_PER_DEVICE_MAX_ALLOCATIONS: int | None = None
     """

@@ -294,14 +294,14 @@
     """
     Device allocation policy for the Kubernetes Device Plugin (e.g., CDI, Env, Opaque).
     `Auto`: Automatically choose the device allocation policy based on the environment.
-    `Env`: Allocates devices using runtime-visible environment variables
-    `CDI`: Allocates devices using generated CDI specifications, making it easy to debug and troubleshoot; requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
+    `Env`: Allocates devices using runtime-visible environment variables, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES`; requires Container Toolkit support.
+    `CDI`: Allocates devices using generated CDI specifications, based on `GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_CDI`, making it easy to debug and troubleshoot; requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
     `Opaque`: Uses internal logic for allocation, which is convenient for deployment but difficult to troubleshoot.
     """
     GPUSTACK_RUNTIME_KUBERNETES_KDP_CDI_SPECS_GENERATE: bool = True
     """
     Generate CDI specifications during deployment,
-    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to
+    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
     Works only when `GPUSTACK_RUNTIME_KUBERNETES_KDP_DEVICE_ALLOCATION_POLICY` is set to `CDI`.
     Using internal knowledge to generate the CDI specifications for deployer,
     if the output file conflicts with other tools generating CDI specifications(e.g., NVIDIA Container Toolkit),

@@ -344,7 +344,7 @@
     GPUSTACK_RUNTIME_PODMAN_CDI_SPECS_GENERATE: bool = True
     """
     Generate CDI specifications during deployment,
-    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to
+    requires `GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY` to exist.
     Using internal knowledge to generate the CDI specifications for deployer,
     if the output file conflicts with other tools generating CDI specifications(e.g., NVIDIA Container Toolkit),
     please disable this and remove the output file manually.

@@ -577,6 +577,7 @@ variables: dict[str, Callable[[], Any]] = {
     "GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID": lambda: to_set(
         getenv(
             "GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID",
+            "NVIDIA_VISIBLE_DEVICES",
         ),
         sep=",",
     ),
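The last hunk gives GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID a default of NVIDIA_VISIBLE_DEVICES when the variable is unset. A simplified sketch of that fallback behaviour follows; getenv and to_set below are stand-ins for the module's own helpers, not the real implementations.

# Hedged sketch of the new default for the UUID-valued visible-devices key.
import os


def getenv(name, default=None):  # stand-in for envs.getenv
    return os.environ.get(name, default)


def to_set(value, sep=","):  # stand-in for envs.to_set
    return {v.strip() for v in value.split(sep) if v.strip()} if value else set()


uuid_env_keys = to_set(
    getenv(
        "GPUSTACK_RUNTIME_DEPLOY_RUNTIME_VISIBLE_DEVICES_VALUE_UUID",
        "NVIDIA_VISIBLE_DEVICES",  # new default introduced in 0.1.42
    ),
    sep=",",
)
print(uuid_env_keys)  # {'NVIDIA_VISIBLE_DEVICES'} unless the variable is set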
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpustack-runtime
-Version: 0.1.41.post2
+Version: 0.1.42
 Summary: GPUStack Runtime is library for detecting GPU resources and launching GPU workloads.
 Project-URL: Homepage, https://github.com/gpustack/runtime
 Project-URL: Bug Tracker, https://github.com/gpustack/gpustack/issues

@@ -16,7 +16,7 @@ Requires-Python: >=3.10
 Requires-Dist: argcomplete>=3.6.3
 Requires-Dist: cachetools>=5.5.2
 Requires-Dist: docker>=7.1.0
-Requires-Dist: gpustack-runner>=0.1.24.
+Requires-Dist: gpustack-runner>=0.1.24.post4
 Requires-Dist: grpc-interceptor>=0.15.4
 Requires-Dist: grpcio>=1.76.0
 Requires-Dist: kubernetes>=33.1.0
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD
CHANGED

@@ -1,51 +1,51 @@
 gpustack_runtime/__init__.py,sha256=Xw_PVWneitx-8QmW6sJQeymj6zVbEgEndGhIB_km6TI,186
 gpustack_runtime/__main__.py,sha256=O9yJKcN7vg0Ppgc13qesxHwST2wkH3ccOkTQXPWHnNA,3939
-gpustack_runtime/_version.py,sha256=
+gpustack_runtime/_version.py,sha256=Vvw3zQNp4FXvAmZn_g3ZyhEUudGslUaT_mMkqwv_Tdg,777
 gpustack_runtime/_version.pyi,sha256=A42NoSgcqEXVy2OeNm4LXC9CbyonbooYrSUBlPm2lGY,156
-gpustack_runtime/envs.py,sha256=
+gpustack_runtime/envs.py,sha256=gkr30NnIq3USzSYvbW6ipf8tKdbXx-BrDcY3-4Oc-Hg,38894
 gpustack_runtime/logging.py,sha256=wMPriPpOuVsuClsjMh0qwEPQKyJiJa89ggdDjqkk7i0,6934
 gpustack_runtime/cmds/__init__.py,sha256=-_X2O2lBn6KcdLGUzhL3lEjQC4_cwA36fvWDnFAgtVM,1382
 gpustack_runtime/cmds/__types__.py,sha256=TBnUWUqzTkDtJnsMv363kdw-H8fOf-XQYbOvrmQif-M,815
 gpustack_runtime/cmds/deployer.py,sha256=KvhPhU6ZW-UV6vLykI5adKI1ThgVFFJqWaII3n4OhL8,32846
-gpustack_runtime/cmds/detector.py,sha256=
+gpustack_runtime/cmds/detector.py,sha256=2ORRF53q3goyYRB4T8gu3X8u0VZ4v0xeEndJqtuktyQ,8872
 gpustack_runtime/cmds/images.py,sha256=7tb-D3G4yqLPkbS9aSuWI1bD3DYK8BLbPbgqac56blI,594
 gpustack_runtime/deployer/__init__.py,sha256=impMrmvkMjuCBthsn3QUz3LuwpmmNAymHJKJ2o6SZoc,16249
 gpustack_runtime/deployer/__patches__.py,sha256=cTBge8BT6IsY5MzETKY3kN28k3igYfNj7pcpgDzfDzw,17849
-gpustack_runtime/deployer/__types__.py,sha256=
+gpustack_runtime/deployer/__types__.py,sha256=J2YX8X7EYY_56_L9WL5YMmdsyJ572uOIhMoHCVjPaog,72469
 gpustack_runtime/deployer/__utils__.py,sha256=paQu2M1UeoSfQPsiskmAqJSiln-8qwibTssEoWFMLec,21109
-gpustack_runtime/deployer/docker.py,sha256=
-gpustack_runtime/deployer/kuberentes.py,sha256
-gpustack_runtime/deployer/podman.py,sha256=
+gpustack_runtime/deployer/docker.py,sha256=e48conm3gfu8dlwcIhvTvM5NhlhdgKlvk6Ix8xGYVeI,81448
+gpustack_runtime/deployer/kuberentes.py,sha256=-G7eYuqTDDi3T9u2Jqr6j0Ut-8vkP5u2lxzSyDx0EWM,86776
+gpustack_runtime/deployer/podman.py,sha256=9lo4AvXzD3HUteY17-Fuz9A0ItScPb_D1tweDgm7PVo,79090
 gpustack_runtime/deployer/cdi/__init__.py,sha256=2wHrxkud3GJokE3ytNc3jvjddemXkNuuz_oIKzxD3-I,4000
 gpustack_runtime/deployer/cdi/__types__.py,sha256=04DKvcogk7OoHS7TU2Bmht3VVMu7iOEBWTEOvxpHt4w,18399
-gpustack_runtime/deployer/cdi/__utils__.py,sha256=
+gpustack_runtime/deployer/cdi/__utils__.py,sha256=CAYUv76akZiHJYZO_VY0NXKhEI2jrP7G3OgvQa8Pg4U,4050
 gpustack_runtime/deployer/cdi/amd.py,sha256=-eq_SOlC56VX2QscZXvnoeffWSRindhr8zFZmaIcKrE,4082
 gpustack_runtime/deployer/cdi/ascend.py,sha256=lDs75a9--c0lM34xfJqu-_QbfWNFrf4zE-GXPKReBe4,4538
 gpustack_runtime/deployer/cdi/hygon.py,sha256=h6-vQfv03sgxYjMJAf_JOMq9cHFPaNjK1YbUYIiSXck,4117
 gpustack_runtime/deployer/cdi/iluvatar.py,sha256=6nNECZpU5IPP6-5l-O1rzU-ib-WcuwKvDg7ZV__1NE4,3650
 gpustack_runtime/deployer/cdi/metax.py,sha256=tmJBvr-n9pERAp-dXsa54qv6xmxt0rJoJwY36TFdoWk,4143
 gpustack_runtime/deployer/cdi/thead.py,sha256=SvIDKNYZx7FwMPTTxyJ2RRjlr9LXLN8BUYCUhidmiQk,3671
-gpustack_runtime/deployer/k8s/deviceplugin/__init__.py,sha256=
+gpustack_runtime/deployer/k8s/deviceplugin/__init__.py,sha256=kvjsDx_8kNt3h8a5MOx5A7qPvqRsk1amvFr_ZYDA1l0,10931
 gpustack_runtime/deployer/k8s/deviceplugin/__types__.py,sha256=LCkgPDZ64Mra7bo5jmtsAO2Ypbc4qK99lMl6R_nQhnY,3043
-gpustack_runtime/deployer/k8s/deviceplugin/plugin.py,sha256=
+gpustack_runtime/deployer/k8s/deviceplugin/plugin.py,sha256=20eUDvM_SBFCi5WDR3AfyDJpnL7CJxxcPdW4p626I_M,17671
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/__init__.py,sha256=3rOYmgDIIJ4idEtwgnumGStH7PaK-J7EYrOnLa9A-8o,118
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api.proto,sha256=rmB8RDe4LN5FCVkQ608uS-pl32mk5tt6iGe-g2lKtPs,7919
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.py,sha256=dNkzjTE-2y25q8NF0QRznNJ5r1-5ZxxJS598MHbjx98,45998
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2.pyi,sha256=lq1dbSgBYqJ7zyGfoKKHCyfr6R5vcCGzJxteeyQpbuI,8232
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/api_pb2_grpc.py,sha256=GM6EyCEFeyOjL0XOCisbcHurRoLKqKDUI5obsUyTxpE,17446
 gpustack_runtime/deployer/k8s/types/kubelet/deviceplugin/v1beta1/constants.py,sha256=tpNk3e_cvY67C9RwVsdTNl75YuNXBgsn53fSJIzeTR4,828
-gpustack_runtime/detector/__init__.py,sha256=
-gpustack_runtime/detector/__types__.py,sha256=
+gpustack_runtime/detector/__init__.py,sha256=2GaczS5lWLuPsighvq7oPhlPFQSUJfBKLAliXPfl4i0,8162
+gpustack_runtime/detector/__types__.py,sha256=ZiKuYZb0jeXqzyTCUWTDppoHnRmkaZuljAWbnR_ohX8,13626
 gpustack_runtime/detector/__utils__.py,sha256=QdLWXwsU1_EMxXG5Y29psqnttWJyXWMphHDjiC_6Byc,25153
-gpustack_runtime/detector/amd.py,sha256=
-gpustack_runtime/detector/ascend.py,sha256=
-gpustack_runtime/detector/cambricon.py,sha256=
-gpustack_runtime/detector/hygon.py,sha256=
-gpustack_runtime/detector/iluvatar.py,sha256=
-gpustack_runtime/detector/metax.py,sha256=
-gpustack_runtime/detector/mthreads.py,sha256=
-gpustack_runtime/detector/nvidia.py,sha256=
-gpustack_runtime/detector/thead.py,sha256=
+gpustack_runtime/detector/amd.py,sha256=ywJMDKFnmF2REdJc1F8_zGYK6O4K0o5xh5eqWU-X2EE,18294
+gpustack_runtime/detector/ascend.py,sha256=a6QRnJfXdyU2tyGiiUKy0fgsp6NF652Zr2fFQgVg1Xw,19470
+gpustack_runtime/detector/cambricon.py,sha256=5AXILG9NAMYiWjaLRZ5h8lXtFk7FLC7LB_aFQz0ZtYU,4102
+gpustack_runtime/detector/hygon.py,sha256=3AcHBlPXTFiH0JQ0VS_xZcqjX-FXy-cdle6Nc-rNj5w,12795
+gpustack_runtime/detector/iluvatar.py,sha256=klFl5H607w8ksTvYSt21QkHMRzzeg-TkJKfoh9CMzqc,10551
+gpustack_runtime/detector/metax.py,sha256=P24WiqK2Ngjpu6AQt0Fp1wEVNra2Xgs-C8JAAwpYews,10801
+gpustack_runtime/detector/mthreads.py,sha256=mwNdsc42nebnSJMPFo6ue1tbiOwHmvPw6dF2CrLwdIQ,12714
+gpustack_runtime/detector/nvidia.py,sha256=oD3HUPfYWXRIRZ87iidNTW2Tg8CTVNIJh8qW1Z3HBO4,34535
+gpustack_runtime/detector/thead.py,sha256=hIRtlZNPa7xzAT0W_2XgFiDVH3YHSGi8NqCdaFaqQcA,26818
 gpustack_runtime/detector/pyacl/__init__.py,sha256=UQjaBxP7nJNyzr08N8_lH-5wPtnFmUY9pyQhs6vIChU,16232
 gpustack_runtime/detector/pyamdgpu/__init__.py,sha256=x-UO07EpKEgfTLmXQOD6j9f6kibuvDC7riQFof3YGdw,8617
 gpustack_runtime/detector/pyamdsmi/__init__.py,sha256=800-khq2w6HLgXM12RkhcdvXBGeAJ4s1_TWJyHebCMk,955

@@ -55,13 +55,13 @@ gpustack_runtime/detector/pyhgml/__init__.py,sha256=Yp9s-QhHS4ck7Iq9kd4v6a4BruyJ
 gpustack_runtime/detector/pyhgml/libhgml.so,sha256=BPzGVBpzrMX1tSvbXddq8Q0Qhi8w-No2JXX8sRxTioI,2101640
 gpustack_runtime/detector/pyhgml/libuki.so,sha256=EE6v1vIYYT4FSDMMm9rSfAqwrwIPFD-4_6KtP51lSps,702352
 gpustack_runtime/detector/pyhsa/__init__.py,sha256=4DuGnBBMUVOCPa6vTx3XT5mffGrKk6M6CYbUWBoMTJ0,15792
-gpustack_runtime/detector/pyixml/__init__.py,sha256=
+gpustack_runtime/detector/pyixml/__init__.py,sha256=2YmNoYhcIvc4CbRZgORM9o-GKdQ6O05J-5L3JbMZdhA,163157
 gpustack_runtime/detector/pymxsml/__init__.py,sha256=YxfNHq7TWd7CpNroP45BGXhcWNpY_sXgVzNGtx68DII,45409
 gpustack_runtime/detector/pyrocmcore/__init__.py,sha256=rgwIdPS-7GG7_5luRMR1XG9QyNM3lJh5ryD7kfZqpWg,2523
-gpustack_runtime/detector/pyrocmsmi/__init__.py,sha256=
-gpustack_runtime/_version_appendix.py,sha256=
-gpustack_runtime-0.1.
-gpustack_runtime-0.1.
-gpustack_runtime-0.1.
-gpustack_runtime-0.1.
-gpustack_runtime-0.1.
+gpustack_runtime/detector/pyrocmsmi/__init__.py,sha256=Gk4pTadOMzLCZJvQJ2S1N_1ivogtYokfVPHj_9Y874Y,12286
+gpustack_runtime/_version_appendix.py,sha256=2B6zFAHFYbVzMJ1w6ZW4XpqNz2XaMa-cAueeeQ4OfJk,23
+gpustack_runtime-0.1.42.dist-info/METADATA,sha256=sUS5YnNvheiK-tDT-rGSzuOKrOIFIKKhUBH1Jxc7lPE,2358
+gpustack_runtime-0.1.42.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+gpustack_runtime-0.1.42.dist-info/entry_points.txt,sha256=bBO_61GxP6dIT74uZwbSDgW5Vt2pTePUS3CgjUJkUgg,68
+gpustack_runtime-0.1.42.dist-info/licenses/LICENSE,sha256=OiPibowBvB-NHV3TP_NOj18XNBlXcshXZFMpa3uvKVE,10362
+gpustack_runtime-0.1.42.dist-info/RECORD,,
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL
RENAMED
File without changes
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt
RENAMED
File without changes
{gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE
RENAMED
File without changes