gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +4 -2
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__init__.py +1 -1
- gpustack_runtime/deployer/cdi/__types__.py +2 -2
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/cdi/amd.py +6 -8
- gpustack_runtime/deployer/cdi/ascend.py +7 -9
- gpustack_runtime/deployer/cdi/hygon.py +6 -8
- gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
- gpustack_runtime/deployer/cdi/metax.py +6 -8
- gpustack_runtime/deployer/cdi/thead.py +6 -8
- gpustack_runtime/deployer/docker.py +133 -146
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
- gpustack_runtime/deployer/kuberentes.py +89 -108
- gpustack_runtime/deployer/podman.py +113 -120
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/__utils__.py +3 -0
- gpustack_runtime/detector/amd.py +32 -10
- gpustack_runtime/detector/ascend.py +67 -13
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +22 -3
- gpustack_runtime/detector/iluvatar.py +15 -7
- gpustack_runtime/detector/metax.py +16 -6
- gpustack_runtime/detector/mthreads.py +22 -8
- gpustack_runtime/detector/nvidia.py +148 -140
- gpustack_runtime/detector/pyacl/__init__.py +34 -14
- gpustack_runtime/detector/pydcmi/__init__.py +4 -2
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +145 -134
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
- gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
- gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,17 +3,17 @@ from __future__ import annotations as __future_annotations__
|
|
|
3
3
|
import contextlib
|
|
4
4
|
import logging
|
|
5
5
|
import math
|
|
6
|
+
import re
|
|
6
7
|
import time
|
|
7
8
|
from _ctypes import byref
|
|
8
9
|
from functools import lru_cache
|
|
9
10
|
from pathlib import Path
|
|
10
|
-
from typing import re
|
|
11
11
|
|
|
12
12
|
import pynvml
|
|
13
13
|
|
|
14
14
|
from .. import envs
|
|
15
15
|
from ..logging import debug_log_exception, debug_log_warning
|
|
16
|
-
from . import Topology, pycuda
|
|
16
|
+
from . import DeviceMemoryStatusEnum, Topology, pycuda
|
|
17
17
|
from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
|
|
18
18
|
from .__utils__ import (
|
|
19
19
|
PCIDevice,
|
|
@@ -78,7 +78,7 @@ class NVIDIADetector(Detector):
|
|
|
78
78
|
def __init__(self):
|
|
79
79
|
super().__init__(ManufacturerEnum.NVIDIA)
|
|
80
80
|
|
|
81
|
-
def detect(self) -> Devices | None:
|
|
81
|
+
def detect(self) -> Devices | None: # noqa: PLR0915
|
|
82
82
|
"""
|
|
83
83
|
Detect NVIDIA GPUs using pynvml.
|
|
84
84
|
|
|
@@ -134,12 +134,29 @@ class NVIDIADetector(Detector):
|
|
|
134
134
|
|
|
135
135
|
dev_numa = get_numa_node_by_bdf(dev_bdf)
|
|
136
136
|
if not dev_numa:
|
|
137
|
-
|
|
137
|
+
with contextlib.suppress(pynvml.NVMLError):
|
|
138
|
+
dev_node_affinity = pynvml.nvmlDeviceGetMemoryAffinity(
|
|
139
|
+
dev,
|
|
140
|
+
get_numa_nodeset_size(),
|
|
141
|
+
pynvml.NVML_AFFINITY_SCOPE_NODE,
|
|
142
|
+
)
|
|
143
|
+
dev_numa = bitmask_to_str(list(dev_node_affinity))
|
|
144
|
+
|
|
145
|
+
dev_temp = None
|
|
146
|
+
with contextlib.suppress(pynvml.NVMLError):
|
|
147
|
+
dev_temp = pynvml.nvmlDeviceGetTemperature(
|
|
138
148
|
dev,
|
|
139
|
-
|
|
140
|
-
pynvml.NVML_AFFINITY_SCOPE_NODE,
|
|
149
|
+
pynvml.NVML_TEMPERATURE_GPU,
|
|
141
150
|
)
|
|
142
|
-
|
|
151
|
+
|
|
152
|
+
dev_power = None
|
|
153
|
+
dev_power_used = None
|
|
154
|
+
with contextlib.suppress(pynvml.NVMLError):
|
|
155
|
+
dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
|
|
156
|
+
dev_power = dev_power // 1000 # mW to W
|
|
157
|
+
dev_power_used = (
|
|
158
|
+
pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
|
|
159
|
+
) # mW to W
|
|
143
160
|
|
|
144
161
|
dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
|
|
145
162
|
with contextlib.suppress(pynvml.NVMLError):
|
|
@@ -180,6 +197,7 @@ class NVIDIADetector(Detector):
|
|
|
180
197
|
|
|
181
198
|
dev_mem = 0
|
|
182
199
|
dev_mem_used = 0
|
|
200
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
183
201
|
with contextlib.suppress(pynvml.NVMLError):
|
|
184
202
|
dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
|
|
185
203
|
dev_mem = byte_to_mebibyte( # byte to MiB
|
|
@@ -188,24 +206,16 @@ class NVIDIADetector(Detector):
|
|
|
188
206
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
189
207
|
dev_mem_info.used,
|
|
190
208
|
)
|
|
191
|
-
|
|
192
|
-
dev_mem, dev_mem_used = get_memory()
|
|
193
|
-
|
|
194
|
-
dev_temp = None
|
|
195
|
-
with contextlib.suppress(pynvml.NVMLError):
|
|
196
|
-
dev_temp = pynvml.nvmlDeviceGetTemperature(
|
|
209
|
+
dev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
|
|
197
210
|
dev,
|
|
198
|
-
pynvml.
|
|
211
|
+
pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
|
212
|
+
pynvml.NVML_VOLATILE_ECC,
|
|
213
|
+
pynvml.NVML_MEMORY_LOCATION_DRAM,
|
|
199
214
|
)
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
|
|
205
|
-
dev_power = dev_power // 1000 # mW to W
|
|
206
|
-
dev_power_used = (
|
|
207
|
-
pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
|
|
208
|
-
) # mW to W
|
|
215
|
+
if dev_mem_ecc_errors > 0:
|
|
216
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
217
|
+
if dev_mem == 0:
|
|
218
|
+
dev_mem, dev_mem_used = get_memory()
|
|
209
219
|
|
|
210
220
|
dev_is_vgpu = False
|
|
211
221
|
if dev_bdf in pci_devs:
|
|
@@ -215,8 +225,9 @@ class NVIDIADetector(Detector):
|
|
|
215
225
|
"arch_family": _get_arch_family(dev_cc_t),
|
|
216
226
|
"vgpu": dev_is_vgpu,
|
|
217
227
|
"bdf": dev_bdf,
|
|
218
|
-
"numa": dev_numa,
|
|
219
228
|
}
|
|
229
|
+
if dev_numa:
|
|
230
|
+
dev_appendix["numa"] = dev_numa
|
|
220
231
|
|
|
221
232
|
if dev_fabric_info := _get_fabric_info(dev):
|
|
222
233
|
dev_appendix.update(dev_fabric_info)
|
|
@@ -236,6 +247,7 @@ class NVIDIADetector(Detector):
|
|
|
236
247
|
memory=dev_mem,
|
|
237
248
|
memory_used=dev_mem_used,
|
|
238
249
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
250
|
+
memory_status=dev_mem_status,
|
|
239
251
|
temperature=dev_temp,
|
|
240
252
|
power=dev_power,
|
|
241
253
|
power_used=dev_power_used,
|
|
@@ -254,12 +266,18 @@ class NVIDIADetector(Detector):
|
|
|
254
266
|
mdev_cores = None
|
|
255
267
|
mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
|
|
256
268
|
for mdev_idx in range(mdev_count):
|
|
257
|
-
mdev =
|
|
269
|
+
mdev = None
|
|
270
|
+
with contextlib.suppress(pynvml.NVMLError):
|
|
271
|
+
mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
|
|
272
|
+
if not mdev:
|
|
273
|
+
continue
|
|
258
274
|
|
|
259
|
-
mdev_index = mdev_idx
|
|
275
|
+
mdev_index = mdev_idx + dev_count * (dev_idx + 1)
|
|
260
276
|
mdev_uuid = pynvml.nvmlDeviceGetUUID(mdev)
|
|
261
277
|
|
|
262
|
-
mdev_mem
|
|
278
|
+
mdev_mem = 0
|
|
279
|
+
mdev_mem_used = 0
|
|
280
|
+
mdev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
263
281
|
with contextlib.suppress(pynvml.NVMLError):
|
|
264
282
|
mdev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(mdev)
|
|
265
283
|
mdev_mem = byte_to_mebibyte( # byte to MiB
|
|
@@ -268,28 +286,22 @@ class NVIDIADetector(Detector):
|
|
|
268
286
|
mdev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
269
287
|
mdev_mem_info.used,
|
|
270
288
|
)
|
|
271
|
-
|
|
272
|
-
mdev_temp = pynvml.nvmlDeviceGetTemperature(
|
|
273
|
-
mdev,
|
|
274
|
-
pynvml.NVML_TEMPERATURE_GPU,
|
|
275
|
-
)
|
|
276
|
-
|
|
277
|
-
mdev_power = None
|
|
278
|
-
with contextlib.suppress(pynvml.NVMLError):
|
|
279
|
-
mdev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(
|
|
289
|
+
mdev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
|
|
280
290
|
mdev,
|
|
291
|
+
pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
|
292
|
+
pynvml.NVML_AGGREGATE_ECC,
|
|
293
|
+
pynvml.NVML_MEMORY_LOCATION_SRAM,
|
|
281
294
|
)
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
|
|
285
|
-
) # mW to W
|
|
295
|
+
if mdev_mem_ecc_errors > 0:
|
|
296
|
+
mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
286
297
|
|
|
287
298
|
mdev_appendix = {
|
|
288
299
|
"arch_family": _get_arch_family(dev_cc_t),
|
|
289
300
|
"vgpu": True,
|
|
290
301
|
"bdf": dev_bdf,
|
|
291
|
-
"numa": dev_numa,
|
|
292
302
|
}
|
|
303
|
+
if dev_numa:
|
|
304
|
+
mdev_appendix["numa"] = dev_numa
|
|
293
305
|
|
|
294
306
|
mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
|
|
295
307
|
mdev_appendix["gpu_instance_id"] = mdev_gi_id
|
|
@@ -305,71 +317,70 @@ class NVIDIADetector(Detector):
|
|
|
305
317
|
|
|
306
318
|
mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
|
|
307
319
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
if dev_gi_prf.id != mdev_gi_info.profileId:
|
|
325
|
-
continue
|
|
326
|
-
except pynvml.NVMLError:
|
|
320
|
+
mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
|
|
321
|
+
mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
|
|
322
|
+
mdev_gi,
|
|
323
|
+
mdev_ci_id,
|
|
324
|
+
)
|
|
325
|
+
mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
|
|
326
|
+
mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
|
|
327
|
+
for dev_gi_prf_id in range(
|
|
328
|
+
pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
|
|
329
|
+
):
|
|
330
|
+
try:
|
|
331
|
+
dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
|
|
332
|
+
dev,
|
|
333
|
+
dev_gi_prf_id,
|
|
334
|
+
)
|
|
335
|
+
if dev_gi_prf.id != mdev_gi_info.profileId:
|
|
327
336
|
continue
|
|
337
|
+
except pynvml.NVMLError:
|
|
338
|
+
continue
|
|
328
339
|
|
|
329
|
-
|
|
330
|
-
|
|
340
|
+
for dev_ci_prf_id in range(
|
|
341
|
+
pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
|
|
342
|
+
):
|
|
343
|
+
for dev_cig_prf_id in range(
|
|
344
|
+
pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
|
|
331
345
|
):
|
|
332
|
-
|
|
333
|
-
pynvml.
|
|
334
|
-
|
|
335
|
-
try:
|
|
336
|
-
mdev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
|
|
337
|
-
mdev_gi,
|
|
338
|
-
dev_ci_prf_id,
|
|
339
|
-
dev_cig_prf_id,
|
|
340
|
-
)
|
|
341
|
-
if mdev_ci_prf.id != mdev_ci_info.profileId:
|
|
342
|
-
continue
|
|
343
|
-
except pynvml.NVMLError:
|
|
344
|
-
continue
|
|
345
|
-
|
|
346
|
-
ci_slice = _get_compute_instance_slice(
|
|
346
|
+
try:
|
|
347
|
+
dev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
|
|
348
|
+
mdev_gi,
|
|
347
349
|
dev_ci_prf_id,
|
|
350
|
+
dev_cig_prf_id,
|
|
348
351
|
)
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
)
|
|
354
|
-
gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
|
|
355
|
-
gi_neg_attrs = _get_gpu_instance_negattrs(
|
|
356
|
-
dev_gi_prf_id,
|
|
357
|
-
)
|
|
352
|
+
if dev_ci_prf.id != mdev_ci_info.profileId:
|
|
353
|
+
continue
|
|
354
|
+
except pynvml.NVMLError:
|
|
355
|
+
continue
|
|
358
356
|
|
|
359
|
-
|
|
360
|
-
|
|
357
|
+
ci_slice = _get_compute_instance_slice(dev_ci_prf_id)
|
|
358
|
+
gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
|
|
359
|
+
if ci_slice == gi_slice:
|
|
360
|
+
if hasattr(dev_gi_prf, "name"):
|
|
361
|
+
mdev_name = dev_gi_prf.name
|
|
361
362
|
else:
|
|
362
|
-
|
|
363
|
-
|
|
363
|
+
gi_mem = round(
|
|
364
|
+
math.ceil(dev_gi_prf.memorySizeMB >> 10),
|
|
364
365
|
)
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
366
|
+
mdev_name = f"{gi_slice}g.{gi_mem}gb"
|
|
367
|
+
elif hasattr(dev_ci_prf, "name"):
|
|
368
|
+
mdev_name = dev_ci_prf.name
|
|
369
|
+
else:
|
|
370
|
+
gi_mem = round(
|
|
371
|
+
math.ceil(dev_gi_prf.memorySizeMB >> 10),
|
|
372
|
+
)
|
|
373
|
+
mdev_name = f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
|
|
374
|
+
gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
|
|
375
|
+
if gi_attrs:
|
|
376
|
+
mdev_name += f"+{gi_attrs}"
|
|
377
|
+
gi_neg_attrs = _get_gpu_instance_negattrs(dev_gi_prf_id)
|
|
378
|
+
if gi_neg_attrs:
|
|
379
|
+
mdev_name += f"-{gi_neg_attrs}"
|
|
369
380
|
|
|
370
|
-
|
|
381
|
+
mdev_cores = dev_ci_prf.multiprocessorCount
|
|
371
382
|
|
|
372
|
-
|
|
383
|
+
break
|
|
373
384
|
|
|
374
385
|
ret.append(
|
|
375
386
|
Device(
|
|
@@ -386,9 +397,10 @@ class NVIDIADetector(Detector):
|
|
|
386
397
|
memory=mdev_mem,
|
|
387
398
|
memory_used=mdev_mem_used,
|
|
388
399
|
memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
400
|
+
memory_status=mdev_mem_status,
|
|
401
|
+
temperature=dev_temp,
|
|
402
|
+
power=dev_power,
|
|
403
|
+
power_used=dev_power_used,
|
|
392
404
|
appendix=mdev_appendix,
|
|
393
405
|
),
|
|
394
406
|
)
|
|
@@ -426,11 +438,17 @@ class NVIDIADetector(Detector):
|
|
|
426
438
|
devices_count=len(devices),
|
|
427
439
|
)
|
|
428
440
|
|
|
441
|
+
get_links_cache = {}
|
|
442
|
+
|
|
429
443
|
try:
|
|
430
444
|
pynvml.nvmlInit()
|
|
431
445
|
|
|
432
446
|
for i, dev_i in enumerate(devices):
|
|
433
|
-
|
|
447
|
+
dev_i_bdf = dev_i.appendix.get("bdf")
|
|
448
|
+
if dev_i.appendix.get("vgpu", False):
|
|
449
|
+
dev_i_handle = pynvml.nvmlDeviceGetHandleByPciBusId(dev_i_bdf)
|
|
450
|
+
else:
|
|
451
|
+
dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
|
|
434
452
|
|
|
435
453
|
# Get NUMA and CPU affinities.
|
|
436
454
|
ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
|
|
@@ -439,7 +457,12 @@ class NVIDIADetector(Detector):
|
|
|
439
457
|
)
|
|
440
458
|
|
|
441
459
|
# Get links state if applicable.
|
|
442
|
-
if
|
|
460
|
+
if dev_i_bdf in get_links_cache:
|
|
461
|
+
dev_i_links_state = get_links_cache[dev_i_bdf]
|
|
462
|
+
else:
|
|
463
|
+
dev_i_links_state = _get_links_state(dev_i_handle)
|
|
464
|
+
get_links_cache[dev_i_bdf] = dev_i_links_state
|
|
465
|
+
if dev_i_links_state:
|
|
443
466
|
ret.appendices[i].update(dev_i_links_state)
|
|
444
467
|
# In practice, if a card has an active *Link,
|
|
445
468
|
# then other cards in the same machine should be interconnected with it through the *Link.
|
|
@@ -456,21 +479,30 @@ class NVIDIADetector(Detector):
|
|
|
456
479
|
if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
|
|
457
480
|
continue
|
|
458
481
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
482
|
+
dev_j_bdf = dev_j.appendix.get("bdf")
|
|
483
|
+
if dev_i_bdf == dev_j_bdf:
|
|
484
|
+
distance = TopologyDistanceEnum.SELF
|
|
485
|
+
else:
|
|
486
|
+
if dev_j.appendix.get("vgpu", False):
|
|
487
|
+
dev_j_handle = pynvml.nvmlDeviceGetHandleByPciBusId(
|
|
488
|
+
dev_j_bdf,
|
|
489
|
+
)
|
|
490
|
+
else:
|
|
491
|
+
dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
|
|
492
|
+
|
|
493
|
+
distance = TopologyDistanceEnum.UNK
|
|
494
|
+
try:
|
|
495
|
+
distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
|
|
496
|
+
dev_i_handle,
|
|
497
|
+
dev_j_handle,
|
|
498
|
+
)
|
|
499
|
+
except pynvml.NVMLError:
|
|
500
|
+
debug_log_exception(
|
|
501
|
+
logger,
|
|
502
|
+
"Failed to get distance between device %d and %d",
|
|
503
|
+
dev_i.index,
|
|
504
|
+
dev_j.index,
|
|
505
|
+
)
|
|
474
506
|
|
|
475
507
|
ret.devices_distances[i][j] = distance
|
|
476
508
|
ret.devices_distances[j][i] = distance
|
|
@@ -767,30 +799,6 @@ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
|
|
|
767
799
|
raise AttributeError(msg)
|
|
768
800
|
|
|
769
801
|
|
|
770
|
-
def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
|
|
771
|
-
"""
|
|
772
|
-
Compute the memory size of a MIG compute instance in GiB.
|
|
773
|
-
|
|
774
|
-
Args:
|
|
775
|
-
dev_mem:
|
|
776
|
-
The total memory info of the parent GPU device.
|
|
777
|
-
dev_gi_prf:
|
|
778
|
-
The profile info of the GPU instance.
|
|
779
|
-
|
|
780
|
-
Returns:
|
|
781
|
-
The memory size in GiB.
|
|
782
|
-
|
|
783
|
-
"""
|
|
784
|
-
mem = dev_gi_prf.memorySizeMB * (1 << 20) # MiB to byte
|
|
785
|
-
|
|
786
|
-
gib = round(
|
|
787
|
-
math.ceil(mem / dev_mem.total * 8)
|
|
788
|
-
/ 8
|
|
789
|
-
* ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
|
|
790
|
-
)
|
|
791
|
-
return gib
|
|
792
|
-
|
|
793
|
-
|
|
794
802
|
def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
|
|
795
803
|
"""
|
|
796
804
|
Get the number of slice for a given Compute Instance Profile ID.
|
|
@@ -403,20 +403,21 @@ def _LoadAclLibrary():
|
|
|
403
403
|
locs = [
|
|
404
404
|
"libascendcl.so",
|
|
405
405
|
]
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
if ascend_path.exists():
|
|
413
|
-
locs.extend(
|
|
414
|
-
[
|
|
415
|
-
str(ascend_path / "runtime/lib64/libascendcl.so"),
|
|
416
|
-
str(ascend_path / "aarch64-linux/lib64/libascendcl.so"),
|
|
417
|
-
str(ascend_path / "x86_64-linux/lib64/libascendcl.so"),
|
|
418
|
-
]
|
|
406
|
+
for default_path in [
|
|
407
|
+
"/usr/local/Ascend/ascend-toolkit/latest",
|
|
408
|
+
"/usr/local/Ascend/cann",
|
|
409
|
+
]:
|
|
410
|
+
ascend_path = Path(
|
|
411
|
+
os.getenv("ASCEND_HOME_PATH", default_path),
|
|
419
412
|
)
|
|
413
|
+
if ascend_path.exists():
|
|
414
|
+
locs.extend(
|
|
415
|
+
[
|
|
416
|
+
str(ascend_path / "runtime/lib64/libascendcl.so"),
|
|
417
|
+
str(ascend_path / "aarch64-linux/lib64/libascendcl.so"),
|
|
418
|
+
str(ascend_path / "x86_64-linux/lib64/libascendcl.so"),
|
|
419
|
+
]
|
|
420
|
+
)
|
|
420
421
|
for loc in locs:
|
|
421
422
|
try:
|
|
422
423
|
aclLib = CDLL(loc)
|
|
@@ -439,7 +440,8 @@ def aclrtGetSocName():
|
|
|
439
440
|
fn = _aclGetFunctionPointer("aclrtGetSocName")
|
|
440
441
|
fn.restype = c_char_p
|
|
441
442
|
c_version = fn()
|
|
442
|
-
|
|
443
|
+
if c_version is not None:
|
|
444
|
+
return c_version.decode()
|
|
443
445
|
|
|
444
446
|
return None
|
|
445
447
|
|
|
@@ -456,3 +458,21 @@ def aclsysGetCANNVersion(package_name=ACL_PKG_NAME_CANN):
|
|
|
456
458
|
return f"{c_version.version}".lower()
|
|
457
459
|
|
|
458
460
|
return None
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def aclsysGetVersion():
|
|
464
|
+
cann_version = aclsysGetCANNVersion()
|
|
465
|
+
if cann_version is not None:
|
|
466
|
+
return cann_version
|
|
467
|
+
|
|
468
|
+
with contextlib.suppress(ACLError):
|
|
469
|
+
_LoadAclLibrary()
|
|
470
|
+
|
|
471
|
+
c_version = create_string_buffer(ACL_PKG_VERSION_MAX_SIZE)
|
|
472
|
+
package_name = b"runtime"
|
|
473
|
+
fn = _aclGetFunctionPointer("aclsysGetVersionStr")
|
|
474
|
+
ret = fn(package_name, c_version)
|
|
475
|
+
_aclCheckReturn(ret)
|
|
476
|
+
return c_version.value.decode().lower()
|
|
477
|
+
|
|
478
|
+
return None
|
|
@@ -135,8 +135,10 @@ DCMI_TOPO_TYPE_PHB = 2
|
|
|
135
135
|
DCMI_TOPO_TYPE_HCCS = 3
|
|
136
136
|
DCMI_TOPO_TYPE_PXB = 4
|
|
137
137
|
DCMI_TOPO_TYPE_PIX = 5
|
|
138
|
-
|
|
139
|
-
|
|
138
|
+
DCMI_TOPO_TYPE_SIO = 6
|
|
139
|
+
DCMI_TOPO_TYPE_HCCS_SW = 7
|
|
140
|
+
DCMI_TOPO_TYPE_BUTT = 8 # Unknown
|
|
141
|
+
DCMI_TOPO_TYOE_MAX = 9
|
|
140
142
|
|
|
141
143
|
|
|
142
144
|
## Error Codes ##
|
|
@@ -960,6 +960,14 @@ NVML_HOST_VGPU_MODE_SRIOV = 1
|
|
|
960
960
|
# GSP firmware
|
|
961
961
|
NVML_GSP_FIRMWARE_VERSION_BUF_SIZE = 0x40
|
|
962
962
|
|
|
963
|
+
# Health
|
|
964
|
+
IXML_HEALTH_SYSHUB_ERROR = 0x0000000000000001
|
|
965
|
+
IXML_HEALTH_MC_ERROR = 0x0000000000000002
|
|
966
|
+
IXML_HEALTH_ECC_ERROR = 0x0000000000000010
|
|
967
|
+
IXML_HEALTH_MEMORY_ERROR = 0x0000000000000020
|
|
968
|
+
IXML_HEALTH_PCIE_ERROR = 0x0000000000000040
|
|
969
|
+
IXML_HEALTH_OK = 0x0000000000000000
|
|
970
|
+
|
|
963
971
|
|
|
964
972
|
## Error Checking ##
|
|
965
973
|
class NVMLError(Exception):
|
|
@@ -5267,3 +5275,11 @@ def nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo):
|
|
|
5267
5275
|
ret = fn(device, gpuFabricInfo)
|
|
5268
5276
|
_nvmlCheckReturn(ret)
|
|
5269
5277
|
return ret
|
|
5278
|
+
|
|
5279
|
+
|
|
5280
|
+
def ixmlDeviceGetHealth(device):
|
|
5281
|
+
c_health = c_longlong()
|
|
5282
|
+
fn = _nvmlGetFunctionPointer("ixmlDeviceGetHealth")
|
|
5283
|
+
ret = fn(device, byref(c_health))
|
|
5284
|
+
_nvmlCheckReturn(ret)
|
|
5285
|
+
return c_health.value
|
|
@@ -393,3 +393,17 @@ def rsmi_is_p2p_accessible(device_a=0, device_b=0):
|
|
|
393
393
|
)
|
|
394
394
|
_rocmsmiCheckReturn(ret)
|
|
395
395
|
return c_accessible.value
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def rsmi_dev_ecc_count_get(device=0, gpu_block=None):
|
|
399
|
+
if gpu_block is None:
|
|
400
|
+
gpu_block = rsmi_gpu_block_t.RSMI_GPU_BLOCK_UMC
|
|
401
|
+
c_error_count = rsmi_error_count_t()
|
|
402
|
+
fn = _rocmsmiGetFunctionPointer("rsmi_dev_ecc_count_get")
|
|
403
|
+
ret = fn(
|
|
404
|
+
device,
|
|
405
|
+
gpu_block,
|
|
406
|
+
byref(c_error_count),
|
|
407
|
+
)
|
|
408
|
+
_rocmsmiCheckReturn(ret)
|
|
409
|
+
return c_error_count
|