gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +4 -2
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__init__.py +1 -1
  6. gpustack_runtime/deployer/cdi/__types__.py +2 -2
  7. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  8. gpustack_runtime/deployer/cdi/amd.py +6 -8
  9. gpustack_runtime/deployer/cdi/ascend.py +7 -9
  10. gpustack_runtime/deployer/cdi/hygon.py +6 -8
  11. gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
  12. gpustack_runtime/deployer/cdi/metax.py +6 -8
  13. gpustack_runtime/deployer/cdi/thead.py +6 -8
  14. gpustack_runtime/deployer/docker.py +133 -146
  15. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
  16. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
  17. gpustack_runtime/deployer/kuberentes.py +89 -108
  18. gpustack_runtime/deployer/podman.py +113 -120
  19. gpustack_runtime/detector/__init__.py +2 -0
  20. gpustack_runtime/detector/__types__.py +26 -0
  21. gpustack_runtime/detector/__utils__.py +3 -0
  22. gpustack_runtime/detector/amd.py +32 -10
  23. gpustack_runtime/detector/ascend.py +67 -13
  24. gpustack_runtime/detector/cambricon.py +3 -0
  25. gpustack_runtime/detector/hygon.py +22 -3
  26. gpustack_runtime/detector/iluvatar.py +15 -7
  27. gpustack_runtime/detector/metax.py +16 -6
  28. gpustack_runtime/detector/mthreads.py +22 -8
  29. gpustack_runtime/detector/nvidia.py +148 -140
  30. gpustack_runtime/detector/pyacl/__init__.py +34 -14
  31. gpustack_runtime/detector/pydcmi/__init__.py +4 -2
  32. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  33. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  34. gpustack_runtime/detector/thead.py +145 -134
  35. gpustack_runtime/envs.py +7 -6
  36. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
  37. gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
  38. gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
  39. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
  40. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
  41. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/nvidia.py
@@ -3,17 +3,17 @@ from __future__ import annotations as __future_annotations__
  import contextlib
  import logging
  import math
+ import re
  import time
  from _ctypes import byref
  from functools import lru_cache
  from pathlib import Path
- from typing import re

  import pynvml

  from .. import envs
  from ..logging import debug_log_exception, debug_log_warning
- from . import Topology, pycuda
+ from . import DeviceMemoryStatusEnum, Topology, pycuda
  from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
  from .__utils__ import (
  PCIDevice,
@@ -78,7 +78,7 @@ class NVIDIADetector(Detector):
  def __init__(self):
  super().__init__(ManufacturerEnum.NVIDIA)

- def detect(self) -> Devices | None:
+ def detect(self) -> Devices | None: # noqa: PLR0915
  """
  Detect NVIDIA GPUs using pynvml.

@@ -134,12 +134,29 @@ class NVIDIADetector(Detector):

  dev_numa = get_numa_node_by_bdf(dev_bdf)
  if not dev_numa:
- dev_node_affinity = pynvml.nvmlDeviceGetMemoryAffinity(
+ with contextlib.suppress(pynvml.NVMLError):
+ dev_node_affinity = pynvml.nvmlDeviceGetMemoryAffinity(
+ dev,
+ get_numa_nodeset_size(),
+ pynvml.NVML_AFFINITY_SCOPE_NODE,
+ )
+ dev_numa = bitmask_to_str(list(dev_node_affinity))
+
+ dev_temp = None
+ with contextlib.suppress(pynvml.NVMLError):
+ dev_temp = pynvml.nvmlDeviceGetTemperature(
  dev,
- get_numa_nodeset_size(),
- pynvml.NVML_AFFINITY_SCOPE_NODE,
+ pynvml.NVML_TEMPERATURE_GPU,
  )
- dev_numa = bitmask_to_str(list(dev_node_affinity))
+
+ dev_power = None
+ dev_power_used = None
+ with contextlib.suppress(pynvml.NVMLError):
+ dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
+ dev_power = dev_power // 1000 # mW to W
+ dev_power_used = (
+ pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
+ ) # mW to W

  dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
  with contextlib.suppress(pynvml.NVMLError):
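The change above wraps the NUMA-affinity, temperature, and power queries in contextlib.suppress(pynvml.NVMLError), so a probe that the driver or card does not support simply leaves the value unset instead of aborting detection. A minimal standalone sketch of that pattern with pynvml (device index 0 is an assumption for illustration):

```python
# Sketch: optional NVML queries guarded by contextlib.suppress, as in the hunk above.
import contextlib

import pynvml

pynvml.nvmlInit()
try:
    dev = pynvml.nvmlDeviceGetHandleByIndex(0)  # assumption: at least one NVIDIA GPU

    temperature = None
    with contextlib.suppress(pynvml.NVMLError):
        # Unsupported queries raise NVMLError; the value then simply stays None.
        temperature = pynvml.nvmlDeviceGetTemperature(dev, pynvml.NVML_TEMPERATURE_GPU)

    power_w = None
    with contextlib.suppress(pynvml.NVMLError):
        power_w = pynvml.nvmlDeviceGetPowerUsage(dev) // 1000  # mW to W

    print(temperature, power_w)
finally:
    pynvml.nvmlShutdown()
```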
@@ -180,6 +197,7 @@ class NVIDIADetector(Detector):

  dev_mem = 0
  dev_mem_used = 0
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
  with contextlib.suppress(pynvml.NVMLError):
  dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
  dev_mem = byte_to_mebibyte( # byte to MiB
@@ -188,24 +206,16 @@ class NVIDIADetector(Detector):
  dev_mem_used = byte_to_mebibyte( # byte to MiB
  dev_mem_info.used,
  )
- if dev_mem == 0:
- dev_mem, dev_mem_used = get_memory()
-
- dev_temp = None
- with contextlib.suppress(pynvml.NVMLError):
- dev_temp = pynvml.nvmlDeviceGetTemperature(
+ dev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
  dev,
- pynvml.NVML_TEMPERATURE_GPU,
+ pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+ pynvml.NVML_VOLATILE_ECC,
+ pynvml.NVML_MEMORY_LOCATION_DRAM,
  )
-
- dev_power = None
- dev_power_used = None
- with contextlib.suppress(pynvml.NVMLError):
- dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
- dev_power = dev_power // 1000 # mW to W
- dev_power_used = (
- pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
- ) # mW to W
+ if dev_mem_ecc_errors > 0:
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
+ if dev_mem == 0:
+ dev_mem, dev_mem_used = get_memory()

  dev_is_vgpu = False
  if dev_bdf in pci_devs:
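The new memory-health signal reads NVML's ECC counters: a non-zero uncorrected volatile DRAM error count marks the device memory as unhealthy. A self-contained sketch of the same check with pynvml (device index 0 and the plain string labels are illustrative; the package uses its DeviceMemoryStatusEnum):

```python
# Sketch: flag GPU memory as unhealthy when uncorrected volatile DRAM ECC errors are seen.
import contextlib

import pynvml

pynvml.nvmlInit()
try:
    dev = pynvml.nvmlDeviceGetHandleByIndex(0)
    status = "healthy"
    with contextlib.suppress(pynvml.NVMLError):  # ECC may be unsupported or disabled
        ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
            dev,
            pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,  # uncorrected, not corrected, errors
            pynvml.NVML_VOLATILE_ECC,                   # counts since the last driver reload
            pynvml.NVML_MEMORY_LOCATION_DRAM,           # device memory, not SRAM/caches
        )
        if ecc_errors > 0:
            status = "unhealthy"
    print(status)
finally:
    pynvml.nvmlShutdown()
```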
@@ -215,8 +225,9 @@ class NVIDIADetector(Detector):
  "arch_family": _get_arch_family(dev_cc_t),
  "vgpu": dev_is_vgpu,
  "bdf": dev_bdf,
- "numa": dev_numa,
  }
+ if dev_numa:
+ dev_appendix["numa"] = dev_numa

  if dev_fabric_info := _get_fabric_info(dev):
  dev_appendix.update(dev_fabric_info)
@@ -236,6 +247,7 @@ class NVIDIADetector(Detector):
  memory=dev_mem,
  memory_used=dev_mem_used,
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
+ memory_status=dev_mem_status,
  temperature=dev_temp,
  power=dev_power,
  power_used=dev_power_used,
@@ -254,12 +266,18 @@ class NVIDIADetector(Detector):
  mdev_cores = None
  mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
  for mdev_idx in range(mdev_count):
- mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+ mdev = None
+ with contextlib.suppress(pynvml.NVMLError):
+ mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+ if not mdev:
+ continue

- mdev_index = mdev_idx
+ mdev_index = mdev_idx + dev_count * (dev_idx + 1)
  mdev_uuid = pynvml.nvmlDeviceGetUUID(mdev)

- mdev_mem, mdev_mem_used = 0, 0
+ mdev_mem = 0
+ mdev_mem_used = 0
+ mdev_mem_status = DeviceMemoryStatusEnum.HEALTHY
  with contextlib.suppress(pynvml.NVMLError):
  mdev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(mdev)
  mdev_mem = byte_to_mebibyte( # byte to MiB
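MIG children are found by probing handle slots up to nvmlDeviceGetMaxMigDeviceCount and skipping slots that raise NVMLError, which is what the reworked loop does. A standalone sketch (parent GPU index 0 assumed):

```python
# Sketch: enumerate MIG devices under a parent GPU, skipping unpopulated handle slots.
import contextlib

import pynvml

pynvml.nvmlInit()
try:
    parent = pynvml.nvmlDeviceGetHandleByIndex(0)
    for slot in range(pynvml.nvmlDeviceGetMaxMigDeviceCount(parent)):
        mig = None
        with contextlib.suppress(pynvml.NVMLError):  # slot may be empty
            mig = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(parent, slot)
        if mig is None:
            continue
        print(slot, pynvml.nvmlDeviceGetUUID(mig))
finally:
    pynvml.nvmlShutdown()
```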
@@ -268,28 +286,22 @@ class NVIDIADetector(Detector):
  mdev_mem_used = byte_to_mebibyte( # byte to MiB
  mdev_mem_info.used,
  )
-
- mdev_temp = pynvml.nvmlDeviceGetTemperature(
- mdev,
- pynvml.NVML_TEMPERATURE_GPU,
- )
-
- mdev_power = None
- with contextlib.suppress(pynvml.NVMLError):
- mdev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(
+ mdev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
  mdev,
+ pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+ pynvml.NVML_AGGREGATE_ECC,
+ pynvml.NVML_MEMORY_LOCATION_SRAM,
  )
- mdev_power = mdev_power // 1000 # mW to W
- mdev_power_used = (
- pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
- ) # mW to W
+ if mdev_mem_ecc_errors > 0:
+ mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY

  mdev_appendix = {
  "arch_family": _get_arch_family(dev_cc_t),
  "vgpu": True,
  "bdf": dev_bdf,
- "numa": dev_numa,
  }
+ if dev_numa:
+ mdev_appendix["numa"] = dev_numa

  mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
  mdev_appendix["gpu_instance_id"] = mdev_gi_id
@@ -305,71 +317,70 @@ class NVIDIADetector(Detector):

  mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)

- if not mdev_name:
- mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
- mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
- mdev_gi,
- mdev_ci_id,
- )
- mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
- mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
- for dev_gi_prf_id in range(
- pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
- ):
- try:
- dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
- dev,
- dev_gi_prf_id,
- )
- if dev_gi_prf.id != mdev_gi_info.profileId:
- continue
- except pynvml.NVMLError:
+ mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
+ mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
+ mdev_gi,
+ mdev_ci_id,
+ )
+ mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
+ mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
+ for dev_gi_prf_id in range(
+ pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
+ ):
+ try:
+ dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
+ dev,
+ dev_gi_prf_id,
+ )
+ if dev_gi_prf.id != mdev_gi_info.profileId:
  continue
+ except pynvml.NVMLError:
+ continue

- for dev_ci_prf_id in range(
- pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
+ for dev_ci_prf_id in range(
+ pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
+ ):
+ for dev_cig_prf_id in range(
+ pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
  ):
- for dev_cig_prf_id in range(
- pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
- ):
- try:
- mdev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
- mdev_gi,
- dev_ci_prf_id,
- dev_cig_prf_id,
- )
- if mdev_ci_prf.id != mdev_ci_info.profileId:
- continue
- except pynvml.NVMLError:
- continue
-
- ci_slice = _get_compute_instance_slice(
+ try:
+ dev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
+ mdev_gi,
  dev_ci_prf_id,
+ dev_cig_prf_id,
  )
- gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
- gi_mem = _get_gpu_instance_memory(
- dev_mem_info,
- dev_gi_prf,
- )
- gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
- gi_neg_attrs = _get_gpu_instance_negattrs(
- dev_gi_prf_id,
- )
+ if dev_ci_prf.id != mdev_ci_info.profileId:
+ continue
+ except pynvml.NVMLError:
+ continue

- if ci_slice == gi_slice:
- mdev_name = f"{gi_slice}g.{gi_mem}gb"
+ ci_slice = _get_compute_instance_slice(dev_ci_prf_id)
+ gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+ if ci_slice == gi_slice:
+ if hasattr(dev_gi_prf, "name"):
+ mdev_name = dev_gi_prf.name
  else:
- mdev_name = (
- f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
+ gi_mem = round(
+ math.ceil(dev_gi_prf.memorySizeMB >> 10),
  )
- if gi_attrs:
- mdev_name += f"+{gi_attrs}"
- if gi_neg_attrs:
- mdev_name += f"-{gi_neg_attrs}"
+ mdev_name = f"{gi_slice}g.{gi_mem}gb"
+ elif hasattr(dev_ci_prf, "name"):
+ mdev_name = dev_ci_prf.name
+ else:
+ gi_mem = round(
+ math.ceil(dev_gi_prf.memorySizeMB >> 10),
+ )
+ mdev_name = f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
+ gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
+ if gi_attrs:
+ mdev_name += f"+{gi_attrs}"
+ gi_neg_attrs = _get_gpu_instance_negattrs(dev_gi_prf_id)
+ if gi_neg_attrs:
+ mdev_name += f"-{gi_neg_attrs}"

- mdev_cores = mdev_ci_prf.multiprocessorCount
+ mdev_cores = dev_ci_prf.multiprocessorCount

- break
+ break

  ret.append(
  Device(
@@ -386,9 +397,10 @@ class NVIDIADetector(Detector):
  memory=mdev_mem,
  memory_used=mdev_mem_used,
  memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
- temperature=mdev_temp,
- power=mdev_power,
- power_used=mdev_power_used,
+ memory_status=mdev_mem_status,
+ temperature=dev_temp,
+ power=dev_power,
+ power_used=dev_power_used,
  appendix=mdev_appendix,
  ),
  )
@@ -426,11 +438,17 @@ class NVIDIADetector(Detector):
  devices_count=len(devices),
  )

+ get_links_cache = {}
+
  try:
  pynvml.nvmlInit()

  for i, dev_i in enumerate(devices):
- dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
+ dev_i_bdf = dev_i.appendix.get("bdf")
+ if dev_i.appendix.get("vgpu", False):
+ dev_i_handle = pynvml.nvmlDeviceGetHandleByPciBusId(dev_i_bdf)
+ else:
+ dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)

  # Get NUMA and CPU affinities.
  ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
@@ -439,7 +457,12 @@
  )

  # Get links state if applicable.
- if dev_i_links_state := _get_links_state(dev_i_handle):
+ if dev_i_bdf in get_links_cache:
+ dev_i_links_state = get_links_cache[dev_i_bdf]
+ else:
+ dev_i_links_state = _get_links_state(dev_i_handle)
+ get_links_cache[dev_i_bdf] = dev_i_links_state
+ if dev_i_links_state:
  ret.appendices[i].update(dev_i_links_state)
  # In practice, if a card has an active *Link,
  # then other cards in the same machine should be interconnected with it through the *Link.
@@ -456,21 +479,30 @@ class NVIDIADetector(Detector):
  if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
  continue

- dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
-
- distance = TopologyDistanceEnum.UNK
- try:
- distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
- dev_i_handle,
- dev_j_handle,
- )
- except pynvml.NVMLError:
- debug_log_exception(
- logger,
- "Failed to get distance between device %d and %d",
- dev_i.index,
- dev_j.index,
- )
+ dev_j_bdf = dev_j.appendix.get("bdf")
+ if dev_i_bdf == dev_j_bdf:
+ distance = TopologyDistanceEnum.SELF
+ else:
+ if dev_j.appendix.get("vgpu", False):
+ dev_j_handle = pynvml.nvmlDeviceGetHandleByPciBusId(
+ dev_j_bdf,
+ )
+ else:
+ dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
+
+ distance = TopologyDistanceEnum.UNK
+ try:
+ distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
+ dev_i_handle,
+ dev_j_handle,
+ )
+ except pynvml.NVMLError:
+ debug_log_exception(
+ logger,
+ "Failed to get distance between device %d and %d",
+ dev_i.index,
+ dev_j.index,
+ )

  ret.devices_distances[i][j] = distance
  ret.devices_distances[j][i] = distance
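The distance recorded here comes from NVML's common-ancestor query, which reports how closely two devices are connected (same board, one PCIe switch, several switches, host bridge, NUMA node, or only the system interconnect). A rough standalone sketch; the label mapping is illustrative and not the package's TopologyDistanceEnum:

```python
# Sketch: pairwise GPU topology levels via NVML's common-ancestor query.
import pynvml

LEVELS = {
    pynvml.NVML_TOPOLOGY_INTERNAL: "same board",
    pynvml.NVML_TOPOLOGY_SINGLE: "single PCIe switch",
    pynvml.NVML_TOPOLOGY_MULTIPLE: "multiple PCIe switches",
    pynvml.NVML_TOPOLOGY_HOSTBRIDGE: "same host bridge",
    pynvml.NVML_TOPOLOGY_NODE: "same NUMA node",
    pynvml.NVML_TOPOLOGY_SYSTEM: "cross NUMA / system",
}

pynvml.nvmlInit()
try:
    count = pynvml.nvmlDeviceGetCount()
    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)]
    for i in range(count):
        for j in range(i + 1, count):
            try:
                level = pynvml.nvmlDeviceGetTopologyCommonAncestor(handles[i], handles[j])
                print(i, j, LEVELS.get(level, "unknown"))
            except pynvml.NVMLError:
                print(i, j, "unknown")  # query unsupported on this platform
finally:
    pynvml.nvmlShutdown()
```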
@@ -767,30 +799,6 @@ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
  raise AttributeError(msg)


- def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
- """
- Compute the memory size of a MIG compute instance in GiB.
-
- Args:
- dev_mem:
- The total memory info of the parent GPU device.
- dev_gi_prf:
- The profile info of the GPU instance.
-
- Returns:
- The memory size in GiB.
-
- """
- mem = dev_gi_prf.memorySizeMB * (1 << 20) # MiB to byte
-
- gib = round(
- math.ceil(mem / dev_mem.total * 8)
- / 8
- * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
- )
- return gib
-
-
  def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
  """
  Get the number of slice for a given Compute Instance Profile ID.
gpustack_runtime/detector/pyacl/__init__.py
@@ -403,20 +403,21 @@ def _LoadAclLibrary():
  locs = [
  "libascendcl.so",
  ]
- ascend_path = Path(
- os.getenv(
- "ASCEND_HOME_PATH",
- "/usr/local/Ascend/ascend-toolkit/latest",
- ),
- )
- if ascend_path.exists():
- locs.extend(
- [
- str(ascend_path / "runtime/lib64/libascendcl.so"),
- str(ascend_path / "aarch64-linux/lib64/libascendcl.so"),
- str(ascend_path / "x86_64-linux/lib64/libascendcl.so"),
- ]
+ for default_path in [
+ "/usr/local/Ascend/ascend-toolkit/latest",
+ "/usr/local/Ascend/cann",
+ ]:
+ ascend_path = Path(
+ os.getenv("ASCEND_HOME_PATH", default_path),
  )
+ if ascend_path.exists():
+ locs.extend(
+ [
+ str(ascend_path / "runtime/lib64/libascendcl.so"),
+ str(ascend_path / "aarch64-linux/lib64/libascendcl.so"),
+ str(ascend_path / "x86_64-linux/lib64/libascendcl.so"),
+ ]
+ )
  for loc in locs:
  try:
  aclLib = CDLL(loc)
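The loader now iterates well-known CANN install prefixes in addition to the ASCEND_HOME_PATH override, trying ctypes.CDLL on each candidate until one loads. A generic sketch of that pattern (the helper name load_first is made up for illustration; the paths mirror the hunk):

```python
# Sketch: try loading a shared library from an env override and known default prefixes.
import os
from ctypes import CDLL
from pathlib import Path


def load_first(lib_name: str, env_var: str, default_prefixes: list[str], subdirs: list[str]):
    candidates = [lib_name]  # rely on the system loader search path first
    for prefix in default_prefixes:
        root = Path(os.getenv(env_var, prefix))
        if root.exists():
            candidates.extend(str(root / sub / lib_name) for sub in subdirs)
    for loc in candidates:
        try:
            return CDLL(loc)
        except OSError:
            continue  # try the next candidate path
    return None


lib = load_first(
    "libascendcl.so",
    "ASCEND_HOME_PATH",
    ["/usr/local/Ascend/ascend-toolkit/latest", "/usr/local/Ascend/cann"],
    ["runtime/lib64", "aarch64-linux/lib64", "x86_64-linux/lib64"],
)
```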
@@ -439,7 +440,8 @@ def aclrtGetSocName():
  fn = _aclGetFunctionPointer("aclrtGetSocName")
  fn.restype = c_char_p
  c_version = fn()
- return c_version.decode()
+ if c_version is not None:
+ return c_version.decode()

  return None

@@ -456,3 +458,21 @@ def aclsysGetCANNVersion(package_name=ACL_PKG_NAME_CANN):
  return f"{c_version.version}".lower()

  return None
+
+
+ def aclsysGetVersion():
+ cann_version = aclsysGetCANNVersion()
+ if cann_version is not None:
+ return cann_version
+
+ with contextlib.suppress(ACLError):
+ _LoadAclLibrary()
+
+ c_version = create_string_buffer(ACL_PKG_VERSION_MAX_SIZE)
+ package_name = b"runtime"
+ fn = _aclGetFunctionPointer("aclsysGetVersionStr")
+ ret = fn(package_name, c_version)
+ _aclCheckReturn(ret)
+ return c_version.value.decode().lower()
+
+ return None
gpustack_runtime/detector/pydcmi/__init__.py
@@ -135,8 +135,10 @@ DCMI_TOPO_TYPE_PHB = 2
  DCMI_TOPO_TYPE_HCCS = 3
  DCMI_TOPO_TYPE_PXB = 4
  DCMI_TOPO_TYPE_PIX = 5
- DCMI_TOPO_TYPE_BUTT = 6 # Unknown
- DCMI_TOPO_TYOE_MAX = 7
+ DCMI_TOPO_TYPE_SIO = 6
+ DCMI_TOPO_TYPE_HCCS_SW = 7
+ DCMI_TOPO_TYPE_BUTT = 8 # Unknown
+ DCMI_TOPO_TYOE_MAX = 9


  ## Error Codes ##
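For reference, the topology-type codes above can be turned into readable labels by stripping the common prefix; a small sketch, assuming the vendored module is importable as gpustack_runtime.detector.pydcmi:

```python
# Sketch: derive readable labels for the DCMI topology-type codes defined above.
from gpustack_runtime.detector import pydcmi

TOPO_LABELS = {
    value: name.removeprefix("DCMI_TOPO_TYPE_")
    for name, value in vars(pydcmi).items()
    if name.startswith("DCMI_TOPO_TYPE_")
}
print(TOPO_LABELS)  # e.g. {3: 'HCCS', 6: 'SIO', 8: 'BUTT', ...}
```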
gpustack_runtime/detector/pyixml/__init__.py
@@ -960,6 +960,14 @@ NVML_HOST_VGPU_MODE_SRIOV = 1
  # GSP firmware
  NVML_GSP_FIRMWARE_VERSION_BUF_SIZE = 0x40

+ # Health
+ IXML_HEALTH_SYSHUB_ERROR = 0x0000000000000001
+ IXML_HEALTH_MC_ERROR = 0x0000000000000002
+ IXML_HEALTH_ECC_ERROR = 0x0000000000000010
+ IXML_HEALTH_MEMORY_ERROR = 0x0000000000000020
+ IXML_HEALTH_PCIE_ERROR = 0x0000000000000040
+ IXML_HEALTH_OK = 0x0000000000000000

  ## Error Checking ##
  class NVMLError(Exception):
@@ -5267,3 +5275,11 @@ def nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo):
  ret = fn(device, gpuFabricInfo)
  _nvmlCheckReturn(ret)
  return ret
+
+
+ def ixmlDeviceGetHealth(device):
+ c_health = c_longlong()
+ fn = _nvmlGetFunctionPointer("ixmlDeviceGetHealth")
+ ret = fn(device, byref(c_health))
+ _nvmlCheckReturn(ret)
+ return c_health.value
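The value returned by ixmlDeviceGetHealth is a bitmask over the IXML_HEALTH_* flags added earlier in this file. A decoding sketch; the import path, the pynvml-style init and handle calls, and the label strings are assumptions about how the vendored module is used:

```python
# Sketch: decode the health bitmask returned by ixmlDeviceGetHealth.
from gpustack_runtime.detector import pyixml as ixml

HEALTH_FLAGS = {
    ixml.IXML_HEALTH_SYSHUB_ERROR: "syshub error",
    ixml.IXML_HEALTH_MC_ERROR: "memory-controller error",
    ixml.IXML_HEALTH_ECC_ERROR: "ECC error",
    ixml.IXML_HEALTH_MEMORY_ERROR: "memory error",
    ixml.IXML_HEALTH_PCIE_ERROR: "PCIe error",
}

ixml.nvmlInit()  # assumed to mirror pynvml's init/shutdown API
try:
    dev = ixml.nvmlDeviceGetHandleByIndex(0)  # first Iluvatar device, as an example
    health = ixml.ixmlDeviceGetHealth(dev)
    if health == ixml.IXML_HEALTH_OK:
        print("healthy")
    else:
        print([label for bit, label in HEALTH_FLAGS.items() if health & bit])
finally:
    ixml.nvmlShutdown()
```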
gpustack_runtime/detector/pyrocmsmi/__init__.py
@@ -393,3 +393,17 @@ def rsmi_is_p2p_accessible(device_a=0, device_b=0):
  )
  _rocmsmiCheckReturn(ret)
  return c_accessible.value
+
+
+ def rsmi_dev_ecc_count_get(device=0, gpu_block=None):
+ if gpu_block is None:
+ gpu_block = rsmi_gpu_block_t.RSMI_GPU_BLOCK_UMC
+ c_error_count = rsmi_error_count_t()
+ fn = _rocmsmiGetFunctionPointer("rsmi_dev_ecc_count_get")
+ ret = fn(
+ device,
+ gpu_block,
+ byref(c_error_count),
+ )
+ _rocmsmiCheckReturn(ret)
+ return c_error_count
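The wrapper defaults to the UMC (memory-controller) block, whose uncorrectable error count is the usual memory-health signal. A usage sketch; the import path and the correctable_err/uncorrectable_err field names follow ROCm SMI's rsmi_error_count_t and are assumptions about the vendored struct:

```python
# Sketch: interpret the ECC counters returned by rsmi_dev_ecc_count_get.
# Assumes the ROCm SMI library has already been initialized by the detector.
from gpustack_runtime.detector import pyrocmsmi as rsmi

counts = rsmi.rsmi_dev_ecc_count_get(0)  # defaults to RSMI_GPU_BLOCK_UMC (device memory)
if counts.uncorrectable_err > 0:
    print("device memory unhealthy")
elif counts.correctable_err > 0:
    print("correctable ECC errors observed")
else:
    print("device memory healthy")
```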