gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +4 -2
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__init__.py +1 -1
  6. gpustack_runtime/deployer/cdi/__types__.py +2 -2
  7. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  8. gpustack_runtime/deployer/cdi/amd.py +6 -8
  9. gpustack_runtime/deployer/cdi/ascend.py +7 -9
  10. gpustack_runtime/deployer/cdi/hygon.py +6 -8
  11. gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
  12. gpustack_runtime/deployer/cdi/metax.py +6 -8
  13. gpustack_runtime/deployer/cdi/thead.py +6 -8
  14. gpustack_runtime/deployer/docker.py +133 -146
  15. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
  16. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
  17. gpustack_runtime/deployer/kuberentes.py +89 -108
  18. gpustack_runtime/deployer/podman.py +113 -120
  19. gpustack_runtime/detector/__init__.py +2 -0
  20. gpustack_runtime/detector/__types__.py +26 -0
  21. gpustack_runtime/detector/__utils__.py +3 -0
  22. gpustack_runtime/detector/amd.py +32 -10
  23. gpustack_runtime/detector/ascend.py +67 -13
  24. gpustack_runtime/detector/cambricon.py +3 -0
  25. gpustack_runtime/detector/hygon.py +22 -3
  26. gpustack_runtime/detector/iluvatar.py +15 -7
  27. gpustack_runtime/detector/metax.py +16 -6
  28. gpustack_runtime/detector/mthreads.py +22 -8
  29. gpustack_runtime/detector/nvidia.py +148 -140
  30. gpustack_runtime/detector/pyacl/__init__.py +34 -14
  31. gpustack_runtime/detector/pydcmi/__init__.py +4 -2
  32. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  33. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  34. gpustack_runtime/detector/thead.py +145 -134
  35. gpustack_runtime/envs.py +7 -6
  36. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
  37. gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
  38. gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
  39. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
  40. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
  41. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0
@@ -8,7 +8,14 @@ from pathlib import Path
8
8
  from .. import envs
9
9
  from ..logging import debug_log_exception, debug_log_warning
10
10
  from . import Topology, pyamdgpu, pyamdsmi, pyhsa, pyrocmcore, pyrocmsmi
11
- from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
11
+ from .__types__ import (
12
+ Detector,
13
+ Device,
14
+ DeviceMemoryStatusEnum,
15
+ Devices,
16
+ ManufacturerEnum,
17
+ TopologyDistanceEnum,
18
+ )
12
19
  from .__utils__ import (
13
20
  PCIDevice,
14
21
  byte_to_mebibyte,
@@ -165,20 +172,32 @@ class AMDDetector(Detector):
165
172
  )
166
173
  dev_cores_util = 0
167
174
 
168
- dev_mem = None
169
- dev_mem_used = None
175
+ dev_mem = 0
176
+ dev_mem_used = 0
177
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
170
178
  try:
171
179
  dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
172
180
  dev_mem = dev_gpu_vram_usage.get("vram_total")
173
181
  dev_mem_used = dev_gpu_vram_usage.get("vram_used")
182
+ dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count(
183
+ dev,
184
+ pyamdsmi.AmdSmiGpuBlock.UMC,
185
+ )
186
+ if dev_ecc_count.get("uncorrectable_count", 0) > 0:
187
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
174
188
  except pyamdsmi.AmdSmiException:
189
+ dev_mem = byte_to_mebibyte( # byte to MiB
190
+ pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
191
+ )
192
+ dev_mem_used = byte_to_mebibyte( # byte to MiB
193
+ pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
194
+ )
175
195
  with contextlib.suppress(pyrocmsmi.ROCMSMIError):
176
- dev_mem = byte_to_mebibyte( # byte to MiB
177
- pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
178
- )
179
- dev_mem_used = byte_to_mebibyte( # byte to MiB
180
- pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
196
+ dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
197
+ dev_idx,
181
198
  )
199
+ if dev_ecc_count.uncorrectable_err > 0:
200
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
182
201
 
183
202
  dev_power = None
184
203
  dev_power_used = None
@@ -201,14 +220,16 @@ class AMDDetector(Detector):
201
220
 
202
221
  dev_numa = get_numa_node_by_bdf(dev_bdf)
203
222
  if not dev_numa:
204
- dev_numa = str(pyamdsmi.amdsmi_topo_get_numa_node_number(dev))
223
+ with contextlib.suppress(pyamdsmi.AmdSmiException):
224
+ dev_numa = str(pyamdsmi.amdsmi_topo_get_numa_node_number(dev))
205
225
 
206
226
  dev_appendix = {
207
227
  "arch_family": _get_arch_family(dev_asic_family_id),
208
228
  "vgpu": dev_is_vgpu,
209
229
  "bdf": dev_bdf,
210
- "numa": dev_numa,
211
230
  }
231
+ if dev_numa:
232
+ dev_appendix["numa"] = dev_numa
212
233
  if dev_card_id is not None:
213
234
  dev_appendix["card_id"] = dev_card_id
214
235
  if dev_renderd_id is not None:
@@ -232,6 +253,7 @@ class AMDDetector(Detector):
232
253
  memory=dev_mem,
233
254
  memory_used=dev_mem_used,
234
255
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
256
+ memory_status=dev_mem_status,
235
257
  temperature=dev_temp,
236
258
  power=dev_power,
237
259
  power_used=dev_power_used,
@@ -10,6 +10,7 @@ from . import pyacl, pydcmi
10
10
  from .__types__ import (
11
11
  Detector,
12
12
  Device,
13
+ DeviceMemoryStatusEnum,
13
14
  Devices,
14
15
  ManufacturerEnum,
15
16
  Topology,
@@ -31,10 +32,12 @@ slogger = logger.getChild("internal")
31
32
  _TOPOLOGY_DISTANCE_MAPPING: dict[int, int] = {
32
33
  pydcmi.DCMI_TOPO_TYPE_SELF: TopologyDistanceEnum.SELF,
33
34
  pydcmi.DCMI_TOPO_TYPE_HCCS: TopologyDistanceEnum.LINK, # Traversing via high-speed interconnect, RoCE, etc.
35
+ pydcmi.DCMI_TOPO_TYPE_HCCS_SW: TopologyDistanceEnum.LINK, # Traversing via high-speed interconnect switch.
34
36
  pydcmi.DCMI_TOPO_TYPE_PIX: TopologyDistanceEnum.PIX, # Traversing via a single PCIe bridge.
35
37
  pydcmi.DCMI_TOPO_TYPE_PXB: TopologyDistanceEnum.PXB, # Traversing via multiple PCIe bridges without PCIe Host Bridge.
36
38
  pydcmi.DCMI_TOPO_TYPE_PHB: TopologyDistanceEnum.PHB, # Traversing via a PCIe Host Bridge.
37
39
  pydcmi.DCMI_TOPO_TYPE_SYS: TopologyDistanceEnum.SYS, # Traversing via SMP interconnect across other NUMA nodes.
40
+ pydcmi.DCMI_TOPO_TYPE_SIO: TopologyDistanceEnum.SYS, # Traversing via Super I/O or other slower interconnects.
38
41
  }
39
42
  """
40
43
  Mapping of Ascend topology types to distance values.
@@ -108,7 +111,7 @@ class AscendDetector(Detector):
108
111
 
109
112
  sys_driver_ver = pydcmi.dcmi_get_driver_version()
110
113
 
111
- sys_runtime_ver_original = pyacl.aclsysGetCANNVersion()
114
+ sys_runtime_ver_original = pyacl.aclsysGetVersion()
112
115
  sys_runtime_ver = get_brief_version(sys_runtime_ver_original)
113
116
 
114
117
  _, card_list = pydcmi.dcmi_get_card_list()
@@ -128,7 +131,9 @@ class AscendDetector(Detector):
128
131
  dev_is_vgpu = True
129
132
  dev_cores_aicore = dev_virt_info.query_info.computing.aic
130
133
  dev_name = dev_virt_info.query_info.name
131
- dev_mem, dev_mem_used = 0, 0
134
+ dev_mem = 0
135
+ dev_mem_used = 0
136
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
132
137
  if hasattr(dev_virt_info.query_info.computing, "memory_size"):
133
138
  dev_mem = dev_virt_info.query_info.computing.memory_size
134
139
  dev_index = dev_virt_info.vdev_id
@@ -143,6 +148,10 @@ class AscendDetector(Detector):
143
148
  dev_card_id,
144
149
  dev_device_id,
145
150
  )
151
+ dev_mem_status = _get_device_memory_status(
152
+ dev_card_id,
153
+ dev_device_id,
154
+ )
146
155
  dev_index = pydcmi.dcmi_get_device_logic_id(
147
156
  dev_card_id,
148
157
  dev_device_id,
@@ -191,13 +200,14 @@ class AscendDetector(Detector):
191
200
 
192
201
  dev_numa = get_numa_node_by_bdf(dev_bdf)
193
202
  if not dev_numa:
194
- dev_cpu_affinity = (
195
- pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
196
- dev_card_id,
197
- dev_device_id,
203
+ with contextlib.suppress(pydcmi.DCMIError):
204
+ dev_cpu_affinity = (
205
+ pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
206
+ dev_card_id,
207
+ dev_device_id,
208
+ )
198
209
  )
199
- )
200
- dev_numa = map_cpu_affinity_to_numa_node(dev_cpu_affinity)
210
+ dev_numa = map_cpu_affinity_to_numa_node(dev_cpu_affinity)
201
211
 
202
212
  dev_appendix = {
203
213
  "arch_family": (
@@ -206,11 +216,12 @@ class AscendDetector(Detector):
206
216
  ),
207
217
  "vgpu": dev_is_vgpu,
208
218
  "bdf": dev_bdf,
209
- "numa": dev_numa,
210
219
  "card_id": dev_card_id,
211
220
  "device_id": dev_device_id,
212
221
  "device_id_max": device_num_in_card - 1,
213
222
  }
223
+ if dev_numa:
224
+ dev_appendix["numa"] = dev_numa
214
225
 
215
226
  dev_roce_ip, dev_roce_mask, dev_roce_gateway = (
216
227
  _get_device_roce_network_info(
@@ -239,6 +250,7 @@ class AscendDetector(Detector):
239
250
  memory=dev_mem,
240
251
  memory_used=dev_mem_used,
241
252
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
253
+ memory_status=dev_mem_status,
242
254
  temperature=dev_temp,
243
255
  power_used=dev_power_used,
244
256
  appendix=dev_appendix,
@@ -332,6 +344,12 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
332
344
  """
333
345
  Get device memory information.
334
346
 
347
+ Args:
348
+ dev_card_id:
349
+ The card ID of the device.
350
+ dev_device_id:
351
+ The device ID of the device.
352
+
335
353
  Returns:
336
354
  A tuple containing total memory and used memory in MiB.
337
355
 
@@ -370,6 +388,37 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
370
388
  return dev_mem, dev_mem_used
371
389
 
372
390
 
391
+ def _get_device_memory_status(dev_card_id, dev_device_id) -> DeviceMemoryStatusEnum:
392
+ """
393
+ Get device memory ECC status.
394
+
395
+ Args:
396
+ dev_card_id:
397
+ The card ID of the device.
398
+ dev_device_id:
399
+ The device ID of the device.
400
+
401
+ Returns:
402
+ DeviceMemoryStatusEnum indicating the ECC status.
403
+
404
+ """
405
+ for dev_mem_type in [pydcmi.DCMI_DEVICE_TYPE_HBM, pydcmi.DCMI_DEVICE_TYPE_DDR]:
406
+ with contextlib.suppress(pydcmi.DCMIError):
407
+ dev_ecc_info = pydcmi.dcmi_get_device_ecc_info(
408
+ dev_card_id,
409
+ dev_device_id,
410
+ dev_mem_type,
411
+ )
412
+ if dev_ecc_info.enable_flag and (
413
+ dev_ecc_info.single_bit_error_cnt > 0
414
+ or dev_ecc_info.double_bit_error_cnt > 0
415
+ ):
416
+ return DeviceMemoryStatusEnum.UNHEALTHY
417
+ return DeviceMemoryStatusEnum.HEALTHY
418
+
419
+ return DeviceMemoryStatusEnum.HEALTHY
420
+
421
+
373
422
  def _get_device_roce_network_info(
374
423
  dev_card_id,
375
424
  dev_device_id,
@@ -395,7 +444,7 @@ def _get_device_roce_network_info(
395
444
  pydcmi.DCMI_PORT_TYPE_ROCE_PORT,
396
445
  )
397
446
  except pydcmi.DCMIError:
398
- debug_log_exception(logger, "Failed to get device roce network info")
447
+ debug_log_exception(logger, "Failed to get device RoCE network info")
399
448
 
400
449
  return ip, mask, gateway
401
450
 
@@ -456,12 +505,15 @@ _soc_name_version_mapping: dict[str, int] = {
456
505
  "Ascend310B3": 242,
457
506
  "Ascend310B4": 243,
458
507
  "Ascend910_9391": 250,
508
+ "Ascend910": 250,
459
509
  "Ascend910_9392": 251,
460
510
  "Ascend910_9381": 252,
461
511
  "Ascend910_9382": 253,
462
512
  "Ascend910_9372": 254,
463
513
  "Ascend910_9362": 255,
464
514
  "Ascend910_9579": 260,
515
+ "Ascend910_95": 260,
516
+ "Ascend950": 260,
465
517
  }
466
518
 
467
519
 
@@ -477,6 +529,8 @@ def _guess_soc_name_from_dev_name(dev_name: str) -> str | None:
477
529
  The guessed SoC name, or None if not found.
478
530
 
479
531
  """
532
+ if dev_name.startswith("Ascend"):
533
+ dev_name = dev_name[6:].strip()
480
534
  soc_name = f"Ascend{dev_name}"
481
535
  if soc_name in _soc_name_version_mapping:
482
536
  return soc_name
@@ -528,11 +582,11 @@ def get_ascend_cann_variant(name: str | None) -> str | None:
528
582
  if version < 220:
529
583
  return "310p"
530
584
  if version < 240:
531
- return "910b"
585
+ return "910b" # 910b/a2
532
586
  if version < 250:
533
587
  return "310b"
534
588
  if version < 260:
535
- return "a3" # 910c
589
+ return "a3" # 910c/a3
536
590
  if version < 270:
537
- return "a5" # 910d
591
+ return "a5" # 910d/a5
538
592
  return None
@@ -6,6 +6,7 @@ from functools import lru_cache
6
6
 
7
7
  from .. import envs
8
8
  from ..logging import debug_log_exception
9
+ from . import DeviceMemoryStatusEnum
9
10
  from .__types__ import Detector, Device, Devices, ManufacturerEnum
10
11
  from .__utils__ import (
11
12
  PCIDevice,
@@ -100,6 +101,7 @@ class CambriconDetector(Detector):
100
101
  dev_mem_usage_info = dev_info.get("PhysicalMemUsage", {})
101
102
  dev_mem = safe_int(dev_mem_usage_info.get("Total", 0))
102
103
  dev_mem_used = safe_int(dev_mem_usage_info.get("Used", 0))
104
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
103
105
 
104
106
  dev_temp_info = dev_info.get("Temperature", {})
105
107
  dev_temp = safe_float(dev_temp_info.get("Chip", 0))
@@ -118,6 +120,7 @@ class CambriconDetector(Detector):
118
120
  memory=dev_mem,
119
121
  memory_used=dev_mem_used,
120
122
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
123
+ memory_status=dev_mem_status,
121
124
  temperature=dev_temp,
122
125
  appendix=dev_appendix,
123
126
  ),
@@ -8,7 +8,14 @@ from pathlib import Path
8
8
  from .. import envs
9
9
  from ..logging import debug_log_exception, debug_log_warning
10
10
  from . import Topology, pyamdgpu, pyhsa, pyrocmcore, pyrocmsmi
11
- from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
11
+ from .__types__ import (
12
+ Detector,
13
+ Device,
14
+ DeviceMemoryStatusEnum,
15
+ Devices,
16
+ ManufacturerEnum,
17
+ TopologyDistanceEnum,
18
+ )
12
19
  from .__utils__ import (
13
20
  PCIDevice,
14
21
  byte_to_mebibyte,
@@ -149,6 +156,13 @@ class HygonDetector(Detector):
149
156
  dev_mem_used = byte_to_mebibyte( # byte to MiB
150
157
  pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
151
158
  )
159
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
160
+ with contextlib.suppress(pyrocmsmi.ROCMSMIError):
161
+ dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
162
+ dev_idx,
163
+ )
164
+ if dev_ecc_count.uncorrectable_err > 0:
165
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
152
166
 
153
167
  dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
154
168
  dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
@@ -157,13 +171,17 @@ class HygonDetector(Detector):
157
171
 
158
172
  dev_numa = get_numa_node_by_bdf(dev_bdf)
159
173
  if not dev_numa:
160
- dev_numa = str(pyrocmsmi.rsmi_topo_get_numa_node_number(dev_idx))
174
+ with contextlib.suppress(pyrocmsmi.ROCMSMIError):
175
+ dev_numa = str(
176
+ pyrocmsmi.rsmi_topo_get_numa_node_number(dev_idx),
177
+ )
161
178
 
162
179
  dev_appendix = {
163
180
  "vgpu": dev_is_vgpu,
164
181
  "bdf": dev_bdf,
165
- "numa": dev_numa,
166
182
  }
183
+ if dev_numa:
184
+ dev_appendix["numa"] = dev_numa
167
185
  if dev_card_id is not None:
168
186
  dev_appendix["card_id"] = dev_card_id
169
187
  if dev_renderd_id is not None:
@@ -184,6 +202,7 @@ class HygonDetector(Detector):
184
202
  memory=dev_mem,
185
203
  memory_used=dev_mem_used,
186
204
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
205
+ memory_status=dev_mem_status,
187
206
  temperature=dev_temp,
188
207
  power=dev_power,
189
208
  power_used=dev_power_used,
@@ -10,6 +10,7 @@ from . import pyixml
10
10
  from .__types__ import (
11
11
  Detector,
12
12
  Device,
13
+ DeviceMemoryStatusEnum,
13
14
  Devices,
14
15
  ManufacturerEnum,
15
16
  Topology,
@@ -135,6 +136,7 @@ class IluvatarDetector(Detector):
135
136
 
136
137
  dev_mem = 0
137
138
  dev_mem_used = 0
139
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
138
140
  with contextlib.suppress(pyixml.NVMLError):
139
141
  dev_mem_info = pyixml.nvmlDeviceGetMemoryInfo(dev)
140
142
  dev_mem = byte_to_mebibyte( # byte to MiB
@@ -143,6 +145,9 @@ class IluvatarDetector(Detector):
143
145
  dev_mem_used = byte_to_mebibyte( # byte to MiB
144
146
  dev_mem_info.used,
145
147
  )
148
+ dev_health = pyixml.ixmlDeviceGetHealth(dev)
149
+ if dev_health != pyixml.IXML_HEALTH_OK:
150
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
146
151
 
147
152
  dev_cores_util = None
148
153
  with contextlib.suppress(pyixml.NVMLError):
@@ -185,18 +190,20 @@ class IluvatarDetector(Detector):
185
190
 
186
191
  dev_numa = get_numa_node_by_bdf(dev_bdf)
187
192
  if not dev_numa:
188
- dev_node_affinity = pyixml.nvmlDeviceGetMemoryAffinity(
189
- dev,
190
- get_numa_nodeset_size(),
191
- pyixml.NVML_AFFINITY_SCOPE_NODE,
192
- )
193
- dev_numa = bitmask_to_str(list(dev_node_affinity))
193
+ with contextlib.suppress(pyixml.NVMLError):
194
+ dev_node_affinity = pyixml.nvmlDeviceGetMemoryAffinity(
195
+ dev,
196
+ get_numa_nodeset_size(),
197
+ pyixml.NVML_AFFINITY_SCOPE_NODE,
198
+ )
199
+ dev_numa = bitmask_to_str(list(dev_node_affinity))
194
200
 
195
201
  dev_appendix = {
196
202
  "vgpu": dev_is_vgpu,
197
203
  "bdf": dev_bdf,
198
- "numa": dev_numa,
199
204
  }
205
+ if dev_numa:
206
+ dev_appendix["numa"] = dev_numa
200
207
 
201
208
  ret.append(
202
209
  Device(
@@ -213,6 +220,7 @@ class IluvatarDetector(Detector):
213
220
  memory=dev_mem,
214
221
  memory_used=dev_mem_used,
215
222
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
223
+ memory_status=dev_mem_status,
216
224
  temperature=dev_temp,
217
225
  power=dev_power,
218
226
  power_used=dev_power_used,
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations as __future_annotations__
2
2
 
3
+ import contextlib
3
4
  import logging
4
5
  from functools import lru_cache
5
6
  from pathlib import Path
@@ -10,6 +11,7 @@ from . import pymxsml
10
11
  from .__types__ import (
11
12
  Detector,
12
13
  Device,
14
+ DeviceMemoryStatusEnum,
13
15
  Devices,
14
16
  ManufacturerEnum,
15
17
  Topology,
@@ -145,6 +147,11 @@ class MetaXDetector(Detector):
145
147
  dev_mem_used = kibibyte_to_mebibyte( # KiB to MiB
146
148
  dev_mem_info.vramUse,
147
149
  )
150
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
151
+ with contextlib.suppress(pymxsml.MXSMLError):
152
+ dev_ecc_errors = pymxsml.mxSmlGetTotalEccErrors(dev_idx)
153
+ if dev_ecc_errors.dramUE > 0:
154
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
148
155
 
149
156
  dev_temp = (
150
157
  pymxsml.mxSmlGetTemperatureInfo(
@@ -172,17 +179,19 @@ class MetaXDetector(Detector):
172
179
 
173
180
  dev_numa = get_numa_node_by_bdf(dev_bdf)
174
181
  if not dev_numa:
175
- dev_node_affinity = pymxsml.mxSmlGetNodeAffinity(
176
- dev_idx,
177
- get_numa_nodeset_size(),
178
- )
179
- dev_numa = bitmask_to_str(list(dev_node_affinity))
182
+ with contextlib.suppress(pymxsml.MXSMLError):
183
+ dev_node_affinity = pymxsml.mxSmlGetNodeAffinity(
184
+ dev_idx,
185
+ get_numa_nodeset_size(),
186
+ )
187
+ dev_numa = bitmask_to_str(list(dev_node_affinity))
180
188
 
181
189
  dev_appendix = {
182
190
  "vgpu": dev_is_vgpu,
183
191
  "bdf": dev_bdf,
184
- "numa": dev_numa,
185
192
  }
193
+ if dev_numa:
194
+ dev_appendix["numa"] = dev_numa
186
195
  if dev_card_id is not None:
187
196
  dev_appendix["card_id"] = dev_card_id
188
197
  if dev_renderd_id is not None:
@@ -201,6 +210,7 @@ class MetaXDetector(Detector):
201
210
  memory=dev_mem,
202
211
  memory_used=dev_mem_used,
203
212
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
213
+ memory_status=dev_mem_status,
204
214
  temperature=dev_temp,
205
215
  power=dev_power,
206
216
  power_used=dev_power_used,
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations as __future_annotations__
2
2
 
3
+ import contextlib
3
4
  import logging
4
5
  from functools import lru_cache
5
6
 
@@ -7,6 +8,7 @@ import pymtml
7
8
 
8
9
  from .. import envs
9
10
  from ..logging import debug_log_exception, debug_log_warning
11
+ from . import DeviceMemoryStatusEnum
10
12
  from .__types__ import (
11
13
  Detector,
12
14
  Device,
@@ -140,6 +142,7 @@ class MThreadsDetector(Detector):
140
142
 
141
143
  dev_mem = 0
142
144
  dev_mem_used = 0
145
+ dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
143
146
  with pymtml.mtmlMemoryContext(dev) as devmem:
144
147
  dev_mem = byte_to_mebibyte( # byte to MiB
145
148
  pymtml.mtmlMemoryGetTotal(devmem),
@@ -147,6 +150,14 @@ class MThreadsDetector(Detector):
147
150
  dev_mem_used = byte_to_mebibyte( # byte to MiB
148
151
  pymtml.mtmlMemoryGetUsed(devmem),
149
152
  )
153
+ dev_mem_ecc_errors = pymtml.mtmlMemoryGetEccErrorCounter(
154
+ devmem,
155
+ pymtml.MTML_MEMORY_ERROR_TYPE_UNCORRECTED,
156
+ pymtml.MTML_VOLATILE_ECC,
157
+ pymtml.MTML_MEMORY_LOCATION_DRAM,
158
+ )
159
+ if dev_mem_ecc_errors > 0:
160
+ dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
150
161
 
151
162
  dev_cores_util = None
152
163
  dev_temp = None
@@ -166,19 +177,21 @@ class MThreadsDetector(Detector):
166
177
 
167
178
  dev_numa = get_numa_node_by_bdf(dev_bdf)
168
179
  if not dev_numa:
169
- dev_node_affinity = pymtml.mtmlDeviceGetMemoryAffinityWithinNode(
170
- dev,
171
- get_numa_nodeset_size(),
172
- )
173
- dev_numa = bitmask_to_str(
174
- list(dev_node_affinity),
175
- )
180
+ with contextlib.suppress(pymtml.MTMLError):
181
+ dev_node_affinity = (
182
+ pymtml.mtmlDeviceGetMemoryAffinityWithinNode(
183
+ dev,
184
+ get_numa_nodeset_size(),
185
+ )
186
+ )
187
+ dev_numa = bitmask_to_str(list(dev_node_affinity))
176
188
 
177
189
  dev_appendix = {
178
190
  "vgpu": dev_is_vgpu,
179
191
  "bdf": dev_bdf,
180
- "numa": dev_numa,
181
192
  }
193
+ if dev_numa:
194
+ dev_appendix["numa"] = dev_numa
182
195
 
183
196
  ret.append(
184
197
  Device(
@@ -192,6 +205,7 @@ class MThreadsDetector(Detector):
192
205
  memory=dev_mem,
193
206
  memory_used=dev_mem_used,
194
207
  memory_utilization=get_utilization(dev_mem_used, dev_mem),
208
+ memory_status=dev_mem_status,
195
209
  temperature=dev_temp,
196
210
  power_used=dev_power_used,
197
211
  appendix=dev_appendix,