gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +4 -2
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__init__.py +1 -1
- gpustack_runtime/deployer/cdi/__types__.py +2 -2
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/cdi/amd.py +6 -8
- gpustack_runtime/deployer/cdi/ascend.py +7 -9
- gpustack_runtime/deployer/cdi/hygon.py +6 -8
- gpustack_runtime/deployer/cdi/iluvatar.py +6 -8
- gpustack_runtime/deployer/cdi/metax.py +6 -8
- gpustack_runtime/deployer/cdi/thead.py +6 -8
- gpustack_runtime/deployer/docker.py +133 -146
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +13 -8
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +26 -21
- gpustack_runtime/deployer/kuberentes.py +89 -108
- gpustack_runtime/deployer/podman.py +113 -120
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/__utils__.py +3 -0
- gpustack_runtime/detector/amd.py +32 -10
- gpustack_runtime/detector/ascend.py +67 -13
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +22 -3
- gpustack_runtime/detector/iluvatar.py +15 -7
- gpustack_runtime/detector/metax.py +16 -6
- gpustack_runtime/detector/mthreads.py +22 -8
- gpustack_runtime/detector/nvidia.py +148 -140
- gpustack_runtime/detector/pyacl/__init__.py +34 -14
- gpustack_runtime/detector/pydcmi/__init__.py +4 -2
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +145 -134
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/METADATA +2 -2
- gpustack_runtime-0.1.42.post1.dist-info/RECORD +67 -0
- gpustack_runtime-0.1.41.post3.dist-info/RECORD +0 -67
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.post1.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/amd.py
CHANGED
|
@@ -8,7 +8,14 @@ from pathlib import Path
|
|
|
8
8
|
from .. import envs
|
|
9
9
|
from ..logging import debug_log_exception, debug_log_warning
|
|
10
10
|
from . import Topology, pyamdgpu, pyamdsmi, pyhsa, pyrocmcore, pyrocmsmi
|
|
11
|
-
from .__types__ import
|
|
11
|
+
from .__types__ import (
|
|
12
|
+
Detector,
|
|
13
|
+
Device,
|
|
14
|
+
DeviceMemoryStatusEnum,
|
|
15
|
+
Devices,
|
|
16
|
+
ManufacturerEnum,
|
|
17
|
+
TopologyDistanceEnum,
|
|
18
|
+
)
|
|
12
19
|
from .__utils__ import (
|
|
13
20
|
PCIDevice,
|
|
14
21
|
byte_to_mebibyte,
|
|
@@ -165,20 +172,32 @@ class AMDDetector(Detector):
|
|
|
165
172
|
)
|
|
166
173
|
dev_cores_util = 0
|
|
167
174
|
|
|
168
|
-
dev_mem =
|
|
169
|
-
dev_mem_used =
|
|
175
|
+
dev_mem = 0
|
|
176
|
+
dev_mem_used = 0
|
|
177
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
170
178
|
try:
|
|
171
179
|
dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
|
|
172
180
|
dev_mem = dev_gpu_vram_usage.get("vram_total")
|
|
173
181
|
dev_mem_used = dev_gpu_vram_usage.get("vram_used")
|
|
182
|
+
dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count(
|
|
183
|
+
dev,
|
|
184
|
+
pyamdsmi.AmdSmiGpuBlock.UMC,
|
|
185
|
+
)
|
|
186
|
+
if dev_ecc_count.get("uncorrectable_count", 0) > 0:
|
|
187
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
174
188
|
except pyamdsmi.AmdSmiException:
|
|
189
|
+
dev_mem = byte_to_mebibyte( # byte to MiB
|
|
190
|
+
pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
|
|
191
|
+
)
|
|
192
|
+
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
193
|
+
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
|
|
194
|
+
)
|
|
175
195
|
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
)
|
|
179
|
-
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
180
|
-
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
|
|
196
|
+
dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
|
|
197
|
+
dev_idx,
|
|
181
198
|
)
|
|
199
|
+
if dev_ecc_count.uncorrectable_err > 0:
|
|
200
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
182
201
|
|
|
183
202
|
dev_power = None
|
|
184
203
|
dev_power_used = None
|
|
@@ -201,14 +220,16 @@ class AMDDetector(Detector):
|
|
|
201
220
|
|
|
202
221
|
dev_numa = get_numa_node_by_bdf(dev_bdf)
|
|
203
222
|
if not dev_numa:
|
|
204
|
-
|
|
223
|
+
with contextlib.suppress(pyamdsmi.AmdSmiException):
|
|
224
|
+
dev_numa = str(pyamdsmi.amdsmi_topo_get_numa_node_number(dev))
|
|
205
225
|
|
|
206
226
|
dev_appendix = {
|
|
207
227
|
"arch_family": _get_arch_family(dev_asic_family_id),
|
|
208
228
|
"vgpu": dev_is_vgpu,
|
|
209
229
|
"bdf": dev_bdf,
|
|
210
|
-
"numa": dev_numa,
|
|
211
230
|
}
|
|
231
|
+
if dev_numa:
|
|
232
|
+
dev_appendix["numa"] = dev_numa
|
|
212
233
|
if dev_card_id is not None:
|
|
213
234
|
dev_appendix["card_id"] = dev_card_id
|
|
214
235
|
if dev_renderd_id is not None:
|
|
@@ -232,6 +253,7 @@ class AMDDetector(Detector):
|
|
|
232
253
|
memory=dev_mem,
|
|
233
254
|
memory_used=dev_mem_used,
|
|
234
255
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
256
|
+
memory_status=dev_mem_status,
|
|
235
257
|
temperature=dev_temp,
|
|
236
258
|
power=dev_power,
|
|
237
259
|
power_used=dev_power_used,
|
|
@@ -10,6 +10,7 @@ from . import pyacl, pydcmi
|
|
|
10
10
|
from .__types__ import (
|
|
11
11
|
Detector,
|
|
12
12
|
Device,
|
|
13
|
+
DeviceMemoryStatusEnum,
|
|
13
14
|
Devices,
|
|
14
15
|
ManufacturerEnum,
|
|
15
16
|
Topology,
|
|
@@ -31,10 +32,12 @@ slogger = logger.getChild("internal")
|
|
|
31
32
|
_TOPOLOGY_DISTANCE_MAPPING: dict[int, int] = {
|
|
32
33
|
pydcmi.DCMI_TOPO_TYPE_SELF: TopologyDistanceEnum.SELF,
|
|
33
34
|
pydcmi.DCMI_TOPO_TYPE_HCCS: TopologyDistanceEnum.LINK, # Traversing via high-speed interconnect, RoCE, etc.
|
|
35
|
+
pydcmi.DCMI_TOPO_TYPE_HCCS_SW: TopologyDistanceEnum.LINK, # Traversing via high-speed interconnect switch.
|
|
34
36
|
pydcmi.DCMI_TOPO_TYPE_PIX: TopologyDistanceEnum.PIX, # Traversing via a single PCIe bridge.
|
|
35
37
|
pydcmi.DCMI_TOPO_TYPE_PXB: TopologyDistanceEnum.PXB, # Traversing via multiple PCIe bridges without PCIe Host Bridge.
|
|
36
38
|
pydcmi.DCMI_TOPO_TYPE_PHB: TopologyDistanceEnum.PHB, # Traversing via a PCIe Host Bridge.
|
|
37
39
|
pydcmi.DCMI_TOPO_TYPE_SYS: TopologyDistanceEnum.SYS, # Traversing via SMP interconnect across other NUMA nodes.
|
|
40
|
+
pydcmi.DCMI_TOPO_TYPE_SIO: TopologyDistanceEnum.SYS, # Traversing via Super I/O or other slower interconnects.
|
|
38
41
|
}
|
|
39
42
|
"""
|
|
40
43
|
Mapping of Ascend topology types to distance values.
|
|
@@ -108,7 +111,7 @@ class AscendDetector(Detector):
|
|
|
108
111
|
|
|
109
112
|
sys_driver_ver = pydcmi.dcmi_get_driver_version()
|
|
110
113
|
|
|
111
|
-
sys_runtime_ver_original = pyacl.
|
|
114
|
+
sys_runtime_ver_original = pyacl.aclsysGetVersion()
|
|
112
115
|
sys_runtime_ver = get_brief_version(sys_runtime_ver_original)
|
|
113
116
|
|
|
114
117
|
_, card_list = pydcmi.dcmi_get_card_list()
|
|
@@ -128,7 +131,9 @@ class AscendDetector(Detector):
|
|
|
128
131
|
dev_is_vgpu = True
|
|
129
132
|
dev_cores_aicore = dev_virt_info.query_info.computing.aic
|
|
130
133
|
dev_name = dev_virt_info.query_info.name
|
|
131
|
-
dev_mem
|
|
134
|
+
dev_mem = 0
|
|
135
|
+
dev_mem_used = 0
|
|
136
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
132
137
|
if hasattr(dev_virt_info.query_info.computing, "memory_size"):
|
|
133
138
|
dev_mem = dev_virt_info.query_info.computing.memory_size
|
|
134
139
|
dev_index = dev_virt_info.vdev_id
|
|
@@ -143,6 +148,10 @@ class AscendDetector(Detector):
|
|
|
143
148
|
dev_card_id,
|
|
144
149
|
dev_device_id,
|
|
145
150
|
)
|
|
151
|
+
dev_mem_status = _get_device_memory_status(
|
|
152
|
+
dev_card_id,
|
|
153
|
+
dev_device_id,
|
|
154
|
+
)
|
|
146
155
|
dev_index = pydcmi.dcmi_get_device_logic_id(
|
|
147
156
|
dev_card_id,
|
|
148
157
|
dev_device_id,
|
|
@@ -191,13 +200,14 @@ class AscendDetector(Detector):
|
|
|
191
200
|
|
|
192
201
|
dev_numa = get_numa_node_by_bdf(dev_bdf)
|
|
193
202
|
if not dev_numa:
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
203
|
+
with contextlib.suppress(pydcmi.DCMIError):
|
|
204
|
+
dev_cpu_affinity = (
|
|
205
|
+
pydcmi.dcmi_get_affinity_cpu_info_by_device_id(
|
|
206
|
+
dev_card_id,
|
|
207
|
+
dev_device_id,
|
|
208
|
+
)
|
|
198
209
|
)
|
|
199
|
-
|
|
200
|
-
dev_numa = map_cpu_affinity_to_numa_node(dev_cpu_affinity)
|
|
210
|
+
dev_numa = map_cpu_affinity_to_numa_node(dev_cpu_affinity)
|
|
201
211
|
|
|
202
212
|
dev_appendix = {
|
|
203
213
|
"arch_family": (
|
|
@@ -206,11 +216,12 @@ class AscendDetector(Detector):
|
|
|
206
216
|
),
|
|
207
217
|
"vgpu": dev_is_vgpu,
|
|
208
218
|
"bdf": dev_bdf,
|
|
209
|
-
"numa": dev_numa,
|
|
210
219
|
"card_id": dev_card_id,
|
|
211
220
|
"device_id": dev_device_id,
|
|
212
221
|
"device_id_max": device_num_in_card - 1,
|
|
213
222
|
}
|
|
223
|
+
if dev_numa:
|
|
224
|
+
dev_appendix["numa"] = dev_numa
|
|
214
225
|
|
|
215
226
|
dev_roce_ip, dev_roce_mask, dev_roce_gateway = (
|
|
216
227
|
_get_device_roce_network_info(
|
|
@@ -239,6 +250,7 @@ class AscendDetector(Detector):
|
|
|
239
250
|
memory=dev_mem,
|
|
240
251
|
memory_used=dev_mem_used,
|
|
241
252
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
253
|
+
memory_status=dev_mem_status,
|
|
242
254
|
temperature=dev_temp,
|
|
243
255
|
power_used=dev_power_used,
|
|
244
256
|
appendix=dev_appendix,
|
|
@@ -332,6 +344,12 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
|
|
|
332
344
|
"""
|
|
333
345
|
Get device memory information.
|
|
334
346
|
|
|
347
|
+
Args:
|
|
348
|
+
dev_card_id:
|
|
349
|
+
The card ID of the device.
|
|
350
|
+
dev_device_id:
|
|
351
|
+
The device ID of the device.
|
|
352
|
+
|
|
335
353
|
Returns:
|
|
336
354
|
A tuple containing total memory and used memory in MiB.
|
|
337
355
|
|
|
@@ -370,6 +388,37 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
|
|
|
370
388
|
return dev_mem, dev_mem_used
|
|
371
389
|
|
|
372
390
|
|
|
391
|
+
def _get_device_memory_status(dev_card_id, dev_device_id) -> DeviceMemoryStatusEnum:
|
|
392
|
+
"""
|
|
393
|
+
Get device memory ECC status.
|
|
394
|
+
|
|
395
|
+
Args:
|
|
396
|
+
dev_card_id:
|
|
397
|
+
The card ID of the device.
|
|
398
|
+
dev_device_id:
|
|
399
|
+
The device ID of the device.
|
|
400
|
+
|
|
401
|
+
Returns:
|
|
402
|
+
DeviceMemoryStatusEnum indicating the ECC status.
|
|
403
|
+
|
|
404
|
+
"""
|
|
405
|
+
for dev_mem_type in [pydcmi.DCMI_DEVICE_TYPE_HBM, pydcmi.DCMI_DEVICE_TYPE_DDR]:
|
|
406
|
+
with contextlib.suppress(pydcmi.DCMIError):
|
|
407
|
+
dev_ecc_info = pydcmi.dcmi_get_device_ecc_info(
|
|
408
|
+
dev_card_id,
|
|
409
|
+
dev_device_id,
|
|
410
|
+
dev_mem_type,
|
|
411
|
+
)
|
|
412
|
+
if dev_ecc_info.enable_flag and (
|
|
413
|
+
dev_ecc_info.single_bit_error_cnt > 0
|
|
414
|
+
or dev_ecc_info.double_bit_error_cnt > 0
|
|
415
|
+
):
|
|
416
|
+
return DeviceMemoryStatusEnum.UNHEALTHY
|
|
417
|
+
return DeviceMemoryStatusEnum.HEALTHY
|
|
418
|
+
|
|
419
|
+
return DeviceMemoryStatusEnum.HEALTHY
|
|
420
|
+
|
|
421
|
+
|
|
373
422
|
def _get_device_roce_network_info(
|
|
374
423
|
dev_card_id,
|
|
375
424
|
dev_device_id,
|
|
@@ -395,7 +444,7 @@ def _get_device_roce_network_info(
|
|
|
395
444
|
pydcmi.DCMI_PORT_TYPE_ROCE_PORT,
|
|
396
445
|
)
|
|
397
446
|
except pydcmi.DCMIError:
|
|
398
|
-
debug_log_exception(logger, "Failed to get device
|
|
447
|
+
debug_log_exception(logger, "Failed to get device RoCE network info")
|
|
399
448
|
|
|
400
449
|
return ip, mask, gateway
|
|
401
450
|
|
|
@@ -456,12 +505,15 @@ _soc_name_version_mapping: dict[str, int] = {
|
|
|
456
505
|
"Ascend310B3": 242,
|
|
457
506
|
"Ascend310B4": 243,
|
|
458
507
|
"Ascend910_9391": 250,
|
|
508
|
+
"Ascend910": 250,
|
|
459
509
|
"Ascend910_9392": 251,
|
|
460
510
|
"Ascend910_9381": 252,
|
|
461
511
|
"Ascend910_9382": 253,
|
|
462
512
|
"Ascend910_9372": 254,
|
|
463
513
|
"Ascend910_9362": 255,
|
|
464
514
|
"Ascend910_9579": 260,
|
|
515
|
+
"Ascend910_95": 260,
|
|
516
|
+
"Ascend950": 260,
|
|
465
517
|
}
|
|
466
518
|
|
|
467
519
|
|
|
@@ -477,6 +529,8 @@ def _guess_soc_name_from_dev_name(dev_name: str) -> str | None:
|
|
|
477
529
|
The guessed SoC name, or None if not found.
|
|
478
530
|
|
|
479
531
|
"""
|
|
532
|
+
if dev_name.startswith("Ascend"):
|
|
533
|
+
dev_name = dev_name[6:].strip()
|
|
480
534
|
soc_name = f"Ascend{dev_name}"
|
|
481
535
|
if soc_name in _soc_name_version_mapping:
|
|
482
536
|
return soc_name
|
|
@@ -528,11 +582,11 @@ def get_ascend_cann_variant(name: str | None) -> str | None:
|
|
|
528
582
|
if version < 220:
|
|
529
583
|
return "310p"
|
|
530
584
|
if version < 240:
|
|
531
|
-
return "910b"
|
|
585
|
+
return "910b" # 910b/a2
|
|
532
586
|
if version < 250:
|
|
533
587
|
return "310b"
|
|
534
588
|
if version < 260:
|
|
535
|
-
return "a3" # 910c
|
|
589
|
+
return "a3" # 910c/a3
|
|
536
590
|
if version < 270:
|
|
537
|
-
return "a5" # 910d
|
|
591
|
+
return "a5" # 910d/a5
|
|
538
592
|
return None
|
|
@@ -6,6 +6,7 @@ from functools import lru_cache
|
|
|
6
6
|
|
|
7
7
|
from .. import envs
|
|
8
8
|
from ..logging import debug_log_exception
|
|
9
|
+
from . import DeviceMemoryStatusEnum
|
|
9
10
|
from .__types__ import Detector, Device, Devices, ManufacturerEnum
|
|
10
11
|
from .__utils__ import (
|
|
11
12
|
PCIDevice,
|
|
@@ -100,6 +101,7 @@ class CambriconDetector(Detector):
|
|
|
100
101
|
dev_mem_usage_info = dev_info.get("PhysicalMemUsage", {})
|
|
101
102
|
dev_mem = safe_int(dev_mem_usage_info.get("Total", 0))
|
|
102
103
|
dev_mem_used = safe_int(dev_mem_usage_info.get("Used", 0))
|
|
104
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
103
105
|
|
|
104
106
|
dev_temp_info = dev_info.get("Temperature", {})
|
|
105
107
|
dev_temp = safe_float(dev_temp_info.get("Chip", 0))
|
|
@@ -118,6 +120,7 @@ class CambriconDetector(Detector):
|
|
|
118
120
|
memory=dev_mem,
|
|
119
121
|
memory_used=dev_mem_used,
|
|
120
122
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
123
|
+
memory_status=dev_mem_status,
|
|
121
124
|
temperature=dev_temp,
|
|
122
125
|
appendix=dev_appendix,
|
|
123
126
|
),
|
|
@@ -8,7 +8,14 @@ from pathlib import Path
|
|
|
8
8
|
from .. import envs
|
|
9
9
|
from ..logging import debug_log_exception, debug_log_warning
|
|
10
10
|
from . import Topology, pyamdgpu, pyhsa, pyrocmcore, pyrocmsmi
|
|
11
|
-
from .__types__ import
|
|
11
|
+
from .__types__ import (
|
|
12
|
+
Detector,
|
|
13
|
+
Device,
|
|
14
|
+
DeviceMemoryStatusEnum,
|
|
15
|
+
Devices,
|
|
16
|
+
ManufacturerEnum,
|
|
17
|
+
TopologyDistanceEnum,
|
|
18
|
+
)
|
|
12
19
|
from .__utils__ import (
|
|
13
20
|
PCIDevice,
|
|
14
21
|
byte_to_mebibyte,
|
|
@@ -149,6 +156,13 @@ class HygonDetector(Detector):
|
|
|
149
156
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
150
157
|
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
|
|
151
158
|
)
|
|
159
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
160
|
+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
|
|
161
|
+
dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
|
|
162
|
+
dev_idx,
|
|
163
|
+
)
|
|
164
|
+
if dev_ecc_count.uncorrectable_err > 0:
|
|
165
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
152
166
|
|
|
153
167
|
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
|
|
154
168
|
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
|
|
@@ -157,13 +171,17 @@ class HygonDetector(Detector):
|
|
|
157
171
|
|
|
158
172
|
dev_numa = get_numa_node_by_bdf(dev_bdf)
|
|
159
173
|
if not dev_numa:
|
|
160
|
-
|
|
174
|
+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
|
|
175
|
+
dev_numa = str(
|
|
176
|
+
pyrocmsmi.rsmi_topo_get_numa_node_number(dev_idx),
|
|
177
|
+
)
|
|
161
178
|
|
|
162
179
|
dev_appendix = {
|
|
163
180
|
"vgpu": dev_is_vgpu,
|
|
164
181
|
"bdf": dev_bdf,
|
|
165
|
-
"numa": dev_numa,
|
|
166
182
|
}
|
|
183
|
+
if dev_numa:
|
|
184
|
+
dev_appendix["numa"] = dev_numa
|
|
167
185
|
if dev_card_id is not None:
|
|
168
186
|
dev_appendix["card_id"] = dev_card_id
|
|
169
187
|
if dev_renderd_id is not None:
|
|
@@ -184,6 +202,7 @@ class HygonDetector(Detector):
|
|
|
184
202
|
memory=dev_mem,
|
|
185
203
|
memory_used=dev_mem_used,
|
|
186
204
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
205
|
+
memory_status=dev_mem_status,
|
|
187
206
|
temperature=dev_temp,
|
|
188
207
|
power=dev_power,
|
|
189
208
|
power_used=dev_power_used,
|
|
@@ -10,6 +10,7 @@ from . import pyixml
|
|
|
10
10
|
from .__types__ import (
|
|
11
11
|
Detector,
|
|
12
12
|
Device,
|
|
13
|
+
DeviceMemoryStatusEnum,
|
|
13
14
|
Devices,
|
|
14
15
|
ManufacturerEnum,
|
|
15
16
|
Topology,
|
|
@@ -135,6 +136,7 @@ class IluvatarDetector(Detector):
|
|
|
135
136
|
|
|
136
137
|
dev_mem = 0
|
|
137
138
|
dev_mem_used = 0
|
|
139
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
138
140
|
with contextlib.suppress(pyixml.NVMLError):
|
|
139
141
|
dev_mem_info = pyixml.nvmlDeviceGetMemoryInfo(dev)
|
|
140
142
|
dev_mem = byte_to_mebibyte( # byte to MiB
|
|
@@ -143,6 +145,9 @@ class IluvatarDetector(Detector):
|
|
|
143
145
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
144
146
|
dev_mem_info.used,
|
|
145
147
|
)
|
|
148
|
+
dev_health = pyixml.ixmlDeviceGetHealth(dev)
|
|
149
|
+
if dev_health != pyixml.IXML_HEALTH_OK:
|
|
150
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
146
151
|
|
|
147
152
|
dev_cores_util = None
|
|
148
153
|
with contextlib.suppress(pyixml.NVMLError):
|
|
@@ -185,18 +190,20 @@ class IluvatarDetector(Detector):
|
|
|
185
190
|
|
|
186
191
|
dev_numa = get_numa_node_by_bdf(dev_bdf)
|
|
187
192
|
if not dev_numa:
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
193
|
+
with contextlib.suppress(pyixml.NVMLError):
|
|
194
|
+
dev_node_affinity = pyixml.nvmlDeviceGetMemoryAffinity(
|
|
195
|
+
dev,
|
|
196
|
+
get_numa_nodeset_size(),
|
|
197
|
+
pyixml.NVML_AFFINITY_SCOPE_NODE,
|
|
198
|
+
)
|
|
199
|
+
dev_numa = bitmask_to_str(list(dev_node_affinity))
|
|
194
200
|
|
|
195
201
|
dev_appendix = {
|
|
196
202
|
"vgpu": dev_is_vgpu,
|
|
197
203
|
"bdf": dev_bdf,
|
|
198
|
-
"numa": dev_numa,
|
|
199
204
|
}
|
|
205
|
+
if dev_numa:
|
|
206
|
+
dev_appendix["numa"] = dev_numa
|
|
200
207
|
|
|
201
208
|
ret.append(
|
|
202
209
|
Device(
|
|
@@ -213,6 +220,7 @@ class IluvatarDetector(Detector):
|
|
|
213
220
|
memory=dev_mem,
|
|
214
221
|
memory_used=dev_mem_used,
|
|
215
222
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
223
|
+
memory_status=dev_mem_status,
|
|
216
224
|
temperature=dev_temp,
|
|
217
225
|
power=dev_power,
|
|
218
226
|
power_used=dev_power_used,
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations as __future_annotations__
|
|
2
2
|
|
|
3
|
+
import contextlib
|
|
3
4
|
import logging
|
|
4
5
|
from functools import lru_cache
|
|
5
6
|
from pathlib import Path
|
|
@@ -10,6 +11,7 @@ from . import pymxsml
|
|
|
10
11
|
from .__types__ import (
|
|
11
12
|
Detector,
|
|
12
13
|
Device,
|
|
14
|
+
DeviceMemoryStatusEnum,
|
|
13
15
|
Devices,
|
|
14
16
|
ManufacturerEnum,
|
|
15
17
|
Topology,
|
|
@@ -145,6 +147,11 @@ class MetaXDetector(Detector):
|
|
|
145
147
|
dev_mem_used = kibibyte_to_mebibyte( # KiB to MiB
|
|
146
148
|
dev_mem_info.vramUse,
|
|
147
149
|
)
|
|
150
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
151
|
+
with contextlib.suppress(pymxsml.MXSMLError):
|
|
152
|
+
dev_ecc_errors = pymxsml.mxSmlGetTotalEccErrors(dev_idx)
|
|
153
|
+
if dev_ecc_errors.dramUE > 0:
|
|
154
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
148
155
|
|
|
149
156
|
dev_temp = (
|
|
150
157
|
pymxsml.mxSmlGetTemperatureInfo(
|
|
@@ -172,17 +179,19 @@ class MetaXDetector(Detector):
|
|
|
172
179
|
|
|
173
180
|
dev_numa = get_numa_node_by_bdf(dev_bdf)
|
|
174
181
|
if not dev_numa:
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
182
|
+
with contextlib.suppress(pymxsml.MXSMLError):
|
|
183
|
+
dev_node_affinity = pymxsml.mxSmlGetNodeAffinity(
|
|
184
|
+
dev_idx,
|
|
185
|
+
get_numa_nodeset_size(),
|
|
186
|
+
)
|
|
187
|
+
dev_numa = bitmask_to_str(list(dev_node_affinity))
|
|
180
188
|
|
|
181
189
|
dev_appendix = {
|
|
182
190
|
"vgpu": dev_is_vgpu,
|
|
183
191
|
"bdf": dev_bdf,
|
|
184
|
-
"numa": dev_numa,
|
|
185
192
|
}
|
|
193
|
+
if dev_numa:
|
|
194
|
+
dev_appendix["numa"] = dev_numa
|
|
186
195
|
if dev_card_id is not None:
|
|
187
196
|
dev_appendix["card_id"] = dev_card_id
|
|
188
197
|
if dev_renderd_id is not None:
|
|
@@ -201,6 +210,7 @@ class MetaXDetector(Detector):
|
|
|
201
210
|
memory=dev_mem,
|
|
202
211
|
memory_used=dev_mem_used,
|
|
203
212
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
213
|
+
memory_status=dev_mem_status,
|
|
204
214
|
temperature=dev_temp,
|
|
205
215
|
power=dev_power,
|
|
206
216
|
power_used=dev_power_used,
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations as __future_annotations__
|
|
2
2
|
|
|
3
|
+
import contextlib
|
|
3
4
|
import logging
|
|
4
5
|
from functools import lru_cache
|
|
5
6
|
|
|
@@ -7,6 +8,7 @@ import pymtml
|
|
|
7
8
|
|
|
8
9
|
from .. import envs
|
|
9
10
|
from ..logging import debug_log_exception, debug_log_warning
|
|
11
|
+
from . import DeviceMemoryStatusEnum
|
|
10
12
|
from .__types__ import (
|
|
11
13
|
Detector,
|
|
12
14
|
Device,
|
|
@@ -140,6 +142,7 @@ class MThreadsDetector(Detector):
|
|
|
140
142
|
|
|
141
143
|
dev_mem = 0
|
|
142
144
|
dev_mem_used = 0
|
|
145
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
143
146
|
with pymtml.mtmlMemoryContext(dev) as devmem:
|
|
144
147
|
dev_mem = byte_to_mebibyte( # byte to MiB
|
|
145
148
|
pymtml.mtmlMemoryGetTotal(devmem),
|
|
@@ -147,6 +150,14 @@ class MThreadsDetector(Detector):
|
|
|
147
150
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
148
151
|
pymtml.mtmlMemoryGetUsed(devmem),
|
|
149
152
|
)
|
|
153
|
+
dev_mem_ecc_errors = pymtml.mtmlMemoryGetEccErrorCounter(
|
|
154
|
+
devmem,
|
|
155
|
+
pymtml.MTML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
|
156
|
+
pymtml.MTML_VOLATILE_ECC,
|
|
157
|
+
pymtml.MTML_MEMORY_LOCATION_DRAM,
|
|
158
|
+
)
|
|
159
|
+
if dev_mem_ecc_errors > 0:
|
|
160
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
150
161
|
|
|
151
162
|
dev_cores_util = None
|
|
152
163
|
dev_temp = None
|
|
@@ -166,19 +177,21 @@ class MThreadsDetector(Detector):
|
|
|
166
177
|
|
|
167
178
|
dev_numa = get_numa_node_by_bdf(dev_bdf)
|
|
168
179
|
if not dev_numa:
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
180
|
+
with contextlib.suppress(pymtml.MTMLError):
|
|
181
|
+
dev_node_affinity = (
|
|
182
|
+
pymtml.mtmlDeviceGetMemoryAffinityWithinNode(
|
|
183
|
+
dev,
|
|
184
|
+
get_numa_nodeset_size(),
|
|
185
|
+
)
|
|
186
|
+
)
|
|
187
|
+
dev_numa = bitmask_to_str(list(dev_node_affinity))
|
|
176
188
|
|
|
177
189
|
dev_appendix = {
|
|
178
190
|
"vgpu": dev_is_vgpu,
|
|
179
191
|
"bdf": dev_bdf,
|
|
180
|
-
"numa": dev_numa,
|
|
181
192
|
}
|
|
193
|
+
if dev_numa:
|
|
194
|
+
dev_appendix["numa"] = dev_numa
|
|
182
195
|
|
|
183
196
|
ret.append(
|
|
184
197
|
Device(
|
|
@@ -192,6 +205,7 @@ class MThreadsDetector(Detector):
|
|
|
192
205
|
memory=dev_mem,
|
|
193
206
|
memory_used=dev_mem_used,
|
|
194
207
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
208
|
+
memory_status=dev_mem_status,
|
|
195
209
|
temperature=dev_temp,
|
|
196
210
|
power_used=dev_power_used,
|
|
197
211
|
appendix=dev_appendix,
|