gpustack-runtime 0.1.41.post2__py3-none-any.whl → 0.1.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/detector.py +3 -1
- gpustack_runtime/deployer/__types__.py +314 -233
- gpustack_runtime/deployer/cdi/__utils__.py +4 -1
- gpustack_runtime/deployer/docker.py +109 -148
- gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +21 -3
- gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
- gpustack_runtime/deployer/kuberentes.py +91 -126
- gpustack_runtime/deployer/podman.py +89 -122
- gpustack_runtime/detector/__init__.py +2 -0
- gpustack_runtime/detector/__types__.py +26 -0
- gpustack_runtime/detector/amd.py +28 -8
- gpustack_runtime/detector/ascend.py +49 -4
- gpustack_runtime/detector/cambricon.py +3 -0
- gpustack_runtime/detector/hygon.py +16 -1
- gpustack_runtime/detector/iluvatar.py +6 -0
- gpustack_runtime/detector/metax.py +8 -0
- gpustack_runtime/detector/mthreads.py +11 -0
- gpustack_runtime/detector/nvidia.py +139 -134
- gpustack_runtime/detector/pyixml/__init__.py +16 -0
- gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
- gpustack_runtime/detector/thead.py +135 -127
- gpustack_runtime/envs.py +7 -6
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.41.post2.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
|
@@ -10,6 +10,7 @@ from . import pyacl, pydcmi
|
|
|
10
10
|
from .__types__ import (
|
|
11
11
|
Detector,
|
|
12
12
|
Device,
|
|
13
|
+
DeviceMemoryStatusEnum,
|
|
13
14
|
Devices,
|
|
14
15
|
ManufacturerEnum,
|
|
15
16
|
Topology,
|
|
@@ -128,7 +129,9 @@ class AscendDetector(Detector):
|
|
|
128
129
|
dev_is_vgpu = True
|
|
129
130
|
dev_cores_aicore = dev_virt_info.query_info.computing.aic
|
|
130
131
|
dev_name = dev_virt_info.query_info.name
|
|
131
|
-
dev_mem
|
|
132
|
+
dev_mem = 0
|
|
133
|
+
dev_mem_used = 0
|
|
134
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
132
135
|
if hasattr(dev_virt_info.query_info.computing, "memory_size"):
|
|
133
136
|
dev_mem = dev_virt_info.query_info.computing.memory_size
|
|
134
137
|
dev_index = dev_virt_info.vdev_id
|
|
@@ -143,6 +146,10 @@ class AscendDetector(Detector):
|
|
|
143
146
|
dev_card_id,
|
|
144
147
|
dev_device_id,
|
|
145
148
|
)
|
|
149
|
+
dev_mem_status = _get_device_memory_status(
|
|
150
|
+
dev_card_id,
|
|
151
|
+
dev_device_id,
|
|
152
|
+
)
|
|
146
153
|
dev_index = pydcmi.dcmi_get_device_logic_id(
|
|
147
154
|
dev_card_id,
|
|
148
155
|
dev_device_id,
|
|
@@ -239,6 +246,7 @@ class AscendDetector(Detector):
|
|
|
239
246
|
memory=dev_mem,
|
|
240
247
|
memory_used=dev_mem_used,
|
|
241
248
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
249
|
+
memory_status=dev_mem_status,
|
|
242
250
|
temperature=dev_temp,
|
|
243
251
|
power_used=dev_power_used,
|
|
244
252
|
appendix=dev_appendix,
|
|
@@ -332,6 +340,12 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
|
|
|
332
340
|
"""
|
|
333
341
|
Get device memory information.
|
|
334
342
|
|
|
343
|
+
Args:
|
|
344
|
+
dev_card_id:
|
|
345
|
+
The card ID of the device.
|
|
346
|
+
dev_device_id:
|
|
347
|
+
The device ID of the device.
|
|
348
|
+
|
|
335
349
|
Returns:
|
|
336
350
|
A tuple containing total memory and used memory in MiB.
|
|
337
351
|
|
|
@@ -370,6 +384,37 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
|
|
|
370
384
|
return dev_mem, dev_mem_used
|
|
371
385
|
|
|
372
386
|
|
|
387
|
+
def _get_device_memory_status(dev_card_id, dev_device_id) -> DeviceMemoryStatusEnum:
|
|
388
|
+
"""
|
|
389
|
+
Get device memory ECC status.
|
|
390
|
+
|
|
391
|
+
Args:
|
|
392
|
+
dev_card_id:
|
|
393
|
+
The card ID of the device.
|
|
394
|
+
dev_device_id:
|
|
395
|
+
The device ID of the device.
|
|
396
|
+
|
|
397
|
+
Returns:
|
|
398
|
+
DeviceMemoryStatusEnum indicating the ECC status.
|
|
399
|
+
|
|
400
|
+
"""
|
|
401
|
+
for dev_mem_type in [pydcmi.DCMI_DEVICE_TYPE_HBM, pydcmi.DCMI_DEVICE_TYPE_DDR]:
|
|
402
|
+
with contextlib.suppress(pydcmi.DCMIError):
|
|
403
|
+
dev_ecc_info = pydcmi.dcmi_get_device_ecc_info(
|
|
404
|
+
dev_card_id,
|
|
405
|
+
dev_device_id,
|
|
406
|
+
dev_mem_type,
|
|
407
|
+
)
|
|
408
|
+
if dev_ecc_info.enable_flag and (
|
|
409
|
+
dev_ecc_info.single_bit_error_cnt > 0
|
|
410
|
+
or dev_ecc_info.double_bit_error_cnt > 0
|
|
411
|
+
):
|
|
412
|
+
return DeviceMemoryStatusEnum.UNHEALTHY
|
|
413
|
+
return DeviceMemoryStatusEnum.HEALTHY
|
|
414
|
+
|
|
415
|
+
return DeviceMemoryStatusEnum.HEALTHY
|
|
416
|
+
|
|
417
|
+
|
|
373
418
|
def _get_device_roce_network_info(
|
|
374
419
|
dev_card_id,
|
|
375
420
|
dev_device_id,
|
|
@@ -528,11 +573,11 @@ def get_ascend_cann_variant(name: str | None) -> str | None:
|
|
|
528
573
|
if version < 220:
|
|
529
574
|
return "310p"
|
|
530
575
|
if version < 240:
|
|
531
|
-
return "910b"
|
|
576
|
+
return "910b" # 910b/a2
|
|
532
577
|
if version < 250:
|
|
533
578
|
return "310b"
|
|
534
579
|
if version < 260:
|
|
535
|
-
return "a3" # 910c
|
|
580
|
+
return "a3" # 910c/a3
|
|
536
581
|
if version < 270:
|
|
537
|
-
return "a5" # 910d
|
|
582
|
+
return "a5" # 910d/a5
|
|
538
583
|
return None
|
|
@@ -6,6 +6,7 @@ from functools import lru_cache
|
|
|
6
6
|
|
|
7
7
|
from .. import envs
|
|
8
8
|
from ..logging import debug_log_exception
|
|
9
|
+
from . import DeviceMemoryStatusEnum
|
|
9
10
|
from .__types__ import Detector, Device, Devices, ManufacturerEnum
|
|
10
11
|
from .__utils__ import (
|
|
11
12
|
PCIDevice,
|
|
@@ -100,6 +101,7 @@ class CambriconDetector(Detector):
|
|
|
100
101
|
dev_mem_usage_info = dev_info.get("PhysicalMemUsage", {})
|
|
101
102
|
dev_mem = safe_int(dev_mem_usage_info.get("Total", 0))
|
|
102
103
|
dev_mem_used = safe_int(dev_mem_usage_info.get("Used", 0))
|
|
104
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
103
105
|
|
|
104
106
|
dev_temp_info = dev_info.get("Temperature", {})
|
|
105
107
|
dev_temp = safe_float(dev_temp_info.get("Chip", 0))
|
|
@@ -118,6 +120,7 @@ class CambriconDetector(Detector):
|
|
|
118
120
|
memory=dev_mem,
|
|
119
121
|
memory_used=dev_mem_used,
|
|
120
122
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
123
|
+
memory_status=dev_mem_status,
|
|
121
124
|
temperature=dev_temp,
|
|
122
125
|
appendix=dev_appendix,
|
|
123
126
|
),
|
|
@@ -8,7 +8,14 @@ from pathlib import Path
|
|
|
8
8
|
from .. import envs
|
|
9
9
|
from ..logging import debug_log_exception, debug_log_warning
|
|
10
10
|
from . import Topology, pyamdgpu, pyhsa, pyrocmcore, pyrocmsmi
|
|
11
|
-
from .__types__ import
|
|
11
|
+
from .__types__ import (
|
|
12
|
+
Detector,
|
|
13
|
+
Device,
|
|
14
|
+
DeviceMemoryStatusEnum,
|
|
15
|
+
Devices,
|
|
16
|
+
ManufacturerEnum,
|
|
17
|
+
TopologyDistanceEnum,
|
|
18
|
+
)
|
|
12
19
|
from .__utils__ import (
|
|
13
20
|
PCIDevice,
|
|
14
21
|
byte_to_mebibyte,
|
|
@@ -149,6 +156,13 @@ class HygonDetector(Detector):
|
|
|
149
156
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
150
157
|
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
|
|
151
158
|
)
|
|
159
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
160
|
+
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
|
|
161
|
+
dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
|
|
162
|
+
dev_idx,
|
|
163
|
+
)
|
|
164
|
+
if dev_ecc_count.uncorrectable_err > 0:
|
|
165
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
152
166
|
|
|
153
167
|
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
|
|
154
168
|
dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
|
|
@@ -184,6 +198,7 @@ class HygonDetector(Detector):
|
|
|
184
198
|
memory=dev_mem,
|
|
185
199
|
memory_used=dev_mem_used,
|
|
186
200
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
201
|
+
memory_status=dev_mem_status,
|
|
187
202
|
temperature=dev_temp,
|
|
188
203
|
power=dev_power,
|
|
189
204
|
power_used=dev_power_used,
|
|
@@ -10,6 +10,7 @@ from . import pyixml
|
|
|
10
10
|
from .__types__ import (
|
|
11
11
|
Detector,
|
|
12
12
|
Device,
|
|
13
|
+
DeviceMemoryStatusEnum,
|
|
13
14
|
Devices,
|
|
14
15
|
ManufacturerEnum,
|
|
15
16
|
Topology,
|
|
@@ -135,6 +136,7 @@ class IluvatarDetector(Detector):
|
|
|
135
136
|
|
|
136
137
|
dev_mem = 0
|
|
137
138
|
dev_mem_used = 0
|
|
139
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
138
140
|
with contextlib.suppress(pyixml.NVMLError):
|
|
139
141
|
dev_mem_info = pyixml.nvmlDeviceGetMemoryInfo(dev)
|
|
140
142
|
dev_mem = byte_to_mebibyte( # byte to MiB
|
|
@@ -143,6 +145,9 @@ class IluvatarDetector(Detector):
|
|
|
143
145
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
144
146
|
dev_mem_info.used,
|
|
145
147
|
)
|
|
148
|
+
dev_health = pyixml.ixmlDeviceGetHealth(dev)
|
|
149
|
+
if dev_health != pyixml.IXML_HEALTH_OK:
|
|
150
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
146
151
|
|
|
147
152
|
dev_cores_util = None
|
|
148
153
|
with contextlib.suppress(pyixml.NVMLError):
|
|
@@ -213,6 +218,7 @@ class IluvatarDetector(Detector):
|
|
|
213
218
|
memory=dev_mem,
|
|
214
219
|
memory_used=dev_mem_used,
|
|
215
220
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
221
|
+
memory_status=dev_mem_status,
|
|
216
222
|
temperature=dev_temp,
|
|
217
223
|
power=dev_power,
|
|
218
224
|
power_used=dev_power_used,
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations as __future_annotations__
|
|
2
2
|
|
|
3
|
+
import contextlib
|
|
3
4
|
import logging
|
|
4
5
|
from functools import lru_cache
|
|
5
6
|
from pathlib import Path
|
|
@@ -10,6 +11,7 @@ from . import pymxsml
|
|
|
10
11
|
from .__types__ import (
|
|
11
12
|
Detector,
|
|
12
13
|
Device,
|
|
14
|
+
DeviceMemoryStatusEnum,
|
|
13
15
|
Devices,
|
|
14
16
|
ManufacturerEnum,
|
|
15
17
|
Topology,
|
|
@@ -145,6 +147,11 @@ class MetaXDetector(Detector):
|
|
|
145
147
|
dev_mem_used = kibibyte_to_mebibyte( # KiB to MiB
|
|
146
148
|
dev_mem_info.vramUse,
|
|
147
149
|
)
|
|
150
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
151
|
+
with contextlib.suppress(pymxsml.MXSMLError):
|
|
152
|
+
dev_ecc_errors = pymxsml.mxSmlGetTotalEccErrors(dev_idx)
|
|
153
|
+
if dev_ecc_errors.dramUE > 0:
|
|
154
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
148
155
|
|
|
149
156
|
dev_temp = (
|
|
150
157
|
pymxsml.mxSmlGetTemperatureInfo(
|
|
@@ -201,6 +208,7 @@ class MetaXDetector(Detector):
|
|
|
201
208
|
memory=dev_mem,
|
|
202
209
|
memory_used=dev_mem_used,
|
|
203
210
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
211
|
+
memory_status=dev_mem_status,
|
|
204
212
|
temperature=dev_temp,
|
|
205
213
|
power=dev_power,
|
|
206
214
|
power_used=dev_power_used,
|
|
@@ -7,6 +7,7 @@ import pymtml
|
|
|
7
7
|
|
|
8
8
|
from .. import envs
|
|
9
9
|
from ..logging import debug_log_exception, debug_log_warning
|
|
10
|
+
from . import DeviceMemoryStatusEnum
|
|
10
11
|
from .__types__ import (
|
|
11
12
|
Detector,
|
|
12
13
|
Device,
|
|
@@ -140,6 +141,7 @@ class MThreadsDetector(Detector):
|
|
|
140
141
|
|
|
141
142
|
dev_mem = 0
|
|
142
143
|
dev_mem_used = 0
|
|
144
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
143
145
|
with pymtml.mtmlMemoryContext(dev) as devmem:
|
|
144
146
|
dev_mem = byte_to_mebibyte( # byte to MiB
|
|
145
147
|
pymtml.mtmlMemoryGetTotal(devmem),
|
|
@@ -147,6 +149,14 @@ class MThreadsDetector(Detector):
|
|
|
147
149
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
148
150
|
pymtml.mtmlMemoryGetUsed(devmem),
|
|
149
151
|
)
|
|
152
|
+
dev_mem_ecc_errors = pymtml.mtmlMemoryGetEccErrorCounter(
|
|
153
|
+
devmem,
|
|
154
|
+
pymtml.MTML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
|
155
|
+
pymtml.MTML_VOLATILE_ECC,
|
|
156
|
+
pymtml.MTML_MEMORY_LOCATION_DRAM,
|
|
157
|
+
)
|
|
158
|
+
if dev_mem_ecc_errors > 0:
|
|
159
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
150
160
|
|
|
151
161
|
dev_cores_util = None
|
|
152
162
|
dev_temp = None
|
|
@@ -192,6 +202,7 @@ class MThreadsDetector(Detector):
|
|
|
192
202
|
memory=dev_mem,
|
|
193
203
|
memory_used=dev_mem_used,
|
|
194
204
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
205
|
+
memory_status=dev_mem_status,
|
|
195
206
|
temperature=dev_temp,
|
|
196
207
|
power_used=dev_power_used,
|
|
197
208
|
appendix=dev_appendix,
|
|
@@ -3,17 +3,17 @@ from __future__ import annotations as __future_annotations__
|
|
|
3
3
|
import contextlib
|
|
4
4
|
import logging
|
|
5
5
|
import math
|
|
6
|
+
import re
|
|
6
7
|
import time
|
|
7
8
|
from _ctypes import byref
|
|
8
9
|
from functools import lru_cache
|
|
9
10
|
from pathlib import Path
|
|
10
|
-
from typing import re
|
|
11
11
|
|
|
12
12
|
import pynvml
|
|
13
13
|
|
|
14
14
|
from .. import envs
|
|
15
15
|
from ..logging import debug_log_exception, debug_log_warning
|
|
16
|
-
from . import Topology, pycuda
|
|
16
|
+
from . import DeviceMemoryStatusEnum, Topology, pycuda
|
|
17
17
|
from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
|
|
18
18
|
from .__utils__ import (
|
|
19
19
|
PCIDevice,
|
|
@@ -78,7 +78,7 @@ class NVIDIADetector(Detector):
|
|
|
78
78
|
def __init__(self):
|
|
79
79
|
super().__init__(ManufacturerEnum.NVIDIA)
|
|
80
80
|
|
|
81
|
-
def detect(self) -> Devices | None:
|
|
81
|
+
def detect(self) -> Devices | None: # noqa: PLR0915
|
|
82
82
|
"""
|
|
83
83
|
Detect NVIDIA GPUs using pynvml.
|
|
84
84
|
|
|
@@ -141,6 +141,22 @@ class NVIDIADetector(Detector):
|
|
|
141
141
|
)
|
|
142
142
|
dev_numa = bitmask_to_str(list(dev_node_affinity))
|
|
143
143
|
|
|
144
|
+
dev_temp = None
|
|
145
|
+
with contextlib.suppress(pynvml.NVMLError):
|
|
146
|
+
dev_temp = pynvml.nvmlDeviceGetTemperature(
|
|
147
|
+
dev,
|
|
148
|
+
pynvml.NVML_TEMPERATURE_GPU,
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
dev_power = None
|
|
152
|
+
dev_power_used = None
|
|
153
|
+
with contextlib.suppress(pynvml.NVMLError):
|
|
154
|
+
dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
|
|
155
|
+
dev_power = dev_power // 1000 # mW to W
|
|
156
|
+
dev_power_used = (
|
|
157
|
+
pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
|
|
158
|
+
) # mW to W
|
|
159
|
+
|
|
144
160
|
dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
|
|
145
161
|
with contextlib.suppress(pynvml.NVMLError):
|
|
146
162
|
dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
|
|
@@ -180,6 +196,7 @@ class NVIDIADetector(Detector):
|
|
|
180
196
|
|
|
181
197
|
dev_mem = 0
|
|
182
198
|
dev_mem_used = 0
|
|
199
|
+
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
183
200
|
with contextlib.suppress(pynvml.NVMLError):
|
|
184
201
|
dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
|
|
185
202
|
dev_mem = byte_to_mebibyte( # byte to MiB
|
|
@@ -188,24 +205,16 @@ class NVIDIADetector(Detector):
|
|
|
188
205
|
dev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
189
206
|
dev_mem_info.used,
|
|
190
207
|
)
|
|
191
|
-
|
|
192
|
-
dev_mem, dev_mem_used = get_memory()
|
|
193
|
-
|
|
194
|
-
dev_temp = None
|
|
195
|
-
with contextlib.suppress(pynvml.NVMLError):
|
|
196
|
-
dev_temp = pynvml.nvmlDeviceGetTemperature(
|
|
208
|
+
dev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
|
|
197
209
|
dev,
|
|
198
|
-
pynvml.
|
|
210
|
+
pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
|
211
|
+
pynvml.NVML_VOLATILE_ECC,
|
|
212
|
+
pynvml.NVML_MEMORY_LOCATION_DRAM,
|
|
199
213
|
)
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
|
|
205
|
-
dev_power = dev_power // 1000 # mW to W
|
|
206
|
-
dev_power_used = (
|
|
207
|
-
pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
|
|
208
|
-
) # mW to W
|
|
214
|
+
if dev_mem_ecc_errors > 0:
|
|
215
|
+
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
216
|
+
if dev_mem == 0:
|
|
217
|
+
dev_mem, dev_mem_used = get_memory()
|
|
209
218
|
|
|
210
219
|
dev_is_vgpu = False
|
|
211
220
|
if dev_bdf in pci_devs:
|
|
@@ -236,6 +245,7 @@ class NVIDIADetector(Detector):
|
|
|
236
245
|
memory=dev_mem,
|
|
237
246
|
memory_used=dev_mem_used,
|
|
238
247
|
memory_utilization=get_utilization(dev_mem_used, dev_mem),
|
|
248
|
+
memory_status=dev_mem_status,
|
|
239
249
|
temperature=dev_temp,
|
|
240
250
|
power=dev_power,
|
|
241
251
|
power_used=dev_power_used,
|
|
@@ -254,12 +264,18 @@ class NVIDIADetector(Detector):
|
|
|
254
264
|
mdev_cores = None
|
|
255
265
|
mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
|
|
256
266
|
for mdev_idx in range(mdev_count):
|
|
257
|
-
mdev =
|
|
267
|
+
mdev = None
|
|
268
|
+
with contextlib.suppress(pynvml.NVMLError):
|
|
269
|
+
mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
|
|
270
|
+
if not mdev:
|
|
271
|
+
continue
|
|
258
272
|
|
|
259
|
-
mdev_index = mdev_idx
|
|
273
|
+
mdev_index = mdev_idx + dev_count * (dev_idx + 1)
|
|
260
274
|
mdev_uuid = pynvml.nvmlDeviceGetUUID(mdev)
|
|
261
275
|
|
|
262
|
-
mdev_mem
|
|
276
|
+
mdev_mem = 0
|
|
277
|
+
mdev_mem_used = 0
|
|
278
|
+
mdev_mem_status = DeviceMemoryStatusEnum.HEALTHY
|
|
263
279
|
with contextlib.suppress(pynvml.NVMLError):
|
|
264
280
|
mdev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(mdev)
|
|
265
281
|
mdev_mem = byte_to_mebibyte( # byte to MiB
|
|
@@ -268,21 +284,14 @@ class NVIDIADetector(Detector):
|
|
|
268
284
|
mdev_mem_used = byte_to_mebibyte( # byte to MiB
|
|
269
285
|
mdev_mem_info.used,
|
|
270
286
|
)
|
|
271
|
-
|
|
272
|
-
mdev_temp = pynvml.nvmlDeviceGetTemperature(
|
|
273
|
-
mdev,
|
|
274
|
-
pynvml.NVML_TEMPERATURE_GPU,
|
|
275
|
-
)
|
|
276
|
-
|
|
277
|
-
mdev_power = None
|
|
278
|
-
with contextlib.suppress(pynvml.NVMLError):
|
|
279
|
-
mdev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(
|
|
287
|
+
mdev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
|
|
280
288
|
mdev,
|
|
289
|
+
pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
|
|
290
|
+
pynvml.NVML_AGGREGATE_ECC,
|
|
291
|
+
pynvml.NVML_MEMORY_LOCATION_SRAM,
|
|
281
292
|
)
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
|
|
285
|
-
) # mW to W
|
|
293
|
+
if mdev_mem_ecc_errors > 0:
|
|
294
|
+
mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
|
|
286
295
|
|
|
287
296
|
mdev_appendix = {
|
|
288
297
|
"arch_family": _get_arch_family(dev_cc_t),
|
|
@@ -305,71 +314,70 @@ class NVIDIADetector(Detector):
|
|
|
305
314
|
|
|
306
315
|
mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
|
|
307
316
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
if dev_gi_prf.id != mdev_gi_info.profileId:
|
|
325
|
-
continue
|
|
326
|
-
except pynvml.NVMLError:
|
|
317
|
+
mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
|
|
318
|
+
mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
|
|
319
|
+
mdev_gi,
|
|
320
|
+
mdev_ci_id,
|
|
321
|
+
)
|
|
322
|
+
mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
|
|
323
|
+
mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
|
|
324
|
+
for dev_gi_prf_id in range(
|
|
325
|
+
pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
|
|
326
|
+
):
|
|
327
|
+
try:
|
|
328
|
+
dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
|
|
329
|
+
dev,
|
|
330
|
+
dev_gi_prf_id,
|
|
331
|
+
)
|
|
332
|
+
if dev_gi_prf.id != mdev_gi_info.profileId:
|
|
327
333
|
continue
|
|
334
|
+
except pynvml.NVMLError:
|
|
335
|
+
continue
|
|
328
336
|
|
|
329
|
-
|
|
330
|
-
|
|
337
|
+
for dev_ci_prf_id in range(
|
|
338
|
+
pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
|
|
339
|
+
):
|
|
340
|
+
for dev_cig_prf_id in range(
|
|
341
|
+
pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
|
|
331
342
|
):
|
|
332
|
-
|
|
333
|
-
pynvml.
|
|
334
|
-
|
|
335
|
-
try:
|
|
336
|
-
mdev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
|
|
337
|
-
mdev_gi,
|
|
338
|
-
dev_ci_prf_id,
|
|
339
|
-
dev_cig_prf_id,
|
|
340
|
-
)
|
|
341
|
-
if mdev_ci_prf.id != mdev_ci_info.profileId:
|
|
342
|
-
continue
|
|
343
|
-
except pynvml.NVMLError:
|
|
344
|
-
continue
|
|
345
|
-
|
|
346
|
-
ci_slice = _get_compute_instance_slice(
|
|
343
|
+
try:
|
|
344
|
+
dev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
|
|
345
|
+
mdev_gi,
|
|
347
346
|
dev_ci_prf_id,
|
|
347
|
+
dev_cig_prf_id,
|
|
348
348
|
)
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
)
|
|
354
|
-
gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
|
|
355
|
-
gi_neg_attrs = _get_gpu_instance_negattrs(
|
|
356
|
-
dev_gi_prf_id,
|
|
357
|
-
)
|
|
349
|
+
if dev_ci_prf.id != mdev_ci_info.profileId:
|
|
350
|
+
continue
|
|
351
|
+
except pynvml.NVMLError:
|
|
352
|
+
continue
|
|
358
353
|
|
|
359
|
-
|
|
360
|
-
|
|
354
|
+
ci_slice = _get_compute_instance_slice(dev_ci_prf_id)
|
|
355
|
+
gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
|
|
356
|
+
if ci_slice == gi_slice:
|
|
357
|
+
if hasattr(dev_gi_prf, "name"):
|
|
358
|
+
mdev_name = dev_gi_prf.name
|
|
361
359
|
else:
|
|
362
|
-
|
|
363
|
-
|
|
360
|
+
gi_mem = round(
|
|
361
|
+
math.ceil(dev_gi_prf.memorySizeMB >> 10),
|
|
364
362
|
)
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
363
|
+
mdev_name = f"{gi_slice}g.{gi_mem}gb"
|
|
364
|
+
elif hasattr(dev_ci_prf, "name"):
|
|
365
|
+
mdev_name = dev_ci_prf.name
|
|
366
|
+
else:
|
|
367
|
+
gi_mem = round(
|
|
368
|
+
math.ceil(dev_gi_prf.memorySizeMB >> 10),
|
|
369
|
+
)
|
|
370
|
+
mdev_name = f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
|
|
371
|
+
gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
|
|
372
|
+
if gi_attrs:
|
|
373
|
+
mdev_name += f"+{gi_attrs}"
|
|
374
|
+
gi_neg_attrs = _get_gpu_instance_negattrs(dev_gi_prf_id)
|
|
375
|
+
if gi_neg_attrs:
|
|
376
|
+
mdev_name += f"-{gi_neg_attrs}"
|
|
369
377
|
|
|
370
|
-
|
|
378
|
+
mdev_cores = dev_ci_prf.multiprocessorCount
|
|
371
379
|
|
|
372
|
-
|
|
380
|
+
break
|
|
373
381
|
|
|
374
382
|
ret.append(
|
|
375
383
|
Device(
|
|
@@ -386,9 +394,10 @@ class NVIDIADetector(Detector):
|
|
|
386
394
|
memory=mdev_mem,
|
|
387
395
|
memory_used=mdev_mem_used,
|
|
388
396
|
memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
397
|
+
memory_status=mdev_mem_status,
|
|
398
|
+
temperature=dev_temp,
|
|
399
|
+
power=dev_power,
|
|
400
|
+
power_used=dev_power_used,
|
|
392
401
|
appendix=mdev_appendix,
|
|
393
402
|
),
|
|
394
403
|
)
|
|
@@ -426,11 +435,17 @@ class NVIDIADetector(Detector):
|
|
|
426
435
|
devices_count=len(devices),
|
|
427
436
|
)
|
|
428
437
|
|
|
438
|
+
get_links_cache = {}
|
|
439
|
+
|
|
429
440
|
try:
|
|
430
441
|
pynvml.nvmlInit()
|
|
431
442
|
|
|
432
443
|
for i, dev_i in enumerate(devices):
|
|
433
|
-
|
|
444
|
+
dev_i_bdf = dev_i.appendix.get("bdf")
|
|
445
|
+
if dev_i.appendix.get("vgpu", False):
|
|
446
|
+
dev_i_handle = pynvml.nvmlDeviceGetHandleByPciBusId(dev_i_bdf)
|
|
447
|
+
else:
|
|
448
|
+
dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
|
|
434
449
|
|
|
435
450
|
# Get NUMA and CPU affinities.
|
|
436
451
|
ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
|
|
@@ -439,7 +454,12 @@ class NVIDIADetector(Detector):
|
|
|
439
454
|
)
|
|
440
455
|
|
|
441
456
|
# Get links state if applicable.
|
|
442
|
-
if
|
|
457
|
+
if dev_i_bdf in get_links_cache:
|
|
458
|
+
dev_i_links_state = get_links_cache[dev_i_bdf]
|
|
459
|
+
else:
|
|
460
|
+
dev_i_links_state = _get_links_state(dev_i_handle)
|
|
461
|
+
get_links_cache[dev_i_bdf] = dev_i_links_state
|
|
462
|
+
if dev_i_links_state:
|
|
443
463
|
ret.appendices[i].update(dev_i_links_state)
|
|
444
464
|
# In practice, if a card has an active *Link,
|
|
445
465
|
# then other cards in the same machine should be interconnected with it through the *Link.
|
|
@@ -456,21 +476,30 @@ class NVIDIADetector(Detector):
|
|
|
456
476
|
if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
|
|
457
477
|
continue
|
|
458
478
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
479
|
+
dev_j_bdf = dev_j.appendix.get("bdf")
|
|
480
|
+
if dev_i_bdf == dev_j_bdf:
|
|
481
|
+
distance = TopologyDistanceEnum.SELF
|
|
482
|
+
else:
|
|
483
|
+
if dev_j.appendix.get("vgpu", False):
|
|
484
|
+
dev_j_handle = pynvml.nvmlDeviceGetHandleByPciBusId(
|
|
485
|
+
dev_j_bdf,
|
|
486
|
+
)
|
|
487
|
+
else:
|
|
488
|
+
dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
|
|
489
|
+
|
|
490
|
+
distance = TopologyDistanceEnum.UNK
|
|
491
|
+
try:
|
|
492
|
+
distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
|
|
493
|
+
dev_i_handle,
|
|
494
|
+
dev_j_handle,
|
|
495
|
+
)
|
|
496
|
+
except pynvml.NVMLError:
|
|
497
|
+
debug_log_exception(
|
|
498
|
+
logger,
|
|
499
|
+
"Failed to get distance between device %d and %d",
|
|
500
|
+
dev_i.index,
|
|
501
|
+
dev_j.index,
|
|
502
|
+
)
|
|
474
503
|
|
|
475
504
|
ret.devices_distances[i][j] = distance
|
|
476
505
|
ret.devices_distances[j][i] = distance
|
|
@@ -767,30 +796,6 @@ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
|
|
|
767
796
|
raise AttributeError(msg)
|
|
768
797
|
|
|
769
798
|
|
|
770
|
-
def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
|
|
771
|
-
"""
|
|
772
|
-
Compute the memory size of a MIG compute instance in GiB.
|
|
773
|
-
|
|
774
|
-
Args:
|
|
775
|
-
dev_mem:
|
|
776
|
-
The total memory info of the parent GPU device.
|
|
777
|
-
dev_gi_prf:
|
|
778
|
-
The profile info of the GPU instance.
|
|
779
|
-
|
|
780
|
-
Returns:
|
|
781
|
-
The memory size in GiB.
|
|
782
|
-
|
|
783
|
-
"""
|
|
784
|
-
mem = dev_gi_prf.memorySizeMB * (1 << 20) # MiB to byte
|
|
785
|
-
|
|
786
|
-
gib = round(
|
|
787
|
-
math.ceil(mem / dev_mem.total * 8)
|
|
788
|
-
/ 8
|
|
789
|
-
* ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
|
|
790
|
-
)
|
|
791
|
-
return gib
|
|
792
|
-
|
|
793
|
-
|
|
794
799
|
def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
|
|
795
800
|
"""
|
|
796
801
|
Get the number of slice for a given Compute Instance Profile ID.
|