gpustack-runtime 0.1.41.post3__py3-none-any.whl → 0.1.42__py3-none-any.whl

Files changed (29)
  1. gpustack_runtime/_version.py +2 -2
  2. gpustack_runtime/_version_appendix.py +1 -1
  3. gpustack_runtime/cmds/detector.py +3 -1
  4. gpustack_runtime/deployer/__types__.py +314 -233
  5. gpustack_runtime/deployer/cdi/__utils__.py +4 -1
  6. gpustack_runtime/deployer/docker.py +109 -148
  7. gpustack_runtime/deployer/k8s/deviceplugin/__init__.py +1 -1
  8. gpustack_runtime/deployer/k8s/deviceplugin/plugin.py +20 -19
  9. gpustack_runtime/deployer/kuberentes.py +89 -108
  10. gpustack_runtime/deployer/podman.py +89 -122
  11. gpustack_runtime/detector/__init__.py +2 -0
  12. gpustack_runtime/detector/__types__.py +26 -0
  13. gpustack_runtime/detector/amd.py +28 -8
  14. gpustack_runtime/detector/ascend.py +49 -4
  15. gpustack_runtime/detector/cambricon.py +3 -0
  16. gpustack_runtime/detector/hygon.py +16 -1
  17. gpustack_runtime/detector/iluvatar.py +6 -0
  18. gpustack_runtime/detector/metax.py +8 -0
  19. gpustack_runtime/detector/mthreads.py +11 -0
  20. gpustack_runtime/detector/nvidia.py +139 -134
  21. gpustack_runtime/detector/pyixml/__init__.py +16 -0
  22. gpustack_runtime/detector/pyrocmsmi/__init__.py +14 -0
  23. gpustack_runtime/detector/thead.py +135 -127
  24. gpustack_runtime/envs.py +7 -6
  25. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/METADATA +2 -2
  26. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/RECORD +29 -29
  27. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/WHEEL +0 -0
  28. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/entry_points.txt +0 -0
  29. {gpustack_runtime-0.1.41.post3.dist-info → gpustack_runtime-0.1.42.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/ascend.py

@@ -10,6 +10,7 @@ from . import pyacl, pydcmi
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,
@@ -128,7 +129,9 @@ class AscendDetector(Detector):
                 dev_is_vgpu = True
                 dev_cores_aicore = dev_virt_info.query_info.computing.aic
                 dev_name = dev_virt_info.query_info.name
-                dev_mem, dev_mem_used = 0, 0
+                dev_mem = 0
+                dev_mem_used = 0
+                dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
                 if hasattr(dev_virt_info.query_info.computing, "memory_size"):
                     dev_mem = dev_virt_info.query_info.computing.memory_size
                 dev_index = dev_virt_info.vdev_id
@@ -143,6 +146,10 @@ class AscendDetector(Detector):
                     dev_card_id,
                     dev_device_id,
                 )
+                dev_mem_status = _get_device_memory_status(
+                    dev_card_id,
+                    dev_device_id,
+                )
                 dev_index = pydcmi.dcmi_get_device_logic_id(
                     dev_card_id,
                     dev_device_id,
@@ -239,6 +246,7 @@ class AscendDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power_used=dev_power_used,
                     appendix=dev_appendix,
@@ -332,6 +340,12 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
     """
     Get device memory information.
 
+    Args:
+        dev_card_id:
+            The card ID of the device.
+        dev_device_id:
+            The device ID of the device.
+
     Returns:
         A tuple containing total memory and used memory in MiB.
 
@@ -370,6 +384,37 @@ def _get_device_memory_info(dev_card_id, dev_device_id) -> tuple[int, int]:
     return dev_mem, dev_mem_used
 
 
+def _get_device_memory_status(dev_card_id, dev_device_id) -> DeviceMemoryStatusEnum:
+    """
+    Get device memory ECC status.
+
+    Args:
+        dev_card_id:
+            The card ID of the device.
+        dev_device_id:
+            The device ID of the device.
+
+    Returns:
+        DeviceMemoryStatusEnum indicating the ECC status.
+
+    """
+    for dev_mem_type in [pydcmi.DCMI_DEVICE_TYPE_HBM, pydcmi.DCMI_DEVICE_TYPE_DDR]:
+        with contextlib.suppress(pydcmi.DCMIError):
+            dev_ecc_info = pydcmi.dcmi_get_device_ecc_info(
+                dev_card_id,
+                dev_device_id,
+                dev_mem_type,
+            )
+            if dev_ecc_info.enable_flag and (
+                dev_ecc_info.single_bit_error_cnt > 0
+                or dev_ecc_info.double_bit_error_cnt > 0
+            ):
+                return DeviceMemoryStatusEnum.UNHEALTHY
+            return DeviceMemoryStatusEnum.HEALTHY
+
+    return DeviceMemoryStatusEnum.HEALTHY
+
+
 def _get_device_roce_network_info(
     dev_card_id,
     dev_device_id,
@@ -528,11 +573,11 @@ def get_ascend_cann_variant(name: str | None) -> str | None:
     if version < 220:
         return "310p"
     if version < 240:
-        return "910b"
+        return "910b"  # 910b/a2
     if version < 250:
         return "310b"
     if version < 260:
-        return "a3"  # 910c
+        return "a3"  # 910c/a3
     if version < 270:
-        return "a5"  # 910d
+        return "a5"  # 910d/a5
     return None
gpustack_runtime/detector/cambricon.py

@@ -6,6 +6,7 @@ from functools import lru_cache
 
 from .. import envs
 from ..logging import debug_log_exception
+from . import DeviceMemoryStatusEnum
 from .__types__ import Detector, Device, Devices, ManufacturerEnum
 from .__utils__ import (
     PCIDevice,
@@ -100,6 +101,7 @@ class CambriconDetector(Detector):
             dev_mem_usage_info = dev_info.get("PhysicalMemUsage", {})
             dev_mem = safe_int(dev_mem_usage_info.get("Total", 0))
             dev_mem_used = safe_int(dev_mem_usage_info.get("Used", 0))
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
 
             dev_temp_info = dev_info.get("Temperature", {})
             dev_temp = safe_float(dev_temp_info.get("Chip", 0))
@@ -118,6 +120,7 @@ class CambriconDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     appendix=dev_appendix,
                 ),
gpustack_runtime/detector/hygon.py

@@ -8,7 +8,14 @@ from pathlib import Path
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
 from . import Topology, pyamdgpu, pyhsa, pyrocmcore, pyrocmsmi
-from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
+from .__types__ import (
+    Detector,
+    Device,
+    DeviceMemoryStatusEnum,
+    Devices,
+    ManufacturerEnum,
+    TopologyDistanceEnum,
+)
 from .__utils__ import (
     PCIDevice,
     byte_to_mebibyte,
@@ -149,6 +156,13 @@ class HygonDetector(Detector):
             dev_mem_used = byte_to_mebibyte(  # byte to MiB
                 pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
             )
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
+            with contextlib.suppress(pyrocmsmi.ROCMSMIError):
+                dev_ecc_count = pyrocmsmi.rsmi_dev_ecc_count_get(
+                    dev_idx,
+                )
+                if dev_ecc_count.uncorrectable_err > 0:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
             dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
             dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
@@ -184,6 +198,7 @@ class HygonDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power=dev_power,
                     power_used=dev_power_used,
gpustack_runtime/detector/iluvatar.py

@@ -10,6 +10,7 @@ from . import pyixml
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,
@@ -135,6 +136,7 @@ class IluvatarDetector(Detector):
 
             dev_mem = 0
             dev_mem_used = 0
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
             with contextlib.suppress(pyixml.NVMLError):
                 dev_mem_info = pyixml.nvmlDeviceGetMemoryInfo(dev)
                 dev_mem = byte_to_mebibyte(  # byte to MiB
@@ -143,6 +145,9 @@ class IluvatarDetector(Detector):
                 dev_mem_used = byte_to_mebibyte(  # byte to MiB
                     dev_mem_info.used,
                 )
+                dev_health = pyixml.ixmlDeviceGetHealth(dev)
+                if dev_health != pyixml.IXML_HEALTH_OK:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
             dev_cores_util = None
             with contextlib.suppress(pyixml.NVMLError):
@@ -213,6 +218,7 @@ class IluvatarDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power=dev_power,
                     power_used=dev_power_used,
gpustack_runtime/detector/metax.py

@@ -1,5 +1,6 @@
 from __future__ import annotations as __future_annotations__
 
+import contextlib
 import logging
 from functools import lru_cache
 from pathlib import Path
@@ -10,6 +11,7 @@ from . import pymxsml
 from .__types__ import (
     Detector,
     Device,
+    DeviceMemoryStatusEnum,
     Devices,
     ManufacturerEnum,
     Topology,
@@ -145,6 +147,11 @@ class MetaXDetector(Detector):
             dev_mem_used = kibibyte_to_mebibyte(  # KiB to MiB
                 dev_mem_info.vramUse,
             )
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
+            with contextlib.suppress(pymxsml.MXSMLError):
+                dev_ecc_errors = pymxsml.mxSmlGetTotalEccErrors(dev_idx)
+                if dev_ecc_errors.dramUE > 0:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
             dev_temp = (
                 pymxsml.mxSmlGetTemperatureInfo(
@@ -201,6 +208,7 @@ class MetaXDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power=dev_power,
                     power_used=dev_power_used,
gpustack_runtime/detector/mthreads.py

@@ -7,6 +7,7 @@ import pymtml
 
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
+from . import DeviceMemoryStatusEnum
 from .__types__ import (
     Detector,
     Device,
@@ -140,6 +141,7 @@ class MThreadsDetector(Detector):
 
             dev_mem = 0
             dev_mem_used = 0
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
             with pymtml.mtmlMemoryContext(dev) as devmem:
                 dev_mem = byte_to_mebibyte(  # byte to MiB
                     pymtml.mtmlMemoryGetTotal(devmem),
@@ -147,6 +149,14 @@ class MThreadsDetector(Detector):
                 dev_mem_used = byte_to_mebibyte(  # byte to MiB
                     pymtml.mtmlMemoryGetUsed(devmem),
                 )
+                dev_mem_ecc_errors = pymtml.mtmlMemoryGetEccErrorCounter(
+                    devmem,
+                    pymtml.MTML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                    pymtml.MTML_VOLATILE_ECC,
+                    pymtml.MTML_MEMORY_LOCATION_DRAM,
+                )
+                if dev_mem_ecc_errors > 0:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
             dev_cores_util = None
             dev_temp = None
@@ -192,6 +202,7 @@ class MThreadsDetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power_used=dev_power_used,
                     appendix=dev_appendix,
gpustack_runtime/detector/nvidia.py

@@ -3,17 +3,17 @@ from __future__ import annotations as __future_annotations__
 import contextlib
 import logging
 import math
+import re
 import time
 from _ctypes import byref
 from functools import lru_cache
 from pathlib import Path
-from typing import re
 
 import pynvml
 
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
-from . import Topology, pycuda
+from . import DeviceMemoryStatusEnum, Topology, pycuda
 from .__types__ import Detector, Device, Devices, ManufacturerEnum, TopologyDistanceEnum
 from .__utils__ import (
     PCIDevice,
@@ -78,7 +78,7 @@ class NVIDIADetector(Detector):
     def __init__(self):
         super().__init__(ManufacturerEnum.NVIDIA)
 
-    def detect(self) -> Devices | None:
+    def detect(self) -> Devices | None:  # noqa: PLR0915
         """
         Detect NVIDIA GPUs using pynvml.
 
@@ -141,6 +141,22 @@ class NVIDIADetector(Detector):
             )
             dev_numa = bitmask_to_str(list(dev_node_affinity))
 
+            dev_temp = None
+            with contextlib.suppress(pynvml.NVMLError):
+                dev_temp = pynvml.nvmlDeviceGetTemperature(
+                    dev,
+                    pynvml.NVML_TEMPERATURE_GPU,
+                )
+
+            dev_power = None
+            dev_power_used = None
+            with contextlib.suppress(pynvml.NVMLError):
+                dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
+                dev_power = dev_power // 1000  # mW to W
+                dev_power_used = (
+                    pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
+                )  # mW to W
+
             dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
             with contextlib.suppress(pynvml.NVMLError):
                 dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
@@ -180,6 +196,7 @@ class NVIDIADetector(Detector):
 
             dev_mem = 0
             dev_mem_used = 0
+            dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
             with contextlib.suppress(pynvml.NVMLError):
                 dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
                 dev_mem = byte_to_mebibyte(  # byte to MiB
@@ -188,24 +205,16 @@ class NVIDIADetector(Detector):
                 dev_mem_used = byte_to_mebibyte(  # byte to MiB
                     dev_mem_info.used,
                 )
-            if dev_mem == 0:
-                dev_mem, dev_mem_used = get_memory()
-
-            dev_temp = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_temp = pynvml.nvmlDeviceGetTemperature(
+                dev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
                     dev,
-                    pynvml.NVML_TEMPERATURE_GPU,
+                    pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                    pynvml.NVML_VOLATILE_ECC,
+                    pynvml.NVML_MEMORY_LOCATION_DRAM,
                 )
-
-            dev_power = None
-            dev_power_used = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
-                dev_power = dev_power // 1000  # mW to W
-                dev_power_used = (
-                    pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
-                )  # mW to W
+                if dev_mem_ecc_errors > 0:
+                    dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
+            if dev_mem == 0:
+                dev_mem, dev_mem_used = get_memory()
 
             dev_is_vgpu = False
             if dev_bdf in pci_devs:
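The hunk above drops the temperature/power reads from this spot (they now run earlier, see the `@@ -141,6 +141,22 @@` hunk) and instead queries the volatile uncorrected DRAM ECC counter to feed the new `memory_status` field. A minimal standalone sketch of that check, assuming pynvml is installed and using a plain string in place of the library's `DeviceMemoryStatusEnum`:

```python
# Hedged sketch of the ECC-based health check added in 0.1.42: a non-zero
# volatile uncorrected DRAM error count is treated as "unhealthy". Device
# index 0 and the string status are illustrative; GPUs without ECC raise
# NVMLError, which is suppressed and left as "healthy", mirroring the
# contextlib.suppress() usage in the diff.
import contextlib

import pynvml

pynvml.nvmlInit()
try:
    dev = pynvml.nvmlDeviceGetHandleByIndex(0)
    memory_status = "healthy"
    with contextlib.suppress(pynvml.NVMLError):
        ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
            dev,
            pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
            pynvml.NVML_VOLATILE_ECC,
            pynvml.NVML_MEMORY_LOCATION_DRAM,
        )
        if ecc_errors > 0:
            memory_status = "unhealthy"
    print(memory_status)
finally:
    pynvml.nvmlShutdown()
```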
@@ -236,6 +245,7 @@ class NVIDIADetector(Detector):
                     memory=dev_mem,
                     memory_used=dev_mem_used,
                     memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                    memory_status=dev_mem_status,
                     temperature=dev_temp,
                     power=dev_power,
                     power_used=dev_power_used,
@@ -254,12 +264,18 @@ class NVIDIADetector(Detector):
             mdev_cores = None
             mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
             for mdev_idx in range(mdev_count):
-                mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+                mdev = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+                if not mdev:
+                    continue
 
-                mdev_index = mdev_idx
+                mdev_index = mdev_idx + dev_count * (dev_idx + 1)
                 mdev_uuid = pynvml.nvmlDeviceGetUUID(mdev)
 
-                mdev_mem, mdev_mem_used = 0, 0
+                mdev_mem = 0
+                mdev_mem_used = 0
+                mdev_mem_status = DeviceMemoryStatusEnum.HEALTHY
                 with contextlib.suppress(pynvml.NVMLError):
                     mdev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(mdev)
                     mdev_mem = byte_to_mebibyte(  # byte to MiB
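The revised `mdev_index` formula above shifts MIG children out of the 0-based range used by the physical GPUs (the old code reused the raw `mdev_idx`). A quick check of the arithmetic, assuming two physical GPUs (`dev_count = 2`) with two MIG devices each:

```python
# Hypothetical layout: dev_count physical GPUs indexed 0..dev_count-1,
# two MIG children per GPU. The new formula places every MIG index above
# the physical indices and groups it under its parent GPU.
dev_count = 2
for dev_idx in range(dev_count):
    for mdev_idx in range(2):
        print(dev_idx, mdev_idx, mdev_idx + dev_count * (dev_idx + 1))
# -> (0, 0, 2), (0, 1, 3), (1, 0, 4), (1, 1, 5)
```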
@@ -268,21 +284,14 @@ class NVIDIADetector(Detector):
                     mdev_mem_used = byte_to_mebibyte(  # byte to MiB
                         mdev_mem_info.used,
                     )
-
-                mdev_temp = pynvml.nvmlDeviceGetTemperature(
-                    mdev,
-                    pynvml.NVML_TEMPERATURE_GPU,
-                )
-
-                mdev_power = None
-                with contextlib.suppress(pynvml.NVMLError):
-                    mdev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(
+                    mdev_mem_ecc_errors = pynvml.nvmlDeviceGetMemoryErrorCounter(
                         mdev,
+                        pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
+                        pynvml.NVML_AGGREGATE_ECC,
+                        pynvml.NVML_MEMORY_LOCATION_SRAM,
                     )
-                    mdev_power = mdev_power // 1000  # mW to W
-                    mdev_power_used = (
-                        pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
-                    )  # mW to W
+                    if mdev_mem_ecc_errors > 0:
+                        mdev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
 
                 mdev_appendix = {
                     "arch_family": _get_arch_family(dev_cc_t),
@@ -305,71 +314,70 @@ class NVIDIADetector(Detector):
 
                 mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
 
-                if not mdev_name:
-                    mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
-                    mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
-                        mdev_gi,
-                        mdev_ci_id,
-                    )
-                    mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
-                    mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
-                    for dev_gi_prf_id in range(
-                        pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
-                    ):
-                        try:
-                            dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
-                                dev,
-                                dev_gi_prf_id,
-                            )
-                            if dev_gi_prf.id != mdev_gi_info.profileId:
-                                continue
-                        except pynvml.NVMLError:
+                mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
+                mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
+                    mdev_gi,
+                    mdev_ci_id,
+                )
+                mdev_gi_info = pynvml.nvmlGpuInstanceGetInfo(mdev_gi)
+                mdev_ci_info = pynvml.nvmlComputeInstanceGetInfo(mdev_ci)
+                for dev_gi_prf_id in range(
+                    pynvml.NVML_GPU_INSTANCE_PROFILE_COUNT,
+                ):
+                    try:
+                        dev_gi_prf = pynvml.nvmlDeviceGetGpuInstanceProfileInfo(
+                            dev,
+                            dev_gi_prf_id,
+                        )
+                        if dev_gi_prf.id != mdev_gi_info.profileId:
                             continue
+                    except pynvml.NVMLError:
+                        continue
 
-                        for dev_ci_prf_id in range(
-                            pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
+                    for dev_ci_prf_id in range(
+                        pynvml.NVML_COMPUTE_INSTANCE_PROFILE_COUNT,
+                    ):
+                        for dev_cig_prf_id in range(
+                            pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
                         ):
-                            for dev_cig_prf_id in range(
-                                pynvml.NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
-                            ):
-                                try:
-                                    mdev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
-                                        mdev_gi,
-                                        dev_ci_prf_id,
-                                        dev_cig_prf_id,
-                                    )
-                                    if mdev_ci_prf.id != mdev_ci_info.profileId:
-                                        continue
-                                except pynvml.NVMLError:
-                                    continue
-
-                                ci_slice = _get_compute_instance_slice(
+                            try:
+                                dev_ci_prf = pynvml.nvmlGpuInstanceGetComputeInstanceProfileInfo(
+                                    mdev_gi,
                                     dev_ci_prf_id,
+                                    dev_cig_prf_id,
                                 )
-                                gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
-                                gi_mem = _get_gpu_instance_memory(
-                                    dev_mem_info,
-                                    dev_gi_prf,
-                                )
-                                gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
-                                gi_neg_attrs = _get_gpu_instance_negattrs(
-                                    dev_gi_prf_id,
-                                )
+                                if dev_ci_prf.id != mdev_ci_info.profileId:
+                                    continue
+                            except pynvml.NVMLError:
+                                continue
 
-                                if ci_slice == gi_slice:
-                                    mdev_name = f"{gi_slice}g.{gi_mem}gb"
+                            ci_slice = _get_compute_instance_slice(dev_ci_prf_id)
+                            gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+                            if ci_slice == gi_slice:
+                                if hasattr(dev_gi_prf, "name"):
+                                    mdev_name = dev_gi_prf.name
                                 else:
-                                    mdev_name = (
-                                        f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
+                                    gi_mem = round(
+                                        math.ceil(dev_gi_prf.memorySizeMB >> 10),
                                     )
-                                    if gi_attrs:
-                                        mdev_name += f"+{gi_attrs}"
-                                    if gi_neg_attrs:
-                                        mdev_name += f"-{gi_neg_attrs}"
+                                    mdev_name = f"{gi_slice}g.{gi_mem}gb"
+                            elif hasattr(dev_ci_prf, "name"):
+                                mdev_name = dev_ci_prf.name
+                            else:
+                                gi_mem = round(
+                                    math.ceil(dev_gi_prf.memorySizeMB >> 10),
+                                )
+                                mdev_name = f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
+                            gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
+                            if gi_attrs:
+                                mdev_name += f"+{gi_attrs}"
+                            gi_neg_attrs = _get_gpu_instance_negattrs(dev_gi_prf_id)
+                            if gi_neg_attrs:
+                                mdev_name += f"-{gi_neg_attrs}"
 
-                                mdev_cores = mdev_ci_prf.multiprocessorCount
+                            mdev_cores = dev_ci_prf.multiprocessorCount
 
-                                break
+                            break
 
                 ret.append(
                     Device(
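The renaming logic above now prefers the profile name reported by the driver (`dev_gi_prf.name` / `dev_ci_prf.name`) and only composes a `<slices>g.<mem>gb` label when that attribute is missing, with the GiB figure derived directly from the profile's `memorySizeMB`. A worked example of that fallback, using hypothetical profile values:

```python
import math

# Hypothetical profile numbers, only to illustrate the fallback formula;
# drivers that expose a profile name never reach this branch.
memory_size_mb = 20480  # assumed GPU-instance profile memory, in MiB
gi_slice = 3            # assumed GPU-instance slice count
gi_mem = round(math.ceil(memory_size_mb >> 10))  # 20480 >> 10 == 20 (MiB -> GiB)
print(f"{gi_slice}g.{gi_mem}gb")  # -> 3g.20gb
```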
@@ -386,9 +394,10 @@ class NVIDIADetector(Detector):
                         memory=mdev_mem,
                         memory_used=mdev_mem_used,
                         memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
-                        temperature=mdev_temp,
-                        power=mdev_power,
-                        power_used=mdev_power_used,
+                        memory_status=mdev_mem_status,
+                        temperature=dev_temp,
+                        power=dev_power,
+                        power_used=dev_power_used,
                         appendix=mdev_appendix,
                     ),
                 )
@@ -426,11 +435,17 @@ class NVIDIADetector(Detector):
             devices_count=len(devices),
         )
 
+        get_links_cache = {}
+
        try:
            pynvml.nvmlInit()

            for i, dev_i in enumerate(devices):
-                dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
+                dev_i_bdf = dev_i.appendix.get("bdf")
+                if dev_i.appendix.get("vgpu", False):
+                    dev_i_handle = pynvml.nvmlDeviceGetHandleByPciBusId(dev_i_bdf)
+                else:
+                    dev_i_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_i.uuid)
 
                 # Get NUMA and CPU affinities.
                 ret.devices_numa_affinities[i] = dev_i.appendix.get("numa", "")
@@ -439,7 +454,12 @@ class NVIDIADetector(Detector):
                 )
 
                 # Get links state if applicable.
-                if dev_i_links_state := _get_links_state(dev_i_handle):
+                if dev_i_bdf in get_links_cache:
+                    dev_i_links_state = get_links_cache[dev_i_bdf]
+                else:
+                    dev_i_links_state = _get_links_state(dev_i_handle)
+                    get_links_cache[dev_i_bdf] = dev_i_links_state
+                if dev_i_links_state:
                     ret.appendices[i].update(dev_i_links_state)
                     # In practice, if a card has an active *Link,
                     # then other cards in the same machine should be interconnected with it through the *Link.
@@ -456,21 +476,30 @@ class NVIDIADetector(Detector):
                     if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                         continue
 
-                    dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
-
-                    distance = TopologyDistanceEnum.UNK
-                    try:
-                        distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
-                            dev_i_handle,
-                            dev_j_handle,
-                        )
-                    except pynvml.NVMLError:
-                        debug_log_exception(
-                            logger,
-                            "Failed to get distance between device %d and %d",
-                            dev_i.index,
-                            dev_j.index,
-                        )
+                    dev_j_bdf = dev_j.appendix.get("bdf")
+                    if dev_i_bdf == dev_j_bdf:
+                        distance = TopologyDistanceEnum.SELF
+                    else:
+                        if dev_j.appendix.get("vgpu", False):
+                            dev_j_handle = pynvml.nvmlDeviceGetHandleByPciBusId(
+                                dev_j_bdf,
+                            )
+                        else:
+                            dev_j_handle = pynvml.nvmlDeviceGetHandleByUUID(dev_j.uuid)
+
+                        distance = TopologyDistanceEnum.UNK
+                        try:
+                            distance = pynvml.nvmlDeviceGetTopologyCommonAncestor(
+                                dev_i_handle,
+                                dev_j_handle,
+                            )
+                        except pynvml.NVMLError:
+                            debug_log_exception(
+                                logger,
+                                "Failed to get distance between device %d and %d",
+                                dev_i.index,
+                                dev_j.index,
+                            )
 
                     ret.devices_distances[i][j] = distance
                     ret.devices_distances[j][i] = distance
@@ -767,30 +796,6 @@ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
     raise AttributeError(msg)
 
 
-def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
-    """
-    Compute the memory size of a MIG compute instance in GiB.
-
-    Args:
-        dev_mem:
-            The total memory info of the parent GPU device.
-        dev_gi_prf:
-            The profile info of the GPU instance.
-
-    Returns:
-        The memory size in GiB.
-
-    """
-    mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte
-
-    gib = round(
-        math.ceil(mem / dev_mem.total * 8)
-        / 8
-        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
-    )
-    return gib
-
-
 def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
     """
     Get the number of slice for a given Compute Instance Profile ID.