gpustack-runtime 0.1.39.post2__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/__main__.py +7 -3
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/__init__.py +2 -0
- gpustack_runtime/cmds/deployer.py +84 -2
- gpustack_runtime/cmds/images.py +2 -0
- gpustack_runtime/deployer/__init__.py +2 -0
- gpustack_runtime/deployer/__types__.py +52 -28
- gpustack_runtime/deployer/__utils__.py +99 -112
- gpustack_runtime/deployer/cdi/__init__.py +81 -0
- gpustack_runtime/deployer/cdi/__types__.py +667 -0
- gpustack_runtime/deployer/cdi/thead.py +103 -0
- gpustack_runtime/deployer/docker.py +42 -24
- gpustack_runtime/deployer/kuberentes.py +8 -4
- gpustack_runtime/deployer/podman.py +41 -23
- gpustack_runtime/detector/__init__.py +62 -3
- gpustack_runtime/detector/__types__.py +11 -0
- gpustack_runtime/detector/__utils__.py +23 -0
- gpustack_runtime/detector/amd.py +17 -9
- gpustack_runtime/detector/hygon.py +6 -1
- gpustack_runtime/detector/iluvatar.py +20 -5
- gpustack_runtime/detector/mthreads.py +8 -12
- gpustack_runtime/detector/nvidia.py +365 -168
- gpustack_runtime/detector/pyacl/__init__.py +9 -1
- gpustack_runtime/detector/pyamdgpu/__init__.py +8 -0
- gpustack_runtime/detector/pycuda/__init__.py +9 -1
- gpustack_runtime/detector/pydcmi/__init__.py +9 -2
- gpustack_runtime/detector/pyhgml/__init__.py +5879 -0
- gpustack_runtime/detector/pyhgml/libhgml.so +0 -0
- gpustack_runtime/detector/pyhgml/libuki.so +0 -0
- gpustack_runtime/detector/pyhsa/__init__.py +9 -0
- gpustack_runtime/detector/pyixml/__init__.py +89 -164
- gpustack_runtime/detector/pyrocmcore/__init__.py +42 -24
- gpustack_runtime/detector/pyrocmsmi/__init__.py +141 -138
- gpustack_runtime/detector/thead.py +733 -0
- gpustack_runtime/envs.py +128 -55
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/METADATA +4 -2
- gpustack_runtime-0.1.40.dist-info/RECORD +55 -0
- gpustack_runtime/detector/pymtml/__init__.py +0 -770
- gpustack_runtime-0.1.39.post2.dist-info/RECORD +0 -49
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/nvidia.py:

@@ -2,9 +2,10 @@ from __future__ import annotations
 
 import contextlib
 import logging
+import math
+import time
 from _ctypes import byref
 from functools import lru_cache
-from math import ceil
 
 import pynvml
 
@@ -125,103 +126,104 @@ class NVIDIADetector(Detector):
         for dev_idx in range(dev_count):
             dev = pynvml.nvmlDeviceGetHandleByIndex(dev_idx)
 
-
-
-            if dev_files is None:
-                dev_files = get_device_files(pattern=r"nvidia(?P<number>\d+)")
-            if len(dev_files) >= dev_count:
-                dev_file = dev_files[dev_idx]
-                if dev_file.number is not None:
-                    dev_index = dev_file.number
-            dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
-
-            dev_cores = None
-            if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
-                with contextlib.suppress(pycuda.CUDAError):
-                    dev_gpudev = pycuda.cuDeviceGet(dev_idx)
-                    dev_cores = pycuda.cuDeviceGetAttribute(
-                        dev_gpudev,
-                        pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-                    )
+            dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
+            dev_cc = ".".join(map(str, dev_cc_t))
 
-            dev_mem = 0
-            dev_mem_used = 0
+            dev_bdf = None
             with contextlib.suppress(pynvml.NVMLError):
-                dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
-                dev_mem = byte_to_mebibyte(  # byte to MiB
-                    dev_mem_info.total,
-                )
-                dev_mem_used = byte_to_mebibyte(  # byte to MiB
-                    dev_mem_info.used,
-                )
-            if dev_mem == 0:
-                dev_mem, dev_mem_used = get_memory()
+                dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
+                dev_bdf = str(dev_pci_info.busIdLegacy).lower()
 
-            dev_cores_util = None
+            dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
             with contextlib.suppress(pynvml.NVMLError):
-                dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
-                dev_cores_util = dev_util_rates.gpu
-            if dev_cores_util is None:
-                debug_log_warning(
-                    logger,
-                    "Failed to get device %d cores utilization, setting to 0",
-                    dev_index,
-                )
-                dev_cores_util = 0
+                dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
 
-            dev_temp = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_temp = pynvml.nvmlDeviceGetTemperature(
-                    dev,
-                    pynvml.NVML_TEMPERATURE_GPU,
-                )
+            # With MIG disabled, treat as a single device.
 
-
-
-
-
-
-
-
-
+            if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
+                dev_index = dev_idx
+                if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+                    if dev_files is None:
+                        dev_files = get_device_files(
+                            pattern=r"nvidia(?P<number>\d+)",
+                        )
+                    if len(dev_files) >= dev_count:
+                        dev_file = dev_files[dev_idx]
+                        if dev_file.number is not None:
+                            dev_index = dev_file.number
 
-            dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
-            dev_cc = ".".join(map(str, dev_cc_t))
+                dev_name = pynvml.nvmlDeviceGetName(dev)
 
-
-
-
-            if
-
-
+                dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
+
+                dev_cores = None
+                if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
+                    with contextlib.suppress(pycuda.CUDAError):
+                        dev_gpudev = pycuda.cuDeviceGet(dev_idx)
+                        dev_cores = pycuda.cuDeviceGetAttribute(
+                            dev_gpudev,
+                            pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                        )
+
+                dev_cores_util = _get_sm_util_from_gpm_metrics(dev)
+                if dev_cores_util is None:
+                    with contextlib.suppress(pynvml.NVMLError):
+                        dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
+                        dev_cores_util = dev_util_rates.gpu
+                if dev_cores_util is None:
+                    debug_log_warning(
+                        logger,
+                        "Failed to get device %d cores utilization, setting to 0",
+                        dev_index,
+                    )
+                    dev_cores_util = 0
 
-
-
-
-
-
+                dev_mem = 0
+                dev_mem_used = 0
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
+                    dev_mem = byte_to_mebibyte(  # byte to MiB
+                        dev_mem_info.total,
+                    )
+                    dev_mem_used = byte_to_mebibyte(  # byte to MiB
+                        dev_mem_info.used,
+                    )
+                if dev_mem == 0:
+                    dev_mem, dev_mem_used = get_memory()
 
-
-
-
-
-
-                if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
-                    dev_fabric = None
-                if dev_fabric:
-                    dev_appendix["fabric_cluster_uuid"] = stringify_uuid(
-                        bytes(dev_fabric.clusterUuid),
+                dev_temp = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_temp = pynvml.nvmlDeviceGetTemperature(
+                        dev,
+                        pynvml.NVML_TEMPERATURE_GPU,
                     )
-                    dev_appendix["fabric_clique_id"] = dev_fabric.cliqueId
 
-
-
-
+                dev_power = None
+                dev_power_used = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
+                    dev_power = dev_power // 1000  # mW to W
+                    dev_power_used = (
+                        pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
+                    )  # mW to W
 
-
+                dev_is_vgpu = False
+                if dev_bdf and dev_bdf in pci_devs:
+                    dev_is_vgpu = _is_vgpu(pci_devs[dev_bdf].config)
+
+                dev_appendix = {
+                    "arch_family": _get_arch_family(dev_cc_t),
+                    "vgpu": dev_is_vgpu,
+                }
+                if dev_bdf:
+                    dev_appendix["bdf"] = dev_bdf
+
+                if dev_links_state := _get_links_state(dev):
+                    dev_appendix.update(dev_links_state)
+
+                if dev_fabric_info := _get_fabric_info(dev):
+                    dev_appendix.update(dev_fabric_info)
 
-            if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
-                dev_name = pynvml.nvmlDeviceGetName(dev)
                 ret.append(
                     Device(
                         manufacturer=self.manufacturer,
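The rewritten utilization path above prefers NVML GPM (GPU Performance Monitoring) sampling via the new `_get_sm_util_from_gpm_metrics` helper, falling back to the legacy `nvmlDeviceGetUtilizationRates` counter and finally to 0. GPM metrics are rate-based, so they always need two samples taken across an interval. A minimal standalone sketch of that double-sampling pattern, assuming a GPM-capable GPU (typically Hopper or newer) at the hypothetical device index 0:

import time

import pynvml

pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # hypothetical device index
    if pynvml.nvmlGpmQueryDeviceSupport(handle).isSupportedDevice:
        metrics = pynvml.c_nvmlGpmMetricsGet_t()
        metrics.sample1 = pynvml.nvmlGpmSampleAlloc()
        metrics.sample2 = pynvml.nvmlGpmSampleAlloc()
        try:
            # Two samples bracket the measurement window.
            pynvml.nvmlGpmSampleGet(handle, metrics.sample1)
            time.sleep(0.1)
            pynvml.nvmlGpmSampleGet(handle, metrics.sample2)
            metrics.version = pynvml.NVML_GPM_METRICS_GET_VERSION
            metrics.numMetrics = 1
            metrics.metrics[0].metricId = pynvml.NVML_GPM_METRIC_SM_UTIL
            pynvml.nvmlGpmMetricsGet(metrics)
            print(f"SM utilization: {metrics.metrics[0].value:.1f}%")
        finally:
            pynvml.nvmlGpmSampleFree(metrics.sample1)
            pynvml.nvmlGpmSampleFree(metrics.sample2)
    else:
        # Pre-GPM GPUs: same fallback the detector uses.
        print(f"GPU utilization: {pynvml.nvmlDeviceGetUtilizationRates(handle).gpu}%")
finally:
    pynvml.nvmlShutdown()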
@@ -250,7 +252,7 @@
         # inspired by https://github.com/NVIDIA/go-nvlib/blob/fdfe25d0ffc9d7a8c166f4639ef236da81116262/pkg/nvlib/device/mig_device.go#L61-L154.
 
         mdev_name = ""
-        mdev_cores =
+        mdev_cores = None
         mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
         for mdev_idx in range(mdev_count):
             mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
@@ -283,16 +285,21 @@
                     pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
                 )  # mW to W
 
-            mdev_appendix =
+            mdev_appendix = {
+                "arch_family": _get_arch_family(dev_cc_t),
+                "vgpu": True,
+            }
+            if dev_bdf:
+                mdev_appendix["bdf"] = dev_bdf
 
             mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
             mdev_appendix["gpu_instance_id"] = mdev_gi_id
             mdev_ci_id = pynvml.nvmlDeviceGetComputeInstanceId(mdev)
             mdev_appendix["compute_instance_id"] = mdev_ci_id
 
-
-            mdev_attrs = pynvml.nvmlDeviceGetAttributes(mdev)
+            mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
 
+            if not mdev_name:
                 mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
                 mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
                     mdev_gi,
@@ -310,11 +317,6 @@
                     )
                     if dev_gi_prf.id != mdev_gi_info.profileId:
                         continue
-                    mdev_cores = getattr(
-                        dev_gi_prf,
-                        "multiprocessorCount",
-                        1,
-                    )
                 except pynvml.NVMLError:
                     continue
 
@@ -335,31 +337,31 @@
                 except pynvml.NVMLError:
                     continue
 
-
-                gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
-                gi_neg_attrs = _get_gpu_instance_negative_attrs(
-                    dev_gi_prf_id,
-                )
-                ci_slices = _get_compute_instance_slices(
+                ci_slice = _get_compute_instance_slice(
                     dev_ci_prf_id,
                 )
-
+                gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+                gi_mem = _get_gpu_instance_memory(
                     dev_mem_info,
-
+                    dev_gi_prf,
+                )
+                gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
+                gi_neg_attrs = _get_gpu_instance_negattrs(
+                    dev_gi_prf_id,
                 )
 
-                if
-                    mdev_name = f"{
+                if ci_slice == gi_slice:
+                    mdev_name = f"{gi_slice}g.{gi_mem}gb"
                 else:
                     mdev_name = (
-                        f"{
+                        f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
                     )
                 if gi_attrs:
                     mdev_name += f"+{gi_attrs}"
                 if gi_neg_attrs:
                     mdev_name += f"-{gi_neg_attrs}"
 
-                mdev_cores =
+                mdev_cores = mdev_ci_prf.multiprocessorCount
 
                 break
 
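The naming logic above mirrors the MIG profile names reported by `nvidia-smi`: `{gi_slice}g.{gi_mem}gb` when the compute instance spans its whole GPU instance, `{ci_slice}c.{gi_slice}g.{gi_mem}gb` otherwise, with media-engine attributes appended as `+...`/`-...` suffixes by `_get_gpu_instance_attrs` and `_get_gpu_instance_negattrs`. A small illustration of the rule, using hypothetical slice and memory values:

# Hypothetical values: a 1-slice compute instance inside a 3-slice,
# 20 GiB GPU instance.
gi_slice, ci_slice, gi_mem = 3, 1, 20
name = (
    f"{gi_slice}g.{gi_mem}gb"  # full-width CI, e.g. "3g.20gb"
    if ci_slice == gi_slice
    else f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"  # partial CI, e.g. "1c.3g.20gb"
)
assert name == "1c.3g.20gb"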
@@ -374,6 +376,7 @@
                         runtime_version_original=sys_runtime_ver_original,
                         compute_capability=dev_cc,
                         cores=mdev_cores,
+                        cores_utilization=mdev_cores_util,
                         memory=mdev_mem,
                         memory_used=mdev_mem_used,
                         memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
@@ -467,8 +470,7 @@
                     dev_i_handle,
                     dev_j_handle,
                 )
-
-                if "fabric_cluster_uuid" in dev_i.appendix:
+                if dev_i.appendix.get("links_state", 0) > 0:
                     distance = TopologyDistanceEnum.LINK
             except pynvml.NVMLError:
                 debug_log_exception(
@@ -492,6 +494,201 @@
         return ret
 
 
+def _get_gpm_metrics(
+    metrics: list[int],
+    dev: pynvml.c_nvmlDevice_t,
+    gpu_instance_id: int | None = None,
+    interval: float = 0.1,
+) -> list[pynvml.c_nvmlGpmMetric_t] | None:
+    """
+    Get GPM metrics for a device or a MIG GPU instance.
+
+    Args:
+        metrics:
+            A list of GPM metric IDs to query.
+        dev:
+            The NVML device handle.
+        gpu_instance_id:
+            The GPU instance ID for MIG devices.
+        interval:
+            Interval in seconds between two samples.
+
+    Returns:
+        A list of GPM metric structures, or None if failed.
+
+    """
+    try:
+        dev_gpm_support = pynvml.nvmlGpmQueryDeviceSupport(dev)
+        if not bool(dev_gpm_support.isSupportedDevice):
+            return None
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Unsupported GPM query")
+        return None
+
+    dev_gpm_metrics = pynvml.c_nvmlGpmMetricsGet_t()
+    try:
+        dev_gpm_metrics.sample1 = pynvml.nvmlGpmSampleAlloc()
+        dev_gpm_metrics.sample2 = pynvml.nvmlGpmSampleAlloc()
+        if gpu_instance_id is None:
+            pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample1)
+            time.sleep(interval)
+            pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample2)
+        else:
+            pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample1)
+            time.sleep(interval)
+            pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample2)
+        dev_gpm_metrics.version = pynvml.NVML_GPM_METRICS_GET_VERSION
+        dev_gpm_metrics.numMetrics = len(metrics)
+        for metric_idx, metric in enumerate(metrics):
+            dev_gpm_metrics.metrics[metric_idx].metricId = metric
+        pynvml.nvmlGpmMetricsGet(dev_gpm_metrics)
+    except pynvml.NVMLError:
+        debug_log_exception(logger, "Failed to get GPM metrics")
+        return None
+    finally:
+        if dev_gpm_metrics.sample1:
+            pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample1)
+        if dev_gpm_metrics.sample2:
+            pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample2)
+    return list(dev_gpm_metrics.metrics)
+
+
+def _get_sm_util_from_gpm_metrics(
+    dev: pynvml.c_nvmlDevice_t,
+    gpu_instance_id: int | None = None,
+    interval: float = 0.1,
+) -> int | None:
+    """
+    Get SM utilization from GPM metrics.
+
+    Args:
+        dev:
+            The NVML device handle.
+        gpu_instance_id:
+            The GPU instance ID for MIG devices.
+        interval:
+            Interval in seconds between two samples.
+
+    Returns:
+        The SM utilization as an integer percentage, or None if failed.
+
+    """
+    dev_gpm_metrics = _get_gpm_metrics(
+        metrics=[pynvml.NVML_GPM_METRIC_SM_UTIL],
+        dev=dev,
+        gpu_instance_id=gpu_instance_id,
+        interval=interval,
+    )
+    if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
+        return int(dev_gpm_metrics[0].value)
+
+    return None
+
+
+def _extract_field_value(
+    field_value: pynvml.c_nvmlFieldValue_t,
+) -> int | float | None:
+    """
+    Extract the value from a NVML field value structure.
+
+    Args:
+        field_value:
+            The NVML field value structure.
+
+    Returns:
+        The extracted value as int, float, or None if unknown.
+
+    """
+    if field_value.nvmlReturn != pynvml.NVML_SUCCESS:
+        return None
+    match field_value.valueType:
+        case pynvml.NVML_VALUE_TYPE_DOUBLE:
+            return field_value.value.dVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_INT:
+            return field_value.value.uiVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_LONG:
+            return field_value.value.ulVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_LONG_LONG:
+            return field_value.value.ullVal
+        case pynvml.NVML_VALUE_TYPE_SIGNED_LONG_LONG:
+            return field_value.value.sllVal
+        case pynvml.NVML_VALUE_TYPE_SIGNED_INT:
+            return field_value.value.siVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_SHORT:
+            return field_value.value.usVal
+    return None
+
+
+def _get_links_state(
+    dev: pynvml.c_nvmlDevice_t,
+) -> dict | None:
+    """
+    Get the NVLink links count and state for a device.
+
+    Args:
+        dev:
+            The NVML device handle.
+
+    Returns:
+        A dict includes links state or None if failed.
+
+    """
+    dev_links_count = 0
+    try:
+        dev_fields = pynvml.nvmlDeviceGetFieldValues(
+            dev,
+            fieldIds=[pynvml.NVML_FI_DEV_NVLINK_LINK_COUNT],
+        )
+        dev_links_count = _extract_field_value(dev_fields[0])
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Failed to get NVLink links count")
+    if not dev_links_count:
+        return None
+
+    dev_links_state = 0
+    try:
+        for link_idx in range(int(dev_links_count)):
+            dev_link_state = pynvml.nvmlDeviceGetNvLinkState(dev, link_idx)
+            if dev_link_state:
+                dev_links_state |= 1 << (link_idx + 1)
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Failed to get NVLink link state")
+
+    return {
+        "links_count": dev_links_count,
+        "links_state": dev_links_state,
+    }
+
+
+def _get_fabric_info(
+    dev: pynvml.c_nvmlDevice_t,
+) -> dict | None:
+    """
+    Get the NVSwitch fabric information for a device.
+
+    Args:
+        dev:
+            The NVML device handle.
+
+    Returns:
+        A dict includes fabric info or None if failed.
+
+    """
+    try:
+        dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
+        ret = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
+        if ret != pynvml.NVML_SUCCESS:
+            return None
+        if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
+            return None
+        return {
+            "fabric_cluster_uuid": stringify_uuid(bytes(dev_fabric.clusterUuid)),
+            "fabric_clique_id": dev_fabric.cliqueId,
+        }
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Failed to get NVSwitch fabric info")
+
+
 def _get_arch_family(dev_cc_t: list[int]) -> str:
     """
     Get the architecture family based on the CUDA compute capability.
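`_get_links_state` above packs per-link NVLink state into one integer bitmask stored in the device appendix. As written, link `i` is recorded at bit `i + 1` (bit 0 stays unused), and the topology code only tests `links_state > 0`. A decoding sketch against hypothetical appendix values:

appendix = {"links_count": 4, "links_state": 0b10110}  # hypothetical values

# Recover the active link indexes, mirroring the bit layout used above
# (link i lives at bit i + 1).
active = [
    link_idx
    for link_idx in range(appendix["links_count"])
    if appendix["links_state"] & (1 << (link_idx + 1))
]
print(active)  # [0, 1, 3]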
@@ -528,9 +725,9 @@ def _get_arch_family(dev_cc_t: list[int]) -> str:
     return "Unknown"
 
 
-def
+def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
     """
-    Get the number of
+    Get the number of slice for a given GPU Instance Profile ID.
 
     Args:
         dev_gi_prf_id:
@@ -576,61 +773,33 @@ def _get_gpu_instance_slices(dev_gi_prf_id: int) -> int:
     raise AttributeError(msg)
 
 
-def
+def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
     """
-
+    Compute the memory size of a MIG compute instance in GiB.
 
     Args:
-
-        The
+        dev_mem:
+            The total memory info of the parent GPU device.
+        dev_gi_prf:
+            The profile info of the GPU instance.
 
     Returns:
-
-
-    """
-    match dev_gi_prf_id:
-        case (
-            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1
-        ):
-            return "me"
-        case (
-            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_ALL_ME
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_ALL_ME
-        ):
-            return "me.all"
-        case (
-            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_GFX
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_GFX
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_4_SLICE_GFX
-        ):
-            return "gfx"
-    return ""
-
+        The memory size in GiB.
 
-def _get_gpu_instance_negative_attrs(dev_gi_prf_id) -> str:
     """
-
+    mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte
 
-
-
-
-
-
-
-
-    """
-    if dev_gi_prf_id in [
-        pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_NO_ME,
-        pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_NO_ME,
-    ]:
-        return "me"
-    return ""
+    gib = round(
+        math.ceil(mem / dev_mem.total * 8)
+        / 8
+        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
+    )
+    return gib
 
 
-def
+def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
     """
-    Get the number of
+    Get the number of slice for a given Compute Instance Profile ID.
 
     Args:
         dev_ci_prf_id:
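`_get_gpu_instance_memory` above derives the GiB figure used in profile names by rounding the instance's share of total memory up to the nearest eighth, then applying that fraction to the total size rounded up to whole GiB. A worked example with hypothetical numbers (a profile reporting 19968 MiB on a 40 GiB card):

import math

total = 40 * (1 << 30)   # dev_mem.total: 40 GiB in bytes (hypothetical)
mem = 19968 * (1 << 20)  # dev_gi_prf.memorySizeMB in bytes (hypothetical)

fraction = math.ceil(mem / total * 8) / 8        # 0.4875 of total -> ceil(3.9) / 8 = 0.5
total_gib = (total + (1 << 30) - 1) / (1 << 30)  # 40.0
print(round(fraction * total_gib))               # 20, rendered as "20gb"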
@@ -663,28 +832,56 @@ def _get_compute_instance_slices(dev_ci_prf_id: int) -> int:
     raise AttributeError(msg)
 
 
-def
+def _get_gpu_instance_attrs(dev_gi_prf_id: int) -> str:
     """
-
+    Get attributes for a given GPU Instance Profile ID.
 
     Args:
-
-        The
-        mdev_attrs:
-            The attributes of the MIG device.
+        dev_gi_prf_id:
+            The GPU Instance Profile ID.
 
     Returns:
-
+        A string representing the attributes, or an empty string if none.
 
     """
-
-
-
-
-
-
-
-
+    match dev_gi_prf_id:
+        case (
+            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1
+        ):
+            return "me"
+        case (
+            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_ALL_ME
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_ALL_ME
+        ):
+            return "me.all"
+        case (
+            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_GFX
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_GFX
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_4_SLICE_GFX
+        ):
+            return "gfx"
+    return ""
+
+
+def _get_gpu_instance_negattrs(dev_gi_prf_id) -> str:
+    """
+    Get negative attributes for a given GPU Instance Profile ID.
+
+    Args:
+        dev_gi_prf_id:
+            The GPU Instance Profile ID.
+
+    Returns:
+        A string representing the negative attributes, or an empty string if none.
+
+    """
+    if dev_gi_prf_id in [
+        pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_NO_ME,
+        pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_NO_ME,
+    ]:
+        return "me"
+    return ""
 
 
 def _is_vgpu(dev_config: bytes) -> bool: