gpustack-runtime 0.1.39.post2__py3-none-any.whl → 0.1.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. gpustack_runtime/__main__.py +7 -3
  2. gpustack_runtime/_version.py +2 -2
  3. gpustack_runtime/_version_appendix.py +1 -1
  4. gpustack_runtime/cmds/__init__.py +2 -0
  5. gpustack_runtime/cmds/deployer.py +84 -2
  6. gpustack_runtime/cmds/images.py +2 -0
  7. gpustack_runtime/deployer/__init__.py +2 -0
  8. gpustack_runtime/deployer/__types__.py +52 -28
  9. gpustack_runtime/deployer/__utils__.py +99 -112
  10. gpustack_runtime/deployer/cdi/__init__.py +81 -0
  11. gpustack_runtime/deployer/cdi/__types__.py +667 -0
  12. gpustack_runtime/deployer/cdi/thead.py +103 -0
  13. gpustack_runtime/deployer/docker.py +42 -24
  14. gpustack_runtime/deployer/kuberentes.py +8 -4
  15. gpustack_runtime/deployer/podman.py +41 -23
  16. gpustack_runtime/detector/__init__.py +62 -3
  17. gpustack_runtime/detector/__types__.py +11 -0
  18. gpustack_runtime/detector/__utils__.py +23 -0
  19. gpustack_runtime/detector/amd.py +17 -9
  20. gpustack_runtime/detector/hygon.py +6 -1
  21. gpustack_runtime/detector/iluvatar.py +20 -5
  22. gpustack_runtime/detector/mthreads.py +8 -12
  23. gpustack_runtime/detector/nvidia.py +365 -168
  24. gpustack_runtime/detector/pyacl/__init__.py +9 -1
  25. gpustack_runtime/detector/pyamdgpu/__init__.py +8 -0
  26. gpustack_runtime/detector/pycuda/__init__.py +9 -1
  27. gpustack_runtime/detector/pydcmi/__init__.py +9 -2
  28. gpustack_runtime/detector/pyhgml/__init__.py +5879 -0
  29. gpustack_runtime/detector/pyhgml/libhgml.so +0 -0
  30. gpustack_runtime/detector/pyhgml/libuki.so +0 -0
  31. gpustack_runtime/detector/pyhsa/__init__.py +9 -0
  32. gpustack_runtime/detector/pyixml/__init__.py +89 -164
  33. gpustack_runtime/detector/pyrocmcore/__init__.py +42 -24
  34. gpustack_runtime/detector/pyrocmsmi/__init__.py +141 -138
  35. gpustack_runtime/detector/thead.py +733 -0
  36. gpustack_runtime/envs.py +128 -55
  37. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/METADATA +4 -2
  38. gpustack_runtime-0.1.40.dist-info/RECORD +55 -0
  39. gpustack_runtime/detector/pymtml/__init__.py +0 -770
  40. gpustack_runtime-0.1.39.post2.dist-info/RECORD +0 -49
  41. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/WHEEL +0 -0
  42. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/entry_points.txt +0 -0
  43. {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/nvidia.py

@@ -2,9 +2,10 @@ from __future__ import annotations
 
 import contextlib
 import logging
+import math
+import time
 from _ctypes import byref
 from functools import lru_cache
-from math import ceil
 
 import pynvml
 
@@ -125,103 +126,104 @@ class NVIDIADetector(Detector):
         for dev_idx in range(dev_count):
             dev = pynvml.nvmlDeviceGetHandleByIndex(dev_idx)
 
-            dev_index = dev_idx
-            if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
-                if dev_files is None:
-                    dev_files = get_device_files(pattern=r"nvidia(?P<number>\d+)")
-                if len(dev_files) >= dev_count:
-                    dev_file = dev_files[dev_idx]
-                    if dev_file.number is not None:
-                        dev_index = dev_file.number
-            dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
-
-            dev_cores = None
-            if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
-                with contextlib.suppress(pycuda.CUDAError):
-                    dev_gpudev = pycuda.cuDeviceGet(dev_idx)
-                    dev_cores = pycuda.cuDeviceGetAttribute(
-                        dev_gpudev,
-                        pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-                    )
+            dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
+            dev_cc = ".".join(map(str, dev_cc_t))
 
-            dev_mem = 0
-            dev_mem_used = 0
+            dev_bdf = None
             with contextlib.suppress(pynvml.NVMLError):
-                dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
-                dev_mem = byte_to_mebibyte(  # byte to MiB
-                    dev_mem_info.total,
-                )
-                dev_mem_used = byte_to_mebibyte(  # byte to MiB
-                    dev_mem_info.used,
-                )
-            if dev_mem == 0:
-                dev_mem, dev_mem_used = get_memory()
+                dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
+                dev_bdf = str(dev_pci_info.busIdLegacy).lower()
 
-            dev_cores_util = None
+            dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
             with contextlib.suppress(pynvml.NVMLError):
-                dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
-                dev_cores_util = dev_util_rates.gpu
-            if dev_cores_util is None:
-                debug_log_warning(
-                    logger,
-                    "Failed to get device %d cores utilization, setting to 0",
-                    dev_index,
-                )
-                dev_cores_util = 0
+                dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
 
-            dev_temp = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_temp = pynvml.nvmlDeviceGetTemperature(
-                    dev,
-                    pynvml.NVML_TEMPERATURE_GPU,
-                )
+            # With MIG disabled, treat as a single device.
 
-            dev_power = None
-            dev_power_used = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
-                dev_power = dev_power // 1000  # mW to W
-                dev_power_used = (
-                    pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
-                )  # mW to W
+            if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
+                dev_index = dev_idx
+                if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+                    if dev_files is None:
+                        dev_files = get_device_files(
+                            pattern=r"nvidia(?P<number>\d+)",
+                        )
+                    if len(dev_files) >= dev_count:
+                        dev_file = dev_files[dev_idx]
+                        if dev_file.number is not None:
+                            dev_index = dev_file.number
 
-            dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
-            dev_cc = ".".join(map(str, dev_cc_t))
+                dev_name = pynvml.nvmlDeviceGetName(dev)
 
-            dev_is_vgpu = False
-            dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
-            for addr in [dev_pci_info.busIdLegacy, dev_pci_info.busId]:
-                if addr in pci_devs:
-                    dev_is_vgpu = _is_vgpu(pci_devs[addr].config)
-                    break
+                dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
+
+                dev_cores = None
+                if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
+                    with contextlib.suppress(pycuda.CUDAError):
+                        dev_gpudev = pycuda.cuDeviceGet(dev_idx)
+                        dev_cores = pycuda.cuDeviceGetAttribute(
+                            dev_gpudev,
+                            pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                        )
+
+                dev_cores_util = _get_sm_util_from_gpm_metrics(dev)
+                if dev_cores_util is None:
+                    with contextlib.suppress(pynvml.NVMLError):
+                        dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
+                        dev_cores_util = dev_util_rates.gpu
+                if dev_cores_util is None:
+                    debug_log_warning(
+                        logger,
+                        "Failed to get device %d cores utilization, setting to 0",
+                        dev_index,
+                    )
+                    dev_cores_util = 0
 
-            dev_appendix = {
-                "arch_family": _get_arch_family(dev_cc_t),
-                "vgpu": dev_is_vgpu,
-                "bdf": str(dev_pci_info.busIdLegacy).lower(),
-            }
+                dev_mem = 0
+                dev_mem_used = 0
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
+                    dev_mem = byte_to_mebibyte(  # byte to MiB
+                        dev_mem_info.total,
+                    )
+                    dev_mem_used = byte_to_mebibyte(  # byte to MiB
+                        dev_mem_info.used,
+                    )
+                if dev_mem == 0:
+                    dev_mem, dev_mem_used = get_memory()
 
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
-                r = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
-                if r != pynvml.NVML_SUCCESS:
-                    dev_fabric = None
-                if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
-                    dev_fabric = None
-                if dev_fabric:
-                    dev_appendix["fabric_cluster_uuid"] = stringify_uuid(
-                        bytes(dev_fabric.clusterUuid),
+                dev_temp = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_temp = pynvml.nvmlDeviceGetTemperature(
+                        dev,
+                        pynvml.NVML_TEMPERATURE_GPU,
                     )
-                    dev_appendix["fabric_clique_id"] = dev_fabric.cliqueId
 
-            dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
+                dev_power = None
+                dev_power_used = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
+                    dev_power = dev_power // 1000  # mW to W
+                    dev_power_used = (
+                        pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
+                    )  # mW to W
 
-            # If MIG is not enabled, return the GPU itself.
+                dev_is_vgpu = False
+                if dev_bdf and dev_bdf in pci_devs:
+                    dev_is_vgpu = _is_vgpu(pci_devs[dev_bdf].config)
+
+                dev_appendix = {
+                    "arch_family": _get_arch_family(dev_cc_t),
+                    "vgpu": dev_is_vgpu,
+                }
+                if dev_bdf:
+                    dev_appendix["bdf"] = dev_bdf
+
+                if dev_links_state := _get_links_state(dev):
+                    dev_appendix.update(dev_links_state)
+
+                if dev_fabric_info := _get_fabric_info(dev):
+                    dev_appendix.update(dev_fabric_info)
 
-            if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
-                dev_name = pynvml.nvmlDeviceGetName(dev)
                 ret.append(
                     Device(
                         manufacturer=self.manufacturer,
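
Note: after this hunk, the per-device `dev_appendix` always carries `arch_family` and `vgpu`, adds `bdf` only when `nvmlDeviceGetPciInfo` succeeded, and merges whatever the new `_get_links_state` / `_get_fabric_info` helpers return. A rough sketch of the resulting shape, with invented values (a real appendix depends on the GPU, the driver, and whether the card sits on an NVSwitch fabric):

# Illustrative only -- the values below are made up, the keys match the code above.
dev_appendix = {
    "arch_family": "Hopper",                  # _get_arch_family(dev_cc_t)
    "vgpu": False,                            # _is_vgpu() on the cached PCI config space
    "bdf": "0000:65:00.0",                    # lower-cased busIdLegacy, only when PCI info was readable
    "links_count": 18,                        # merged from _get_links_state()
    "links_state": 0b1111111111111111110,
    "fabric_cluster_uuid": "<cluster-uuid>",  # merged from _get_fabric_info(), NVSwitch systems only
    "fabric_clique_id": 1,
}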
@@ -250,7 +252,7 @@ class NVIDIADetector(Detector):
             # inspired by https://github.com/NVIDIA/go-nvlib/blob/fdfe25d0ffc9d7a8c166f4639ef236da81116262/pkg/nvlib/device/mig_device.go#L61-L154.
 
             mdev_name = ""
-            mdev_cores = 1
+            mdev_cores = None
             mdev_count = pynvml.nvmlDeviceGetMaxMigDeviceCount(dev)
             for mdev_idx in range(mdev_count):
                 mdev = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
@@ -283,16 +285,21 @@ class NVIDIADetector(Detector):
                     pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
                 )  # mW to W
 
-                mdev_appendix = dev_appendix.copy()
+                mdev_appendix = {
+                    "arch_family": _get_arch_family(dev_cc_t),
+                    "vgpu": True,
+                }
+                if dev_bdf:
+                    mdev_appendix["bdf"] = dev_bdf
 
                 mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
                 mdev_appendix["gpu_instance_id"] = mdev_gi_id
                 mdev_ci_id = pynvml.nvmlDeviceGetComputeInstanceId(mdev)
                 mdev_appendix["compute_instance_id"] = mdev_ci_id
 
-                if not mdev_name:
-                    mdev_attrs = pynvml.nvmlDeviceGetAttributes(mdev)
+                mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
 
+                if not mdev_name:
                     mdev_gi = pynvml.nvmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
                     mdev_ci = pynvml.nvmlGpuInstanceGetComputeInstanceById(
                         mdev_gi,
@@ -310,11 +317,6 @@ class NVIDIADetector(Detector):
                             )
                             if dev_gi_prf.id != mdev_gi_info.profileId:
                                 continue
-                            mdev_cores = getattr(
-                                dev_gi_prf,
-                                "multiprocessorCount",
-                                1,
-                            )
                         except pynvml.NVMLError:
                             continue
 
@@ -335,31 +337,31 @@ class NVIDIADetector(Detector):
                             except pynvml.NVMLError:
                                 continue
 
-                            gi_slices = _get_gpu_instance_slices(dev_gi_prf_id)
-                            gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
-                            gi_neg_attrs = _get_gpu_instance_negative_attrs(
-                                dev_gi_prf_id,
-                            )
-                            ci_slices = _get_compute_instance_slices(
+                            ci_slice = _get_compute_instance_slice(
                                 dev_ci_prf_id,
                             )
-                            ci_mem = _get_compute_instance_memory_in_gib(
+                            gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+                            gi_mem = _get_gpu_instance_memory(
                                 dev_mem_info,
-                                mdev_attrs,
+                                dev_gi_prf,
+                            )
+                            gi_attrs = _get_gpu_instance_attrs(dev_gi_prf_id)
+                            gi_neg_attrs = _get_gpu_instance_negattrs(
+                                dev_gi_prf_id,
                             )
 
-                            if gi_slices == ci_slices:
-                                mdev_name = f"{gi_slices}g.{ci_mem}gb"
+                            if ci_slice == gi_slice:
+                                mdev_name = f"{gi_slice}g.{gi_mem}gb"
                             else:
                                 mdev_name = (
-                                    f"{ci_slices}c.{gi_slices}g.{ci_mem}gb"
+                                    f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
                                 )
                             if gi_attrs:
                                 mdev_name += f"+{gi_attrs}"
                             if gi_neg_attrs:
                                 mdev_name += f"-{gi_neg_attrs}"
 
-                            mdev_cores = ci_slices
+                            mdev_cores = mdev_ci_prf.multiprocessorCount
 
                             break
 
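
Note: this hunk is where the MIG profile name is assembled. When the compute instance spans the whole GPU instance the name is "{gi_slice}g.{gi_mem}gb"; otherwise a compute-instance prefix is added. A minimal sketch with assumed slice and memory values (not read from a real MIG profile):

# Assumed values, for illustration only.
gi_slice, ci_slice, gi_mem = 3, 1, 40
name = (
    f"{gi_slice}g.{gi_mem}gb"
    if ci_slice == gi_slice
    else f"{ci_slice}c.{gi_slice}g.{gi_mem}gb"
)
print(name)  # -> "1c.3g.40gb"

A "+me" / "-me" style suffix is then appended when `_get_gpu_instance_attrs` or `_get_gpu_instance_negattrs` reports one, and `mdev_cores` now comes from the compute instance profile's `multiprocessorCount` instead of the slice count used before.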
@@ -374,6 +376,7 @@ class NVIDIADetector(Detector):
                         runtime_version_original=sys_runtime_ver_original,
                         compute_capability=dev_cc,
                         cores=mdev_cores,
+                        cores_utilization=mdev_cores_util,
                         memory=mdev_mem,
                         memory_used=mdev_mem_used,
                         memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
@@ -467,8 +470,7 @@ class NVIDIADetector(Detector):
                         dev_i_handle,
                         dev_j_handle,
                     )
-                    # In practice, there may not be NVLINK nodes that are not interconnected.
-                    if "fabric_cluster_uuid" in dev_i.appendix:
+                    if dev_i.appendix.get("links_state", 0) > 0:
                         distance = TopologyDistanceEnum.LINK
                 except pynvml.NVMLError:
                     debug_log_exception(
@@ -492,6 +494,201 @@ class NVIDIADetector(Detector):
         return ret
 
 
+def _get_gpm_metrics(
+    metrics: list[int],
+    dev: pynvml.c_nvmlDevice_t,
+    gpu_instance_id: int | None = None,
+    interval: float = 0.1,
+) -> list[pynvml.c_nvmlGpmMetric_t] | None:
+    """
+    Get GPM metrics for a device or a MIG GPU instance.
+
+    Args:
+        metrics:
+            A list of GPM metric IDs to query.
+        dev:
+            The NVML device handle.
+        gpu_instance_id:
+            The GPU instance ID for MIG devices.
+        interval:
+            Interval in seconds between two samples.
+
+    Returns:
+        A list of GPM metric structures, or None if failed.
+
+    """
+    try:
+        dev_gpm_support = pynvml.nvmlGpmQueryDeviceSupport(dev)
+        if not bool(dev_gpm_support.isSupportedDevice):
+            return None
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Unsupported GPM query")
+        return None
+
+    dev_gpm_metrics = pynvml.c_nvmlGpmMetricsGet_t()
+    try:
+        dev_gpm_metrics.sample1 = pynvml.nvmlGpmSampleAlloc()
+        dev_gpm_metrics.sample2 = pynvml.nvmlGpmSampleAlloc()
+        if gpu_instance_id is None:
+            pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample1)
+            time.sleep(interval)
+            pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample2)
+        else:
+            pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample1)
+            time.sleep(interval)
+            pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample2)
+        dev_gpm_metrics.version = pynvml.NVML_GPM_METRICS_GET_VERSION
+        dev_gpm_metrics.numMetrics = len(metrics)
+        for metric_idx, metric in enumerate(metrics):
+            dev_gpm_metrics.metrics[metric_idx].metricId = metric
+        pynvml.nvmlGpmMetricsGet(dev_gpm_metrics)
+    except pynvml.NVMLError:
+        debug_log_exception(logger, "Failed to get GPM metrics")
+        return None
+    finally:
+        if dev_gpm_metrics.sample1:
+            pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample1)
+        if dev_gpm_metrics.sample2:
+            pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample2)
+    return list(dev_gpm_metrics.metrics)
+
+
+def _get_sm_util_from_gpm_metrics(
+    dev: pynvml.c_nvmlDevice_t,
+    gpu_instance_id: int | None = None,
+    interval: float = 0.1,
+) -> int | None:
+    """
+    Get SM utilization from GPM metrics.
+
+    Args:
+        dev:
+            The NVML device handle.
+        gpu_instance_id:
+            The GPU instance ID for MIG devices.
+        interval:
+            Interval in seconds between two samples.
+
+    Returns:
+        The SM utilization as an integer percentage, or None if failed.
+
+    """
+    dev_gpm_metrics = _get_gpm_metrics(
+        metrics=[pynvml.NVML_GPM_METRIC_SM_UTIL],
+        dev=dev,
+        gpu_instance_id=gpu_instance_id,
+        interval=interval,
+    )
+    if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
+        return int(dev_gpm_metrics[0].value)
+
+    return None
+
+
+def _extract_field_value(
+    field_value: pynvml.c_nvmlFieldValue_t,
+) -> int | float | None:
+    """
+    Extract the value from a NVML field value structure.
+
+    Args:
+        field_value:
+            The NVML field value structure.
+
+    Returns:
+        The extracted value as int, float, or None if unknown.
+
+    """
+    if field_value.nvmlReturn != pynvml.NVML_SUCCESS:
+        return None
+    match field_value.valueType:
+        case pynvml.NVML_VALUE_TYPE_DOUBLE:
+            return field_value.value.dVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_INT:
+            return field_value.value.uiVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_LONG:
+            return field_value.value.ulVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_LONG_LONG:
+            return field_value.value.ullVal
+        case pynvml.NVML_VALUE_TYPE_SIGNED_LONG_LONG:
+            return field_value.value.sllVal
+        case pynvml.NVML_VALUE_TYPE_SIGNED_INT:
+            return field_value.value.siVal
+        case pynvml.NVML_VALUE_TYPE_UNSIGNED_SHORT:
+            return field_value.value.usVal
+    return None
+
+
+def _get_links_state(
+    dev: pynvml.c_nvmlDevice_t,
+) -> dict | None:
+    """
+    Get the NVLink links count and state for a device.
+
+    Args:
+        dev:
+            The NVML device handle.
+
+    Returns:
+        A dict includes links state or None if failed.
+
+    """
+    dev_links_count = 0
+    try:
+        dev_fields = pynvml.nvmlDeviceGetFieldValues(
+            dev,
+            fieldIds=[pynvml.NVML_FI_DEV_NVLINK_LINK_COUNT],
+        )
+        dev_links_count = _extract_field_value(dev_fields[0])
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Failed to get NVLink links count")
+    if not dev_links_count:
+        return None
+
+    dev_links_state = 0
+    try:
+        for link_idx in range(int(dev_links_count)):
+            dev_link_state = pynvml.nvmlDeviceGetNvLinkState(dev, link_idx)
+            if dev_link_state:
+                dev_links_state |= 1 << (link_idx + 1)
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Failed to get NVLink link state")
+
+    return {
+        "links_count": dev_links_count,
+        "links_state": dev_links_state,
+    }
+
+
+def _get_fabric_info(
+    dev: pynvml.c_nvmlDevice_t,
+) -> dict | None:
+    """
+    Get the NVSwitch fabric information for a device.
+
+    Args:
+        dev:
+            The NVML device handle.
+
+    Returns:
+        A dict includes fabric info or None if failed.
+
+    """
+    try:
+        dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
+        ret = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
+        if ret != pynvml.NVML_SUCCESS:
+            return None
+        if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
+            return None
+        return {
+            "fabric_cluster_uuid": stringify_uuid(bytes(dev_fabric.clusterUuid)),
+            "fabric_clique_id": dev_fabric.cliqueId,
+        }
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Failed to get NVSwitch fabric info")
+
+
 def _get_arch_family(dev_cc_t: list[int]) -> str:
     """
     Get the architecture family based on the CUDA compute capability.
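
Note on the new helpers above: `_get_sm_util_from_gpm_metrics` takes two GPM samples `interval` seconds apart and lets `nvmlGpmMetricsGet` derive the SM utilization from the delta; the caller falls back to `nvmlDeviceGetUtilizationRates` when GPM is unsupported. `_get_links_state` packs per-link NVLink state into a bitmask, recording link `i` at bit position `i + 1`, and the topology code later treats any non-zero `links_state` as a LINK-distance neighbour. A small decoding sketch with assumed values:

# Assumed values for illustration; links_state is built as in _get_links_state above,
# where link i is recorded at bit position i + 1 (bit 0 stays clear).
links_count = 4
links_state = 0b11100  # links 1, 2 and 3 up, link 0 down
active = [i for i in range(links_count) if links_state & (1 << (i + 1))]
print(active)  # [1, 2, 3]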
@@ -528,9 +725,9 @@ def _get_arch_family(dev_cc_t: list[int]) -> str:
     return "Unknown"
 
 
-def _get_gpu_instance_slices(dev_gi_prf_id: int) -> int:
+def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
     """
-    Get the number of slices for a given GPU Instance Profile ID.
+    Get the number of slice for a given GPU Instance Profile ID.
 
     Args:
         dev_gi_prf_id:
@@ -576,61 +773,33 @@ def _get_gpu_instance_slices(dev_gi_prf_id: int) -> int:
     raise AttributeError(msg)
 
 
-def _get_gpu_instance_attrs(dev_gi_prf_id: int) -> str:
+def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
     """
-    Get attributes for a given GPU Instance Profile ID.
+    Compute the memory size of a MIG compute instance in GiB.
 
     Args:
-        dev_gi_prf_id:
-            The GPU Instance Profile ID.
+        dev_mem:
+            The total memory info of the parent GPU device.
+        dev_gi_prf:
+            The profile info of the GPU instance.
 
     Returns:
-        A string representing the attributes, or an empty string if none.
-
-    """
-    match dev_gi_prf_id:
-        case (
-            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1
-        ):
-            return "me"
-        case (
-            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_ALL_ME
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_ALL_ME
-        ):
-            return "me.all"
-        case (
-            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_GFX
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_GFX
-            | pynvml.NVML_GPU_INSTANCE_PROFILE_4_SLICE_GFX
-        ):
-            return "gfx"
-    return ""
-
+        The memory size in GiB.
 
-def _get_gpu_instance_negative_attrs(dev_gi_prf_id) -> str:
     """
-    Get negative attributes for a given GPU Instance Profile ID.
+    mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte
 
-    Args:
-        dev_gi_prf_id:
-            The GPU Instance Profile ID.
-
-    Returns:
-        A string representing the negative attributes, or an empty string if none.
-
-    """
-    if dev_gi_prf_id in [
-        pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_NO_ME,
-        pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_NO_ME,
-    ]:
-        return "me"
-    return ""
+    gib = round(
+        math.ceil(mem / dev_mem.total * 8)
+        / 8
+        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
+    )
+    return gib
 
 
-def _get_compute_instance_slices(dev_ci_prf_id: int) -> int:
+def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
     """
-    Get the number of slices for a given Compute Instance Profile ID.
+    Get the number of slice for a given Compute Instance Profile ID.
 
     Args:
         dev_ci_prf_id:
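
Note: `_get_gpu_instance_memory` replaces the old `_get_compute_instance_memory_in_gib` and now sizes the MIG instance from the GPU instance profile's `memorySizeMB` instead of `nvmlDeviceGetAttributes`. The rounding first expresses the instance as eighths of the parent GPU's memory, then scales that fraction by the parent's size in GiB. A worked example with assumed numbers (an exactly 80 GiB parent and a 1g.10gb-style profile; both figures are illustrative):

import math

# Assumed figures, for illustration only.
total = 80 * (1 << 30)      # dev_mem.total, in bytes
memory_size_mb = 9856       # dev_gi_prf.memorySizeMB

mem = memory_size_mb * (1 << 20)      # MiB -> bytes
eighths = math.ceil(mem / total * 8)  # -> 1, i.e. one eighth of the GPU
gib = round(eighths / 8 * ((total + (1 << 30) - 1) / (1 << 30)))
print(gib)  # -> 10, the "10gb" part of the profile name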
@@ -663,28 +832,56 @@ def _get_compute_instance_slices(dev_ci_prf_id: int) -> int:
     raise AttributeError(msg)
 
 
-def _get_compute_instance_memory_in_gib(dev_mem, mdev_attrs) -> int:
+def _get_gpu_instance_attrs(dev_gi_prf_id: int) -> str:
     """
-    Compute the memory size of a MIG compute instance in GiB.
+    Get attributes for a given GPU Instance Profile ID.
 
     Args:
-        dev_mem:
-            The total memory info of the parent GPU device.
-        mdev_attrs:
-            The attributes of the MIG device.
+        dev_gi_prf_id:
+            The GPU Instance Profile ID.
 
     Returns:
-        The memory size in GiB.
+        A string representing the attributes, or an empty string if none.
 
     """
-    gib = round(
-        ceil(
-            (mdev_attrs.memorySizeMB * (1 << 20)) / dev_mem.total * 8,
-        )
-        / 8
-        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
-    )
-    return gib
+    match dev_gi_prf_id:
+        case (
+            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1
+        ):
+            return "me"
+        case (
+            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_ALL_ME
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_ALL_ME
+        ):
+            return "me.all"
+        case (
+            pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_GFX
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_GFX
+            | pynvml.NVML_GPU_INSTANCE_PROFILE_4_SLICE_GFX
+        ):
+            return "gfx"
+    return ""
+
+
+def _get_gpu_instance_negattrs(dev_gi_prf_id) -> str:
+    """
+    Get negative attributes for a given GPU Instance Profile ID.
+
+    Args:
+        dev_gi_prf_id:
+            The GPU Instance Profile ID.
+
+    Returns:
+        A string representing the negative attributes, or an empty string if none.
+
+    """
+    if dev_gi_prf_id in [
+        pynvml.NVML_GPU_INSTANCE_PROFILE_1_SLICE_NO_ME,
+        pynvml.NVML_GPU_INSTANCE_PROFILE_2_SLICE_NO_ME,
+    ]:
+        return "me"
+    return ""
 
 
 def _is_vgpu(dev_config: bytes) -> bool: