gpustack-runtime 0.1.39.post3-py3-none-any.whl → 0.1.40.post1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. gpustack_runtime/__main__.py +7 -3
  2. gpustack_runtime/_version.py +2 -2
  3. gpustack_runtime/_version_appendix.py +1 -1
  4. gpustack_runtime/cmds/__init__.py +4 -0
  5. gpustack_runtime/cmds/deployer.py +84 -2
  6. gpustack_runtime/cmds/images.py +2 -0
  7. gpustack_runtime/deployer/__init__.py +2 -0
  8. gpustack_runtime/deployer/__types__.py +52 -28
  9. gpustack_runtime/deployer/__utils__.py +99 -112
  10. gpustack_runtime/deployer/cdi/__init__.py +81 -0
  11. gpustack_runtime/deployer/cdi/__types__.py +667 -0
  12. gpustack_runtime/deployer/cdi/thead.py +103 -0
  13. gpustack_runtime/deployer/docker.py +36 -22
  14. gpustack_runtime/deployer/kuberentes.py +8 -4
  15. gpustack_runtime/deployer/podman.py +35 -21
  16. gpustack_runtime/detector/__init__.py +62 -3
  17. gpustack_runtime/detector/__types__.py +11 -0
  18. gpustack_runtime/detector/iluvatar.py +10 -3
  19. gpustack_runtime/detector/nvidia.py +186 -97
  20. gpustack_runtime/detector/pyacl/__init__.py +9 -1
  21. gpustack_runtime/detector/pyamdgpu/__init__.py +8 -0
  22. gpustack_runtime/detector/pycuda/__init__.py +9 -1
  23. gpustack_runtime/detector/pydcmi/__init__.py +9 -2
  24. gpustack_runtime/detector/pyhgml/__init__.py +5879 -0
  25. gpustack_runtime/detector/pyhgml/libhgml.so +0 -0
  26. gpustack_runtime/detector/pyhgml/libuki.so +0 -0
  27. gpustack_runtime/detector/pyhsa/__init__.py +9 -0
  28. gpustack_runtime/detector/pyixml/__init__.py +89 -164
  29. gpustack_runtime/detector/pyrocmcore/__init__.py +42 -24
  30. gpustack_runtime/detector/pyrocmsmi/__init__.py +138 -129
  31. gpustack_runtime/detector/thead.py +733 -0
  32. gpustack_runtime/envs.py +127 -54
  33. {gpustack_runtime-0.1.39.post3.dist-info → gpustack_runtime-0.1.40.post1.dist-info}/METADATA +3 -2
  34. gpustack_runtime-0.1.40.post1.dist-info/RECORD +55 -0
  35. gpustack_runtime-0.1.39.post3.dist-info/RECORD +0 -48
  36. {gpustack_runtime-0.1.39.post3.dist-info → gpustack_runtime-0.1.40.post1.dist-info}/WHEEL +0 -0
  37. {gpustack_runtime-0.1.39.post3.dist-info → gpustack_runtime-0.1.40.post1.dist-info}/entry_points.txt +0 -0
  38. {gpustack_runtime-0.1.39.post3.dist-info → gpustack_runtime-0.1.40.post1.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/thead.py
@@ -0,0 +1,733 @@
+ from __future__ import annotations
+
+ import contextlib
+ import logging
+ import math
+ import time
+ from functools import lru_cache
+
+ from .. import envs
+ from ..logging import debug_log_exception, debug_log_warning
+ from . import pyhgml
+ from .__types__ import (
+     Detector,
+     Device,
+     Devices,
+     ManufacturerEnum,
+     Topology,
+     TopologyDistanceEnum,
+ )
+ from .__utils__ import (
+     PCIDevice,
+     bitmask_to_str,
+     byte_to_mebibyte,
+     get_brief_version,
+     get_device_files,
+     get_numa_node_by_bdf,
+     get_numa_nodeset_size,
+     get_pci_devices,
+     get_physical_function_by_bdf,
+     get_utilization,
+     map_numa_node_to_cpu_affinity,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class THeadDetector(Detector):
+     """
+     Detect T-Head PPUs.
+     """
+
+     @staticmethod
+     @lru_cache
+     def is_supported() -> bool:
+         """
+         Check if the T-Head detector is supported.
+
+         Returns:
+             True if supported, False otherwise.
+
+         """
+         supported = False
+         if envs.GPUSTACK_RUNTIME_DETECT.lower() not in ("auto", "thead"):
+             logger.debug("T-Head detection is disabled by environment variable")
+             return supported
+
+         pci_devs = THeadDetector.detect_pci_devices()
+         if not pci_devs and not envs.GPUSTACK_RUNTIME_DETECT_NO_PCI_CHECK:
+             logger.debug("No T-Head PCI devices found")
+             return supported
+
+         try:
+             pyhgml.hgmlInit()
+             pyhgml.hgmlShutdown()
+             supported = True
+         except pyhgml.HGMLError:
+             debug_log_exception(logger, "Failed to initialize HGML library")
+
+         return supported
+
+     @staticmethod
+     @lru_cache
+     def detect_pci_devices() -> dict[str, PCIDevice]:
+         # See https://pcisig.com/membership/member-companies?combine=Alibaba.
+         pci_devs = get_pci_devices(vendor="0x1ded")
+         if not pci_devs:
+             return {}
+         return {dev.address: dev for dev in pci_devs}
+
+     def __init__(self):
+         super().__init__(ManufacturerEnum.THEAD)
+
+     def detect(self) -> Devices | None:
+         """
+         Detect T-Head GPUs using pyhgml.
+
+         Returns:
+             A list of detected T-Head GPU devices,
+             or None if not supported.
+
+         Raises:
+             If there is an error during detection.
+
+         """
+         if not self.is_supported():
+             return None
+
+         ret: Devices = []
+
+         try:
+             pyhgml.hgmlInit()
+
+             sys_driver_ver = pyhgml.hgmlSystemGetDriverVersion()
+
+             sys_runtime_ver_original = None
+             sys_runtime_ver = None
+             with contextlib.suppress(pyhgml.HGMLError):
+                 sys_runtime_ver_original = pyhgml.hgmlSystemGetHggcDriverVersion()
+                 sys_runtime_ver_original = ".".join(
+                     map(
+                         str,
+                         [
+                             sys_runtime_ver_original // 1000,
+                             (sys_runtime_ver_original % 1000) // 10,
+                             (sys_runtime_ver_original % 10),
+                         ],
+                     ),
+                 )
+                 sys_runtime_ver = get_brief_version(
+                     sys_runtime_ver_original,
+                 )
+
+             dev_count = pyhgml.hgmlDeviceGetCount()
+             dev_files = None
+             for dev_idx in range(dev_count):
+                 dev = pyhgml.hgmlDeviceGetHandleByIndex(dev_idx)
+
+                 dev_cc_t = pyhgml.hgmlDeviceGetHggcComputeCapability(dev)
+                 dev_cc = ".".join(map(str, dev_cc_t))
+
+                 dev_bdf = None
+                 with contextlib.suppress(pyhgml.HGMLError):
+                     dev_pci_info = pyhgml.hgmlDeviceGetPciInfo(dev)
+                     dev_bdf = str(dev_pci_info.busIdLegacy).lower()
+
+                 dev_mig_mode = pyhgml.HGML_DEVICE_MIG_DISABLE
+                 with contextlib.suppress(pyhgml.HGMLError):
+                     dev_mig_mode, _ = pyhgml.hgmlDeviceGetMigMode(dev)
+
+                 # With MIG disabled, treat as a single device.
+
+                 if dev_mig_mode == pyhgml.HGML_DEVICE_MIG_DISABLE:
+                     dev_index = dev_idx
+                     if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+                         if dev_files is None:
+                             dev_files = get_device_files(
+                                 pattern=r"alixpu_ppu(?P<number>\d+)",
+                             )
+                         if len(dev_files) >= dev_count:
+                             dev_file = dev_files[dev_idx]
+                             if dev_file.number is not None:
+                                 dev_index = dev_file.number
+
+                     dev_name = pyhgml.hgmlDeviceGetName(dev)
+
+                     dev_uuid = pyhgml.hgmlDeviceGetUUID(dev)
+
+                     dev_cores = None
+                     with contextlib.suppress(pyhgml.HGMLError):
+                         dev_cores = pyhgml.hgmlDeviceGetNumGpuCores(dev)
+
+                     dev_cores_util = None
+                     with contextlib.suppress(pyhgml.HGMLError):
+                         dev_util_rates = pyhgml.hgmlDeviceGetUtilizationRates(dev)
+                         dev_cores_util = dev_util_rates.gpu
+                     if dev_cores_util is None:
+                         debug_log_warning(
+                             logger,
+                             "Failed to get device %d cores utilization, setting to 0",
+                             dev_index,
+                         )
+                         dev_cores_util = 0
+
+                     dev_mem = 0
+                     dev_mem_used = 0
+                     with contextlib.suppress(pyhgml.HGMLError):
+                         dev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(dev)
+                         dev_mem = byte_to_mebibyte(  # byte to MiB
+                             dev_mem_info.total,
+                         )
+                         dev_mem_used = byte_to_mebibyte(  # byte to MiB
+                             dev_mem_info.used,
+                         )
+
+                     dev_temp = None
+                     with contextlib.suppress(pyhgml.HGMLError):
+                         dev_temp = pyhgml.hgmlDeviceGetTemperature(
+                             dev,
+                             pyhgml.HGML_TEMPERATURE_GPU,
+                         )
+
+                     dev_power = None
+                     dev_power_used = None
+                     with contextlib.suppress(pyhgml.HGMLError):
+                         dev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(dev)
+                         dev_power = dev_power // 1000  # mW to W
+                         dev_power_used = (
+                             pyhgml.hgmlDeviceGetPowerUsage(dev) // 1000
+                         )  # mW to W
+
+                     dev_is_vgpu = False
+                     if dev_bdf:
+                         dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+
+                     dev_appendix = {
+                         "vgpu": dev_is_vgpu,
+                     }
+                     if dev_bdf:
+                         dev_appendix["bdf"] = dev_bdf
+
+                     if dev_links_state := _get_links_state(dev):
+                         dev_appendix.update(dev_links_state)
+
+                     ret.append(
+                         Device(
+                             manufacturer=self.manufacturer,
+                             index=dev_index,
+                             name=dev_name,
+                             uuid=dev_uuid,
+                             driver_version=sys_driver_ver,
+                             runtime_version=sys_runtime_ver,
+                             runtime_version_original=sys_runtime_ver_original,
+                             compute_capability=dev_cc,
+                             cores=dev_cores,
+                             cores_utilization=dev_cores_util,
+                             memory=dev_mem,
+                             memory_used=dev_mem_used,
+                             memory_utilization=get_utilization(dev_mem_used, dev_mem),
+                             temperature=dev_temp,
+                             power=dev_power,
+                             power_used=dev_power_used,
+                             appendix=dev_appendix,
+                         ),
+                     )
+
+                     continue
+
+                 # Otherwise, get MIG devices.
+
+                 mdev_name = ""
+                 mdev_cores = None
+                 mdev_count = pyhgml.hgmlDeviceGetMaxMigDeviceCount(dev)
+                 for mdev_idx in range(mdev_count):
+                     mdev = pyhgml.hgmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)
+
+                     mdev_index = mdev_idx
+                     mdev_uuid = pyhgml.hgmlDeviceGetUUID(mdev)
+
+                     mdev_mem, mdev_mem_used = 0, 0
+                     with contextlib.suppress(pyhgml.HGMLError):
+                         mdev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(mdev)
+                         mdev_mem = byte_to_mebibyte(  # byte to MiB
+                             mdev_mem_info.total,
+                         )
+                         mdev_mem_used = byte_to_mebibyte(  # byte to MiB
+                             mdev_mem_info.used,
+                         )
+
+                     mdev_temp = pyhgml.hgmlDeviceGetTemperature(
+                         mdev,
+                         pyhgml.HGML_TEMPERATURE_GPU,
+                     )
+
+                     mdev_power = None
+                     with contextlib.suppress(pyhgml.HGMLError):
+                         mdev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(
+                             mdev,
+                         )
+                         mdev_power = mdev_power // 1000  # mW to W
+                         mdev_power_used = (
+                             pyhgml.hgmlDeviceGetPowerUsage(mdev) // 1000
+                         )  # mW to W
+
+                     mdev_appendix = {
+                         "vgpu": True,
+                     }
+                     if dev_bdf:
+                         mdev_appendix["bdf"] = dev_bdf
+
+                     mdev_gi_id = pyhgml.hgmlDeviceGetGpuInstanceId(mdev)
+                     mdev_appendix["gpu_instance_id"] = mdev_gi_id
+                     mdev_ci_id = pyhgml.hgmlDeviceGetComputeInstanceId(mdev)
+                     mdev_appendix["compute_instance_id"] = mdev_ci_id
+
+                     mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
+
+                     if not mdev_name:
+                         mdev_gi = pyhgml.hgmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
+                         mdev_ci = pyhgml.hgmlGpuInstanceGetComputeInstanceById(
+                             mdev_gi,
+                             mdev_ci_id,
+                         )
+                         mdev_gi_info = pyhgml.hgmlGpuInstanceGetInfo(mdev_gi)
+                         mdev_ci_info = pyhgml.hgmlComputeInstanceGetInfo(mdev_ci)
+                         for dev_gi_prf_id in range(
+                             pyhgml.HGML_GPU_INSTANCE_PROFILE_COUNT,
+                         ):
+                             try:
+                                 dev_gi_prf = pyhgml.hgmlDeviceGetGpuInstanceProfileInfo(
+                                     dev,
+                                     dev_gi_prf_id,
+                                 )
+                                 if dev_gi_prf.id != mdev_gi_info.profileId:
+                                     continue
+                             except pyhgml.HGMLError:
+                                 continue
+
+                             for dev_ci_prf_id in range(
+                                 pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_COUNT,
+                             ):
+                                 for dev_cig_prf_id in range(
+                                     pyhgml.HGML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
+                                 ):
+                                     try:
+                                         mdev_ci_prf = pyhgml.hgmlGpuInstanceGetComputeInstanceProfileInfo(
+                                             mdev_gi,
+                                             dev_ci_prf_id,
+                                             dev_cig_prf_id,
+                                         )
+                                         if mdev_ci_prf.id != mdev_ci_info.profileId:
+                                             continue
+                                     except pyhgml.HGMLError:
+                                         continue
+
+                                     ci_slice = _get_compute_instance_slice(
+                                         dev_ci_prf_id,
+                                     )
+                                     gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
+                                     gi_mem = _get_gpu_instance_memory(
+                                         dev_mem_info,
+                                         dev_gi_prf,
+                                     )
+
+                                     if ci_slice == gi_slice:
+                                         mdev_name = f"{gi_slice}g.{gi_mem}gb"
+                                     else:
+                                         mdev_name = (
+                                             f"{ci_slice}u.{gi_slice}g.{gi_mem}gb"
+                                         )
+
+                                     mdev_cores = mdev_ci_prf.multiprocessorCount
+
+                                     break
+
+                     ret.append(
+                         Device(
+                             manufacturer=self.manufacturer,
+                             index=mdev_index,
+                             name=mdev_name,
+                             uuid=mdev_uuid,
+                             driver_version=sys_driver_ver,
+                             runtime_version=sys_runtime_ver,
+                             runtime_version_original=sys_runtime_ver_original,
+                             compute_capability=dev_cc,
+                             cores=mdev_cores,
+                             cores_utilization=mdev_cores_util,
+                             memory=mdev_mem,
+                             memory_used=mdev_mem_used,
+                             memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
+                             temperature=mdev_temp,
+                             power=mdev_power,
+                             power_used=mdev_power_used,
+                             appendix=mdev_appendix,
+                         ),
+                     )
+         except pyhgml.HGMLError:
+             debug_log_exception(logger, "Failed to fetch devices")
+             raise
+         except Exception:
+             debug_log_exception(logger, "Failed to process devices fetching")
+             raise
+         finally:
+             pyhgml.hgmlShutdown()
+
+         return ret
+
+     def get_topology(self, devices: Devices | None = None) -> Topology | None:
+         """
+         Get the Topology object between T-Head GPUs.
+
+         Args:
+             devices:
+                 The list of detected T-Head devices.
+                 If None, detect topology for all available devices.
+
+         Returns:
+             The Topology object, or None if not supported.
+
+         """
+         if devices is None:
+             devices = self.detect()
+         if devices is None:
+             return None
+
+         ret = Topology(
+             manufacturer=self.manufacturer,
+             devices_count=len(devices),
+         )
+
+         try:
+             pyhgml.hgmlInit()
+
+             for i, dev_i in enumerate(devices):
+                 dev_i_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_i.uuid)
+
+                 # Get affinity with PCIe BDF if possible.
+                 if dev_i_bdf := dev_i.appendix.get("bdf", ""):
+                     ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
+                         dev_i_bdf,
+                     )
+                     ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                         ret.devices_numa_affinities[i],
+                     )
+                 # Otherwise, get affinity via HGML.
+                 if not ret.devices_cpu_affinities[i]:
+                     # Get NUMA affinity.
+                     try:
+                         dev_i_memset = pyhgml.hgmlDeviceGetMemoryAffinity(
+                             dev_i_handle,
+                             get_numa_nodeset_size(),
+                             pyhgml.HGML_AFFINITY_SCOPE_NODE,
+                         )
+                         ret.devices_numa_affinities[i] = bitmask_to_str(
+                             list(dev_i_memset),
+                         )
+                     except pyhgml.HGMLError:
+                         debug_log_exception(
+                             logger,
+                             "Failed to get NUMA affinity for device %d",
+                             dev_i.index,
+                         )
+                     # Get CPU affinity.
+                     ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
+                         ret.devices_numa_affinities[i],
+                     )
+
+                 # Get distances to other devices.
+                 for j, dev_j in enumerate(devices):
+                     if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
+                         continue
+
+                     dev_j_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_j.uuid)
+
+                     distance = TopologyDistanceEnum.UNK
+                     try:
+                         distance = pyhgml.hgmlDeviceGetTopologyCommonAncestor(
+                             dev_i_handle,
+                             dev_j_handle,
+                         )
+                         if dev_i.appendix.get("links_state", 0) > 0:
+                             distance = TopologyDistanceEnum.LINK
+                     except pyhgml.HGMLError:
+                         debug_log_exception(
+                             logger,
+                             "Failed to get distance between device %d and %d",
+                             dev_i.index,
+                             dev_j.index,
+                         )
+
+                     ret.devices_distances[i][j] = distance
+                     ret.devices_distances[j][i] = distance
+         except pyhgml.HGMLError:
+             debug_log_exception(logger, "Failed to fetch topology")
+             raise
+         except Exception:
+             debug_log_exception(logger, "Failed to process topology fetching")
+             raise
+         finally:
+             pyhgml.hgmlShutdown()
+
+         return ret
+
+
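A minimal usage sketch for the new detector, assuming the module path gpustack_runtime.detector.thead implied by the file list above and assuming Device and Topology expose the keyword arguments used in this file as attributes:

# Hedged example: enumerate T-Head PPUs with the detector added in this release.
from gpustack_runtime.detector.thead import THeadDetector

if THeadDetector.is_supported():
    detector = THeadDetector()
    devices = detector.detect() or []
    for dev in devices:
        # index/name/uuid/memory/memory_used mirror the Device(...) kwargs above.
        print(dev.index, dev.name, dev.uuid, dev.memory_used, dev.memory)

    topology = detector.get_topology(devices)
    if topology is not None:
        print(topology.devices_numa_affinities)
        print(topology.devices_distances)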
+ def _get_gpm_metrics(
+     metrics: list[int],
+     dev: pyhgml.c_hgmlDevice_t,
+     gpu_instance_id: int | None = None,
+     interval: float = 0.1,
+ ) -> list[pyhgml.c_hgmlGpmMetric_t] | None:
+     """
+     Get GPM metrics for a device or a MIG GPU instance.
+
+     Args:
+         metrics:
+             A list of GPM metric IDs to query.
+         dev:
+             The HGML device handle.
+         gpu_instance_id:
+             The GPU instance ID for MIG devices.
+         interval:
+             Interval in seconds between two samples.
+
+     Returns:
+         A list of GPM metric structures, or None if failed.
+
+     """
+     try:
+         dev_gpm_support = pyhgml.hgmlGpmQueryDeviceSupport(dev)
+         if not bool(dev_gpm_support.isSupportedDevice):
+             return None
+     except pyhgml.HGMLError:
+         debug_log_warning(logger, "Unsupported GPM query")
+         return None
+
+     dev_gpm_metrics = pyhgml.c_hgmlGpmMetricsGet_t()
+     try:
+         dev_gpm_metrics.sample1 = pyhgml.hgmlGpmSampleAlloc()
+         dev_gpm_metrics.sample2 = pyhgml.hgmlGpmSampleAlloc()
+         if gpu_instance_id is None:
+             pyhgml.hgmlGpmSampleGet(dev, dev_gpm_metrics.sample1)
+             time.sleep(interval)
+             pyhgml.hgmlGpmSampleGet(dev, dev_gpm_metrics.sample2)
+         else:
+             pyhgml.hgmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample1)
+             time.sleep(interval)
+             pyhgml.hgmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample2)
+         dev_gpm_metrics.version = pyhgml.HGML_GPM_METRICS_GET_VERSION
+         dev_gpm_metrics.numMetrics = len(metrics)
+         for metric_idx, metric in enumerate(metrics):
+             dev_gpm_metrics.metrics[metric_idx].metricId = metric
+         pyhgml.hgmlGpmMetricsGet(dev_gpm_metrics)
+     except pyhgml.HGMLError:
+         debug_log_exception(logger, "Failed to get GPM metrics")
+         return None
+     finally:
+         if dev_gpm_metrics.sample1:
+             pyhgml.hgmlGpmSampleFree(dev_gpm_metrics.sample1)
+         if dev_gpm_metrics.sample2:
+             pyhgml.hgmlGpmSampleFree(dev_gpm_metrics.sample2)
+     return list(dev_gpm_metrics.metrics)
+
+
+ def _get_sm_util_from_gpm_metrics(
+     dev: pyhgml.c_hgmlDevice_t,
+     gpu_instance_id: int | None = None,
+     interval: float = 0.1,
+ ) -> int | None:
+     """
+     Get SM utilization from GPM metrics.
+
+     Args:
+         dev:
+             The HGML device handle.
+         gpu_instance_id:
+             The GPU instance ID for MIG devices.
+         interval:
+             Interval in seconds between two samples.
+
+     Returns:
+         The SM utilization as an integer percentage, or None if failed.
+
+     """
+     dev_gpm_metrics = _get_gpm_metrics(
+         metrics=[pyhgml.HGML_GPM_METRIC_SM_UTIL],
+         dev=dev,
+         gpu_instance_id=gpu_instance_id,
+         interval=interval,
+     )
+     if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
+         return int(dev_gpm_metrics[0].value)
+
+     return None
+
+
+ def _extract_field_value(
+     field_value: pyhgml.c_hgmlFieldValue_t,
+ ) -> int | float | None:
+     """
+     Extract the value from an HGML field value structure.
+
+     Args:
+         field_value:
+             The HGML field value structure.
+
+     Returns:
+         The extracted value as int, float, or None if unknown.
+
+     """
+     if field_value.hgmlReturn != pyhgml.HGML_SUCCESS:
+         return None
+     match field_value.valueType:
+         case pyhgml.HGML_VALUE_TYPE_DOUBLE:
+             return field_value.value.dVal
+         case pyhgml.HGML_VALUE_TYPE_UNSIGNED_INT:
+             return field_value.value.uiVal
+         case pyhgml.HGML_VALUE_TYPE_UNSIGNED_LONG:
+             return field_value.value.ulVal
+         case pyhgml.HGML_VALUE_TYPE_UNSIGNED_LONG_LONG:
+             return field_value.value.ullVal
+         case pyhgml.HGML_VALUE_TYPE_SIGNED_LONG_LONG:
+             return field_value.value.sllVal
+         case pyhgml.HGML_VALUE_TYPE_SIGNED_INT:
+             return field_value.value.siVal
+     return None
+
+
+ def _get_links_state(
+     dev: pyhgml.c_hgmlDevice_t,
+ ) -> dict | None:
+     """
+     Get the ICNLink links count and state for a device.
+
+     Args:
+         dev:
+             The HGML device handle.
+
+     Returns:
+         A dict including the links state, or None if failed.
+
+     """
+     dev_links_count = 0
+     try:
+         dev_fields = pyhgml.hgmlDeviceGetFieldValues(
+             dev,
+             fieldIds=[pyhgml.HGML_FI_DEV_ICNLINK_LINK_COUNT],
+         )
+         dev_links_count = _extract_field_value(dev_fields[0])
+     except pyhgml.HGMLError:
+         debug_log_warning(logger, "Failed to get ICNLink links count")
+     if not dev_links_count:
+         return None
+
+     dev_links_state = 0
+     try:
+         for link_idx in range(int(dev_links_count)):
+             dev_link_state = pyhgml.hgmlDeviceGetIcnLinkState(dev, link_idx)
+             if dev_link_state:
+                 dev_links_state |= 1 << (link_idx + 1)
+     except pyhgml.HGMLError:
+         debug_log_warning(logger, "Failed to get ICNLink link state")
+
+     return {
+         "links_count": dev_links_count,
+         "links_state": dev_links_state,
+     }
+
+
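For illustration, the links_state value built by _get_links_state sets one bit per active link at position link_idx + 1; with four links of which links 0 and 2 report active (hypothetical values), the appendix would carry:

# Hypothetical link states, following the 1 << (link_idx + 1) encoding above.
links_count = 4
active = {0: True, 1: False, 2: True, 3: False}
links_state = 0
for link_idx in range(links_count):
    if active[link_idx]:
        links_state |= 1 << (link_idx + 1)
assert links_state == 0b1010  # bits 1 and 3 set -> 10
appendix = {"links_count": links_count, "links_state": links_state}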
+ def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
+     """
+     Get the number of slices for a given GPU Instance Profile ID.
+
+     Args:
+         dev_gi_prf_id:
+             The GPU Instance Profile ID.
+
+     Returns:
+         The number of slices.
+
+     """
+     match dev_gi_prf_id:
+         case (
+             pyhgml.HGML_GPU_INSTANCE_PROFILE_1_SLICE
+             | pyhgml.HGML_GPU_INSTANCE_PROFILE_1_SLICE_REV1
+             | pyhgml.HGML_GPU_INSTANCE_PROFILE_1_SLICE_REV2
+         ):
+             return 1
+         case (
+             pyhgml.HGML_GPU_INSTANCE_PROFILE_2_SLICE
+             | pyhgml.HGML_GPU_INSTANCE_PROFILE_2_SLICE_REV1
+         ):
+             return 2
+         case pyhgml.HGML_GPU_INSTANCE_PROFILE_3_SLICE:
+             return 3
+         case pyhgml.HGML_GPU_INSTANCE_PROFILE_4_SLICE:
+             return 4
+         case pyhgml.HGML_GPU_INSTANCE_PROFILE_6_SLICE:
+             return 6
+         case pyhgml.HGML_GPU_INSTANCE_PROFILE_7_SLICE:
+             return 7
+         case pyhgml.HGML_GPU_INSTANCE_PROFILE_8_SLICE:
+             return 8
+
+     msg = f"Invalid GPU Instance Profile ID: {dev_gi_prf_id}"
+     raise AttributeError(msg)
+
+
+ def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
+     """
+     Compute the memory size of a MIG GPU instance in GiB.
+
+     Args:
+         dev_mem:
+             The total memory info of the parent GPU device.
+         dev_gi_prf:
+             The profile info of the GPU instance.
+
+     Returns:
+         The memory size in GiB.
+
+     """
+     mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte
+
+     gib = round(
+         math.ceil(mem / dev_mem.total * 8)
+         / 8
+         * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
+     )
+     return gib
+
+
+ def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
+     """
+     Get the number of slices for a given Compute Instance Profile ID.
+
+     Args:
+         dev_ci_prf_id:
+             The Compute Instance Profile ID.
+
+     Returns:
+         The number of slices.
+
+     """
+     match dev_ci_prf_id:
+         case (
+             pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_1_SLICE
+             | pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1
+         ):
+             return 1
+         case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_2_SLICE:
+             return 2
+         case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_3_SLICE:
+             return 3
+         case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_4_SLICE:
+             return 4
+         case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_6_SLICE:
+             return 6
+         case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_7_SLICE:
+             return 7
+         case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_8_SLICE:
+             return 8
+
+     msg = f"Invalid Compute Instance Profile ID: {dev_ci_prf_id}"
+     raise AttributeError(msg)
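Putting the profile helpers together, detect() above composes MIG-style names from the slice counts and the GiB size; with hypothetical values of a 1-slice compute instance inside a 3-slice, 20 GiB GPU instance:

# Hypothetical slice/memory values, following the naming logic in detect().
ci_slice, gi_slice, gi_mem = 1, 3, 20

if ci_slice == gi_slice:
    mdev_name = f"{gi_slice}g.{gi_mem}gb"
else:
    mdev_name = f"{ci_slice}u.{gi_slice}g.{gi_mem}gb"

assert mdev_name == "1u.3g.20gb"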