gpustack-runtime 0.1.39.post2__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpustack_runtime/__main__.py +7 -3
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/cmds/__init__.py +2 -0
- gpustack_runtime/cmds/deployer.py +84 -2
- gpustack_runtime/cmds/images.py +2 -0
- gpustack_runtime/deployer/__init__.py +2 -0
- gpustack_runtime/deployer/__types__.py +52 -28
- gpustack_runtime/deployer/__utils__.py +99 -112
- gpustack_runtime/deployer/cdi/__init__.py +81 -0
- gpustack_runtime/deployer/cdi/__types__.py +667 -0
- gpustack_runtime/deployer/cdi/thead.py +103 -0
- gpustack_runtime/deployer/docker.py +42 -24
- gpustack_runtime/deployer/kuberentes.py +8 -4
- gpustack_runtime/deployer/podman.py +41 -23
- gpustack_runtime/detector/__init__.py +62 -3
- gpustack_runtime/detector/__types__.py +11 -0
- gpustack_runtime/detector/__utils__.py +23 -0
- gpustack_runtime/detector/amd.py +17 -9
- gpustack_runtime/detector/hygon.py +6 -1
- gpustack_runtime/detector/iluvatar.py +20 -5
- gpustack_runtime/detector/mthreads.py +8 -12
- gpustack_runtime/detector/nvidia.py +365 -168
- gpustack_runtime/detector/pyacl/__init__.py +9 -1
- gpustack_runtime/detector/pyamdgpu/__init__.py +8 -0
- gpustack_runtime/detector/pycuda/__init__.py +9 -1
- gpustack_runtime/detector/pydcmi/__init__.py +9 -2
- gpustack_runtime/detector/pyhgml/__init__.py +5879 -0
- gpustack_runtime/detector/pyhgml/libhgml.so +0 -0
- gpustack_runtime/detector/pyhgml/libuki.so +0 -0
- gpustack_runtime/detector/pyhsa/__init__.py +9 -0
- gpustack_runtime/detector/pyixml/__init__.py +89 -164
- gpustack_runtime/detector/pyrocmcore/__init__.py +42 -24
- gpustack_runtime/detector/pyrocmsmi/__init__.py +141 -138
- gpustack_runtime/detector/thead.py +733 -0
- gpustack_runtime/envs.py +128 -55
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/METADATA +4 -2
- gpustack_runtime-0.1.40.dist-info/RECORD +55 -0
- gpustack_runtime/detector/pymtml/__init__.py +0 -770
- gpustack_runtime-0.1.39.post2.dist-info/RECORD +0 -49
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.40.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/detector/thead.py
@@ -0,0 +1,733 @@
from __future__ import annotations

import contextlib
import logging
import math
import time
from functools import lru_cache

from .. import envs
from ..logging import debug_log_exception, debug_log_warning
from . import pyhgml
from .__types__ import (
    Detector,
    Device,
    Devices,
    ManufacturerEnum,
    Topology,
    TopologyDistanceEnum,
)
from .__utils__ import (
    PCIDevice,
    bitmask_to_str,
    byte_to_mebibyte,
    get_brief_version,
    get_device_files,
    get_numa_node_by_bdf,
    get_numa_nodeset_size,
    get_pci_devices,
    get_physical_function_by_bdf,
    get_utilization,
    map_numa_node_to_cpu_affinity,
)

logger = logging.getLogger(__name__)


class THeadDetector(Detector):
    """
    Detect T-Head PPUs.
    """

    @staticmethod
    @lru_cache
    def is_supported() -> bool:
        """
        Check if the T-Head detector is supported.

        Returns:
            True if supported, False otherwise.

        """
        supported = False
        if envs.GPUSTACK_RUNTIME_DETECT.lower() not in ("auto", "thead"):
            logger.debug("T-Head detection is disabled by environment variable")
            return supported

        pci_devs = THeadDetector.detect_pci_devices()
        if not pci_devs and not envs.GPUSTACK_RUNTIME_DETECT_NO_PCI_CHECK:
            logger.debug("No T-Head PCI devices found")
            return supported

        try:
            pyhgml.hgmlInit()
            pyhgml.hgmlShutdown()
            supported = True
        except pyhgml.HGMLError:
            debug_log_exception(logger, "Failed to initialize HGML library")

        return supported

    @staticmethod
    @lru_cache
    def detect_pci_devices() -> dict[str, PCIDevice]:
        # See https://pcisig.com/membership/member-companies?combine=Alibaba.
        pci_devs = get_pci_devices(vendor="0x1ded")
        if not pci_devs:
            return {}
        return {dev.address: dev for dev in pci_devs}

    def __init__(self):
        super().__init__(ManufacturerEnum.THEAD)

    def detect(self) -> Devices | None:
        """
        Detect T-Head GPUs using pyhgml.

        Returns:
            A list of detected T-Head GPU devices,
            or None if not supported.

        Raises:
            If there is an error during detection.

        """
        if not self.is_supported():
            return None

        ret: Devices = []

        try:
            pyhgml.hgmlInit()

            sys_driver_ver = pyhgml.hgmlSystemGetDriverVersion()

            sys_runtime_ver_original = None
            sys_runtime_ver = None
            with contextlib.suppress(pyhgml.HGMLError):
                sys_runtime_ver_original = pyhgml.hgmlSystemGetHggcDriverVersion()
                sys_runtime_ver_original = ".".join(
                    map(
                        str,
                        [
                            sys_runtime_ver_original // 1000,
                            (sys_runtime_ver_original % 1000) // 10,
                            (sys_runtime_ver_original % 10),
                        ],
                    ),
                )
                sys_runtime_ver = get_brief_version(
                    sys_runtime_ver_original,
                )

            dev_count = pyhgml.hgmlDeviceGetCount()
            dev_files = None
            for dev_idx in range(dev_count):
                dev = pyhgml.hgmlDeviceGetHandleByIndex(dev_idx)

                dev_cc_t = pyhgml.hgmlDeviceGetHggcComputeCapability(dev)
                dev_cc = ".".join(map(str, dev_cc_t))

                dev_bdf = None
                with contextlib.suppress(pyhgml.HGMLError):
                    dev_pci_info = pyhgml.hgmlDeviceGetPciInfo(dev)
                    dev_bdf = str(dev_pci_info.busIdLegacy).lower()

                dev_mig_mode = pyhgml.HGML_DEVICE_MIG_DISABLE
                with contextlib.suppress(pyhgml.HGMLError):
                    dev_mig_mode, _ = pyhgml.hgmlDeviceGetMigMode(dev)

                # With MIG disabled, treat as a single device.

                if dev_mig_mode == pyhgml.HGML_DEVICE_MIG_DISABLE:
                    dev_index = dev_idx
                    if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
                        if dev_files is None:
                            dev_files = get_device_files(
                                pattern=r"alixpu_ppu(?P<number>\d+)",
                            )
                        if len(dev_files) >= dev_count:
                            dev_file = dev_files[dev_idx]
                            if dev_file.number is not None:
                                dev_index = dev_file.number

                    dev_name = pyhgml.hgmlDeviceGetName(dev)

                    dev_uuid = pyhgml.hgmlDeviceGetUUID(dev)

                    dev_cores = None
                    with contextlib.suppress(pyhgml.HGMLError):
                        dev_cores = pyhgml.hgmlDeviceGetNumGpuCores(dev)

                    dev_cores_util = None
                    with contextlib.suppress(pyhgml.HGMLError):
                        dev_util_rates = pyhgml.hgmlDeviceGetUtilizationRates(dev)
                        dev_cores_util = dev_util_rates.gpu
                    if dev_cores_util is None:
                        debug_log_warning(
                            logger,
                            "Failed to get device %d cores utilization, setting to 0",
                            dev_index,
                        )
                        dev_cores_util = 0

                    dev_mem = 0
                    dev_mem_used = 0
                    with contextlib.suppress(pyhgml.HGMLError):
                        dev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(dev)
                        dev_mem = byte_to_mebibyte(  # byte to MiB
                            dev_mem_info.total,
                        )
                        dev_mem_used = byte_to_mebibyte(  # byte to MiB
                            dev_mem_info.used,
                        )

                    dev_temp = None
                    with contextlib.suppress(pyhgml.HGMLError):
                        dev_temp = pyhgml.hgmlDeviceGetTemperature(
                            dev,
                            pyhgml.HGML_TEMPERATURE_GPU,
                        )

                    dev_power = None
                    dev_power_used = None
                    with contextlib.suppress(pyhgml.HGMLError):
                        dev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(dev)
                        dev_power = dev_power // 1000  # mW to W
                        dev_power_used = (
                            pyhgml.hgmlDeviceGetPowerUsage(dev) // 1000
                        )  # mW to W

                    dev_is_vgpu = False
                    if dev_bdf:
                        dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf

                    dev_appendix = {
                        "vgpu": dev_is_vgpu,
                    }
                    if dev_bdf:
                        dev_appendix["bdf"] = dev_bdf

                    if dev_links_state := _get_links_state(dev):
                        dev_appendix.update(dev_links_state)

                    ret.append(
                        Device(
                            manufacturer=self.manufacturer,
                            index=dev_index,
                            name=dev_name,
                            uuid=dev_uuid,
                            driver_version=sys_driver_ver,
                            runtime_version=sys_runtime_ver,
                            runtime_version_original=sys_runtime_ver_original,
                            compute_capability=dev_cc,
                            cores=dev_cores,
                            cores_utilization=dev_cores_util,
                            memory=dev_mem,
                            memory_used=dev_mem_used,
                            memory_utilization=get_utilization(dev_mem_used, dev_mem),
                            temperature=dev_temp,
                            power=dev_power,
                            power_used=dev_power_used,
                            appendix=dev_appendix,
                        ),
                    )

                    continue

                # Otherwise, get MIG devices.

                mdev_name = ""
                mdev_cores = None
                mdev_count = pyhgml.hgmlDeviceGetMaxMigDeviceCount(dev)
                for mdev_idx in range(mdev_count):
                    mdev = pyhgml.hgmlDeviceGetMigDeviceHandleByIndex(dev, mdev_idx)

                    mdev_index = mdev_idx
                    mdev_uuid = pyhgml.hgmlDeviceGetUUID(mdev)

                    mdev_mem, mdev_mem_used = 0, 0
                    with contextlib.suppress(pyhgml.HGMLError):
                        mdev_mem_info = pyhgml.hgmlDeviceGetMemoryInfo(mdev)
                        mdev_mem = byte_to_mebibyte(  # byte to MiB
                            mdev_mem_info.total,
                        )
                        mdev_mem_used = byte_to_mebibyte(  # byte to MiB
                            mdev_mem_info.used,
                        )

                    mdev_temp = pyhgml.hgmlDeviceGetTemperature(
                        mdev,
                        pyhgml.HGML_TEMPERATURE_GPU,
                    )

                    mdev_power = None
                    mdev_power_used = None
                    with contextlib.suppress(pyhgml.HGMLError):
                        mdev_power = pyhgml.hgmlDeviceGetPowerManagementDefaultLimit(
                            mdev,
                        )
                        mdev_power = mdev_power // 1000  # mW to W
                        mdev_power_used = (
                            pyhgml.hgmlDeviceGetPowerUsage(mdev) // 1000
                        )  # mW to W

                    mdev_appendix = {
                        "vgpu": True,
                    }
                    if dev_bdf:
                        mdev_appendix["bdf"] = dev_bdf

                    mdev_gi_id = pyhgml.hgmlDeviceGetGpuInstanceId(mdev)
                    mdev_appendix["gpu_instance_id"] = mdev_gi_id
                    mdev_ci_id = pyhgml.hgmlDeviceGetComputeInstanceId(mdev)
                    mdev_appendix["compute_instance_id"] = mdev_ci_id

                    mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)

                    if not mdev_name:
                        mdev_gi = pyhgml.hgmlDeviceGetGpuInstanceById(dev, mdev_gi_id)
                        mdev_ci = pyhgml.hgmlGpuInstanceGetComputeInstanceById(
                            mdev_gi,
                            mdev_ci_id,
                        )
                        mdev_gi_info = pyhgml.hgmlGpuInstanceGetInfo(mdev_gi)
                        mdev_ci_info = pyhgml.hgmlComputeInstanceGetInfo(mdev_ci)
                        for dev_gi_prf_id in range(
                            pyhgml.HGML_GPU_INSTANCE_PROFILE_COUNT,
                        ):
                            try:
                                dev_gi_prf = pyhgml.hgmlDeviceGetGpuInstanceProfileInfo(
                                    dev,
                                    dev_gi_prf_id,
                                )
                                if dev_gi_prf.id != mdev_gi_info.profileId:
                                    continue
                            except pyhgml.HGMLError:
                                continue

                            for dev_ci_prf_id in range(
                                pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_COUNT,
                            ):
                                for dev_cig_prf_id in range(
                                    pyhgml.HGML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT,
                                ):
                                    try:
                                        mdev_ci_prf = pyhgml.hgmlGpuInstanceGetComputeInstanceProfileInfo(
                                            mdev_gi,
                                            dev_ci_prf_id,
                                            dev_cig_prf_id,
                                        )
                                        if mdev_ci_prf.id != mdev_ci_info.profileId:
                                            continue
                                    except pyhgml.HGMLError:
                                        continue

                                    ci_slice = _get_compute_instance_slice(
                                        dev_ci_prf_id,
                                    )
                                    gi_slice = _get_gpu_instance_slice(dev_gi_prf_id)
                                    gi_mem = _get_gpu_instance_memory(
                                        dev_mem_info,
                                        dev_gi_prf,
                                    )

                                    if ci_slice == gi_slice:
                                        mdev_name = f"{gi_slice}g.{gi_mem}gb"
                                    else:
                                        mdev_name = (
                                            f"{ci_slice}u.{gi_slice}g.{gi_mem}gb"
                                        )

                                    mdev_cores = mdev_ci_prf.multiprocessorCount

                                    break

                    ret.append(
                        Device(
                            manufacturer=self.manufacturer,
                            index=mdev_index,
                            name=mdev_name,
                            uuid=mdev_uuid,
                            driver_version=sys_driver_ver,
                            runtime_version=sys_runtime_ver,
                            runtime_version_original=sys_runtime_ver_original,
                            compute_capability=dev_cc,
                            cores=mdev_cores,
                            cores_utilization=mdev_cores_util,
                            memory=mdev_mem,
                            memory_used=mdev_mem_used,
                            memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
                            temperature=mdev_temp,
                            power=mdev_power,
                            power_used=mdev_power_used,
                            appendix=mdev_appendix,
                        ),
                    )
        except pyhgml.HGMLError:
            debug_log_exception(logger, "Failed to fetch devices")
            raise
        except Exception:
            debug_log_exception(logger, "Failed to process devices fetching")
            raise
        finally:
            pyhgml.hgmlShutdown()

        return ret

    def get_topology(self, devices: Devices | None = None) -> Topology | None:
        """
        Get the Topology object between T-Head PPUs.

        Args:
            devices:
                The list of detected T-Head devices.
                If None, detect topology for all available devices.

        Returns:
            The Topology object, or None if not supported.

        """
        if devices is None:
            devices = self.detect()
        if devices is None:
            return None

        ret = Topology(
            manufacturer=self.manufacturer,
            devices_count=len(devices),
        )

        try:
            pyhgml.hgmlInit()

            for i, dev_i in enumerate(devices):
                dev_i_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_i.uuid)

                # Get affinity with PCIe BDF if possible.
                if dev_i_bdf := dev_i.appendix.get("bdf", ""):
                    ret.devices_numa_affinities[i] = get_numa_node_by_bdf(
                        dev_i_bdf,
                    )
                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
                        ret.devices_numa_affinities[i],
                    )
                # Otherwise, get affinity via HGML.
                if not ret.devices_cpu_affinities[i]:
                    # Get NUMA affinity.
                    try:
                        dev_i_memset = pyhgml.hgmlDeviceGetMemoryAffinity(
                            dev_i_handle,
                            get_numa_nodeset_size(),
                            pyhgml.HGML_AFFINITY_SCOPE_NODE,
                        )
                        ret.devices_numa_affinities[i] = bitmask_to_str(
                            list(dev_i_memset),
                        )
                    except pyhgml.HGMLError:
                        debug_log_exception(
                            logger,
                            "Failed to get NUMA affinity for device %d",
                            dev_i.index,
                        )
                    # Get CPU affinity.
                    ret.devices_cpu_affinities[i] = map_numa_node_to_cpu_affinity(
                        ret.devices_numa_affinities[i],
                    )

                # Get distances to other devices.
                for j, dev_j in enumerate(devices):
                    if dev_i.index == dev_j.index or ret.devices_distances[i][j] != 0:
                        continue

                    dev_j_handle = pyhgml.hgmlDeviceGetHandleByUUID(dev_j.uuid)

                    distance = TopologyDistanceEnum.UNK
                    try:
                        distance = pyhgml.hgmlDeviceGetTopologyCommonAncestor(
                            dev_i_handle,
                            dev_j_handle,
                        )
                        if dev_i.appendix.get("links_state", 0) > 0:
                            distance = TopologyDistanceEnum.LINK
                    except pyhgml.HGMLError:
                        debug_log_exception(
                            logger,
                            "Failed to get distance between device %d and %d",
                            dev_i.index,
                            dev_j.index,
                        )

                    ret.devices_distances[i][j] = distance
                    ret.devices_distances[j][i] = distance
        except pyhgml.HGMLError:
            debug_log_exception(logger, "Failed to fetch topology")
            raise
        except Exception:
            debug_log_exception(logger, "Failed to process topology fetching")
            raise
        finally:
            pyhgml.hgmlShutdown()

        return ret


def _get_gpm_metrics(
    metrics: list[int],
    dev: pyhgml.c_hgmlDevice_t,
    gpu_instance_id: int | None = None,
    interval: float = 0.1,
) -> list[pyhgml.c_hgmlGpmMetric_t] | None:
    """
    Get GPM metrics for a device or a MIG GPU instance.

    Args:
        metrics:
            A list of GPM metric IDs to query.
        dev:
            The HGML device handle.
        gpu_instance_id:
            The GPU instance ID for MIG devices.
        interval:
            Interval in seconds between two samples.

    Returns:
        A list of GPM metric structures, or None if failed.

    """
    try:
        dev_gpm_support = pyhgml.hgmlGpmQueryDeviceSupport(dev)
        if not bool(dev_gpm_support.isSupportedDevice):
            return None
    except pyhgml.HGMLError:
        debug_log_warning(logger, "Unsupported GPM query")
        return None

    dev_gpm_metrics = pyhgml.c_hgmlGpmMetricsGet_t()
    try:
        dev_gpm_metrics.sample1 = pyhgml.hgmlGpmSampleAlloc()
        dev_gpm_metrics.sample2 = pyhgml.hgmlGpmSampleAlloc()
        if gpu_instance_id is None:
            pyhgml.hgmlGpmSampleGet(dev, dev_gpm_metrics.sample1)
            time.sleep(interval)
            pyhgml.hgmlGpmSampleGet(dev, dev_gpm_metrics.sample2)
        else:
            pyhgml.hgmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample1)
            time.sleep(interval)
            pyhgml.hgmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample2)
        dev_gpm_metrics.version = pyhgml.HGML_GPM_METRICS_GET_VERSION
        dev_gpm_metrics.numMetrics = len(metrics)
        for metric_idx, metric in enumerate(metrics):
            dev_gpm_metrics.metrics[metric_idx].metricId = metric
        pyhgml.hgmlGpmMetricsGet(dev_gpm_metrics)
    except pyhgml.HGMLError:
        debug_log_exception(logger, "Failed to get GPM metrics")
        return None
    finally:
        if dev_gpm_metrics.sample1:
            pyhgml.hgmlGpmSampleFree(dev_gpm_metrics.sample1)
        if dev_gpm_metrics.sample2:
            pyhgml.hgmlGpmSampleFree(dev_gpm_metrics.sample2)
    return list(dev_gpm_metrics.metrics)


def _get_sm_util_from_gpm_metrics(
    dev: pyhgml.c_hgmlDevice_t,
    gpu_instance_id: int | None = None,
    interval: float = 0.1,
) -> int | None:
    """
    Get SM utilization from GPM metrics.

    Args:
        dev:
            The HGML device handle.
        gpu_instance_id:
            The GPU instance ID for MIG devices.
        interval:
            Interval in seconds between two samples.

    Returns:
        The SM utilization as an integer percentage, or None if failed.

    """
    dev_gpm_metrics = _get_gpm_metrics(
        metrics=[pyhgml.HGML_GPM_METRIC_SM_UTIL],
        dev=dev,
        gpu_instance_id=gpu_instance_id,
        interval=interval,
    )
    if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
        return int(dev_gpm_metrics[0].value)

    return None


def _extract_field_value(
    field_value: pyhgml.c_hgmlFieldValue_t,
) -> int | float | None:
    """
    Extract the value from a HGML field value structure.

    Args:
        field_value:
            The HGML field value structure.

    Returns:
        The extracted value as int, float, or None if unknown.

    """
    if field_value.hgmlReturn != pyhgml.HGML_SUCCESS:
        return None
    match field_value.valueType:
        case pyhgml.HGML_VALUE_TYPE_DOUBLE:
            return field_value.value.dVal
        case pyhgml.HGML_VALUE_TYPE_UNSIGNED_INT:
            return field_value.value.uiVal
        case pyhgml.HGML_VALUE_TYPE_UNSIGNED_LONG:
            return field_value.value.ulVal
        case pyhgml.HGML_VALUE_TYPE_UNSIGNED_LONG_LONG:
            return field_value.value.ullVal
        case pyhgml.HGML_VALUE_TYPE_SIGNED_LONG_LONG:
            return field_value.value.sllVal
        case pyhgml.HGML_VALUE_TYPE_SIGNED_INT:
            return field_value.value.siVal
    return None


def _get_links_state(
    dev: pyhgml.c_hgmlDevice_t,
) -> dict | None:
    """
    Get the ICNLink links count and state for a device.

    Args:
        dev:
            The HGML device handle.

    Returns:
        A dict that includes the links state, or None if failed.

    """
    dev_links_count = 0
    try:
        dev_fields = pyhgml.hgmlDeviceGetFieldValues(
            dev,
            fieldIds=[pyhgml.HGML_FI_DEV_ICNLINK_LINK_COUNT],
        )
        dev_links_count = _extract_field_value(dev_fields[0])
    except pyhgml.HGMLError:
        debug_log_warning(logger, "Failed to get ICNLink links count")
    if not dev_links_count:
        return None

    dev_links_state = 0
    try:
        for link_idx in range(int(dev_links_count)):
            dev_link_state = pyhgml.hgmlDeviceGetIcnLinkState(dev, link_idx)
            if dev_link_state:
                dev_links_state |= 1 << (link_idx + 1)
    except pyhgml.HGMLError:
        debug_log_warning(logger, "Failed to get ICNLink link state")

    return {
        "links_count": dev_links_count,
        "links_state": dev_links_state,
    }


def _get_gpu_instance_slice(dev_gi_prf_id: int) -> int:
    """
    Get the number of slices for a given GPU Instance Profile ID.

    Args:
        dev_gi_prf_id:
            The GPU Instance Profile ID.

    Returns:
        The number of slices.

    """
    match dev_gi_prf_id:
        case (
            pyhgml.HGML_GPU_INSTANCE_PROFILE_1_SLICE
            | pyhgml.HGML_GPU_INSTANCE_PROFILE_1_SLICE_REV1
            | pyhgml.HGML_GPU_INSTANCE_PROFILE_1_SLICE_REV2
        ):
            return 1
        case (
            pyhgml.HGML_GPU_INSTANCE_PROFILE_2_SLICE
            | pyhgml.HGML_GPU_INSTANCE_PROFILE_2_SLICE_REV1
        ):
            return 2
        case pyhgml.HGML_GPU_INSTANCE_PROFILE_3_SLICE:
            return 3
        case pyhgml.HGML_GPU_INSTANCE_PROFILE_4_SLICE:
            return 4
        case pyhgml.HGML_GPU_INSTANCE_PROFILE_6_SLICE:
            return 6
        case pyhgml.HGML_GPU_INSTANCE_PROFILE_7_SLICE:
            return 7
        case pyhgml.HGML_GPU_INSTANCE_PROFILE_8_SLICE:
            return 8

    msg = f"Invalid GPU Instance Profile ID: {dev_gi_prf_id}"
    raise AttributeError(msg)


def _get_gpu_instance_memory(dev_mem, dev_gi_prf) -> int:
    """
    Compute the memory size of a MIG compute instance in GiB.

    Args:
        dev_mem:
            The total memory info of the parent GPU device.
        dev_gi_prf:
            The profile info of the GPU instance.

    Returns:
        The memory size in GiB.

    """
    mem = dev_gi_prf.memorySizeMB * (1 << 20)  # MiB to byte

    gib = round(
        math.ceil(mem / dev_mem.total * 8)
        / 8
        * ((dev_mem.total + (1 << 30) - 1) / (1 << 30)),
    )
    return gib


def _get_compute_instance_slice(dev_ci_prf_id: int) -> int:
    """
    Get the number of slices for a given Compute Instance Profile ID.

    Args:
        dev_ci_prf_id:
            The Compute Instance Profile ID.

    Returns:
        The number of slices.

    """
    match dev_ci_prf_id:
        case (
            pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_1_SLICE
            | pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1
        ):
            return 1
        case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_2_SLICE:
            return 2
        case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_3_SLICE:
            return 3
        case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_4_SLICE:
            return 4
        case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_6_SLICE:
            return 6
        case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_7_SLICE:
            return 7
        case pyhgml.HGML_COMPUTE_INSTANCE_PROFILE_8_SLICE:
            return 8

    msg = f"Invalid Compute Instance Profile ID: {dev_ci_prf_id}"
    raise AttributeError(msg)
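
For orientation, a minimal sketch of how the detector added in this release might be driven; the import path and attribute names are taken from the diff above, but the snippet itself is illustrative and not part of the package:

# Illustrative sketch only: exercises the THeadDetector introduced in 0.1.40.
# Assumes gpustack-runtime 0.1.40 is installed; on hosts without T-Head PPUs
# (PCI vendor 0x1ded), is_supported() returns False and nothing is listed.
from gpustack_runtime.detector.thead import THeadDetector

detector = THeadDetector()
if detector.is_supported():
    for dev in detector.detect() or []:
        print(
            f"[{dev.index}] {dev.name} uuid={dev.uuid} "
            f"mem={dev.memory_used}/{dev.memory} MiB "
            f"cores_util={dev.cores_utilization}%"
        )
    topology = detector.get_topology()
    if topology is not None:
        print("NUMA affinities:", topology.devices_numa_affinities)

Note that the MIG naming built in detect() mirrors the NVIDIA-style profile convention, so a GPU instance spanning 3 slices with roughly 20 GiB of memory is reported as 3g.20gb.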