gpustack-runtime 0.1.39.post2-py3-none-any.whl → 0.1.39.post3-py3-none-any.whl
- gpustack_runtime/_version.py +2 -2
- gpustack_runtime/_version_appendix.py +1 -1
- gpustack_runtime/deployer/docker.py +6 -2
- gpustack_runtime/deployer/podman.py +6 -2
- gpustack_runtime/detector/__utils__.py +23 -0
- gpustack_runtime/detector/amd.py +17 -9
- gpustack_runtime/detector/hygon.py +6 -1
- gpustack_runtime/detector/iluvatar.py +10 -2
- gpustack_runtime/detector/mthreads.py +8 -12
- gpustack_runtime/detector/nvidia.py +194 -86
- gpustack_runtime/detector/pyrocmsmi/__init__.py +3 -9
- gpustack_runtime/envs.py +1 -1
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/METADATA +3 -2
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/RECORD +17 -18
- gpustack_runtime/detector/pymtml/__init__.py +0 -770
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/WHEEL +0 -0
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/entry_points.txt +0 -0
- {gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/licenses/LICENSE +0 -0
gpustack_runtime/_version.py
CHANGED

```diff
@@ -27,8 +27,8 @@ version_tuple: VERSION_TUPLE
     __commit_id__: COMMIT_ID
     commit_id: COMMIT_ID
 
-__version__ = version = '0.1.39.post2'
-__version_tuple__ = version_tuple = (0, 1, 39, 'post2')
+__version__ = version = '0.1.39.post3'
+__version_tuple__ = version_tuple = (0, 1, 39, 'post3')
 
 try:
     from ._version_appendix import git_commit
     __commit_id__ = commit_id = git_commit
```

gpustack_runtime/_version_appendix.py
CHANGED

```diff
@@ -1 +1 @@
-git_commit = "
+git_commit = "d65920e"
```
gpustack_runtime/deployer/docker.py
CHANGED

```diff
@@ -1213,8 +1213,12 @@ class DockerDeployer(EndoscopicDeployer):
         self_container_envs: dict[str, str] = dict(
             item.split("=", 1) for item in self_container.attrs["Config"].get("Env", [])
         )
-        self_image_envs: dict[str, str] = dict(
-            item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
+        self_image_envs: dict[str, str] = (
+            dict(
+                item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
+            )
+            if self_image.attrs["Config"]
+            else {}
         )
         mirrored_envs: dict[str, str] = {
             # Filter out gpustack-internal envs and same-as-image envs.
```
gpustack_runtime/deployer/podman.py
CHANGED

```diff
@@ -1189,8 +1189,12 @@ class PodmanDeployer(EndoscopicDeployer):
         self_container_envs: dict[str, str] = dict(
             item.split("=", 1) for item in self_container.attrs["Config"].get("Env", [])
         )
-        self_image_envs: dict[str, str] = dict(
-            item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
+        self_image_envs: dict[str, str] = (
+            dict(
+                item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
+            )
+            if self_image.attrs["Config"]
+            else {}
         )
         mirrored_envs: dict[str, str] = {
             # Filter out gpustack-internal envs and same-as-image envs.
```
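Both deployer changes guard against images whose `Config` attribute is empty. A minimal sketch of the failure mode the guard avoids; treating `Config` as possibly `None` is an assumption about what docker-py/podman-py can report for some images:

```python
# With config = None, the old expression config.get("Env", []) would raise
# AttributeError before dict() ever ran; the new conditional short-circuits.
config = None  # hypothetical attrs["Config"] value

image_envs = (
    dict(item.split("=", 1) for item in config.get("Env", []))
    if config
    else {}
)
assert image_envs == {}
```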
gpustack_runtime/detector/__utils__.py
CHANGED

```diff
@@ -951,3 +951,26 @@ def bitmask_to_str(bitmask_list: list) -> str:
         offset += get_bits_size()
 
     return list_to_range_str(sorted(bits_lists))
+
+
+def get_physical_function_by_bdf(bdf: str) -> str:
+    """
+    Get the physical function BDF for a given PCI device BDF address.
+
+    Args:
+        bdf:
+            The PCI device BDF address (e.g., "0000:00:1f.0").
+
+    Returns:
+        The physical function BDF if found, otherwise returns the original BDF.
+
+    """
+    if bdf:
+        with contextlib.suppress(Exception):
+            dev_path = Path(f"/sys/bus/pci/devices/{bdf}")
+            if dev_path.exists():
+                physfn_path = dev_path / "physfn"
+                if physfn_path.exists():
+                    physfn_realpath = physfn_path.resolve()
+                    return physfn_realpath.name
+    return bdf
```
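The helper leans on the sysfs layout for SR-IOV: a virtual function's device node carries a `physfn` symlink that resolves to its physical function, so a device whose resolved parent BDF differs from its own BDF is a VF. The detectors below use exactly that comparison to set the `vgpu` flag. A standalone sketch of the same check (the sample BDF is made up):

```python
import contextlib
from pathlib import Path


def is_sriov_virtual_function(bdf: str) -> bool:
    """Return True if the PCI device at `bdf` is an SR-IOV VF.

    Mirrors the physfn lookup added in __utils__.py: a VF exposes
    /sys/bus/pci/devices/<bdf>/physfn pointing at its parent PF.
    """
    with contextlib.suppress(OSError):
        physfn = Path(f"/sys/bus/pci/devices/{bdf}/physfn")
        if physfn.exists():
            return physfn.resolve().name != bdf
    return False


# Hypothetical BDF for illustration only.
print(is_sriov_virtual_function("0000:3b:00.1"))
```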
gpustack_runtime/detector/amd.py
CHANGED

```diff
@@ -16,6 +16,7 @@ from .__utils__ import (
     get_brief_version,
     get_numa_node_by_bdf,
     get_pci_devices,
+    get_physical_function_by_bdf,
     get_utilization,
     map_numa_node_to_cpu_affinity,
 )
@@ -107,7 +108,11 @@ class AMDDetector(Detector):
                 asic_serial = dev_gpu_asic_info.get("asic_serial")
                 dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
             else:
-                dev_uuid =
+                dev_uuid = ""
+                with contextlib.suppress(pyrocmsmi.ROCMSMIError):
+                    dev_uuid = (
+                        f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
+                    )
             dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())
 
             dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
@@ -119,8 +124,13 @@ class AMDDetector(Detector):
 
             dev_cc = dev_hsa_agent.compute_capability
             if not dev_cc:
-
-                dev_cc =
+                if "target_graphics_version" in dev_gpu_asic_info:
+                    dev_cc = dev_gpu_asic_info.get("target_graphics_version")
+                else:
+                    with contextlib.suppress(pyrocmsmi.ROCMSMIError):
+                        dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(
+                            dev_idx,
+                        )
 
             dev_bdf = None
             dev_card_id = None
@@ -195,15 +205,13 @@
             dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
             dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
 
-
-
-
-                dev,
-            )
+            dev_is_vgpu = False
+            if dev_bdf:
+                dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
 
             dev_appendix = {
                 "arch_family": _get_arch_family(dev_asic_family_id),
-                "vgpu":
+                "vgpu": dev_is_vgpu,
             }
             if dev_bdf:
                 dev_appendix["bdf"] = dev_bdf
```
gpustack_runtime/detector/hygon.py
CHANGED

```diff
@@ -16,6 +16,7 @@ from .__utils__ import (
     get_brief_version,
     get_numa_node_by_bdf,
     get_pci_devices,
+    get_physical_function_by_bdf,
     get_utilization,
     map_numa_node_to_cpu_affinity,
 )
@@ -156,8 +157,12 @@ class HygonDetector(Detector):
             dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
             dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)
 
+            dev_is_vgpu = False
+            if dev_bdf:
+                dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+
             dev_appendix = {
-                "vgpu":
+                "vgpu": dev_is_vgpu,
             }
             if dev_bdf is not None:
                 dev_appendix["bdf"] = dev_bdf
```
gpustack_runtime/detector/iluvatar.py
CHANGED

```diff
@@ -23,6 +23,7 @@ from .__utils__ import (
     get_numa_node_by_bdf,
     get_numa_nodeset_size,
     get_pci_devices,
+    get_physical_function_by_bdf,
     get_utilization,
     map_numa_node_to_cpu_affinity,
     support_command,
@@ -165,13 +166,20 @@ class IluvatarDetector(Detector):
             if dev_cc_t:
                 dev_cc = ".".join(map(str, dev_cc_t))
 
+            dev_bdf = None
+            with contextlib.suppress(pyixml.NVMLError):
+                dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
+                dev_bdf = str(dev_pci_info.busIdLegacy).lower()
+
             dev_is_vgpu = False
-
+            if dev_bdf:
+                dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
 
             dev_appendix = {
                 "vgpu": dev_is_vgpu,
-                "bdf": str(dev_pci_info.busIdLegacy).lower(),
             }
+            if dev_bdf:
+                dev_appendix["bdf"] = dev_bdf
 
             ret.append(
                 Device(
```
gpustack_runtime/detector/mthreads.py
CHANGED

```diff
@@ -3,9 +3,10 @@ from __future__ import annotations
 import logging
 from functools import lru_cache
 
+import pymtml
+
 from .. import envs
 from ..logging import debug_log_exception, debug_log_warning
-from . import pymtml
 from .__types__ import (
     Detector,
     Device,
@@ -105,9 +106,8 @@ class MThreadsDetector(Detector):
 
        try:
            pymtml.mtmlLibraryInit()
-
-            sys_driver_ver = pymtml.mtmlSystemGetDriverVersion()
-
+            system = pymtml.mtmlLibraryInitSystem()
+            sys_driver_ver = pymtml.mtmlSystemGetDriverVersion(system)
            dev_count = pymtml.mtmlLibraryCountDevice()
            for dev_idx in range(dev_count):
                dev_index = dev_idx
@@ -139,25 +139,20 @@ class MThreadsDetector(Detector):
 
             dev_mem = 0
             dev_mem_used = 0
-
-            try:
+            with pymtml.mtmlMemoryContext(dev) as devmem:
                 dev_mem = byte_to_mebibyte(  # byte to MiB
                     pymtml.mtmlMemoryGetTotal(devmem),
                 )
                 dev_mem_used = byte_to_mebibyte(  # byte to MiB
                     pymtml.mtmlMemoryGetUsed(devmem),
                 )
-            finally:
-                pymtml.mtmlDeviceFreeMemory(devmem)
 
             dev_cores_util = None
             dev_temp = None
-
-            try:
+            with pymtml.mtmlGpuContext(dev) as devgpu:
                 dev_cores_util = pymtml.mtmlGpuGetUtilization(devgpu)
                 dev_temp = pymtml.mtmlGpuGetTemperature(devgpu)
-
-                pymtml.mtmlDeviceFreeGpu(devgpu)
+
             if dev_cores_util is None:
                 debug_log_warning(
                     logger,
@@ -198,6 +193,7 @@ class MThreadsDetector(Detector):
             debug_log_exception(logger, "Failed to process devices fetching")
             raise
         finally:
+            pymtml.mtmlLibraryFreeSystem(system)
             pymtml.mtmlLibraryShutDown()
 
         return ret
```
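With the vendored `gpustack_runtime/detector/pymtml` module deleted (the +0 -770 entry in the file list above) in favor of the external `mthreads-ml-py` dependency, the detector now owns an explicit system handle and uses scoped context managers for per-device queries. A sketch of the lifecycle under those assumptions; it uses only calls visible in this diff, except the device accessor, which is hypothetical because the diff does not show how `dev` is obtained:

```python
# Sketch of the MTML init/teardown flow the detector now follows.
import pymtml  # provided by the new mthreads-ml-py dependency

pymtml.mtmlLibraryInit()
system = pymtml.mtmlLibraryInitSystem()
try:
    print("driver:", pymtml.mtmlSystemGetDriverVersion(system))
    for idx in range(pymtml.mtmlLibraryCountDevice()):
        dev = pymtml.mtmlLibraryInitDeviceByIndex(idx)  # hypothetical accessor
        # Context managers replace the old try/finally + mtmlDeviceFree* pairs.
        with pymtml.mtmlMemoryContext(dev) as mem:
            print("total bytes:", pymtml.mtmlMemoryGetTotal(mem))
        with pymtml.mtmlGpuContext(dev) as gpu:
            print("util %:", pymtml.mtmlGpuGetUtilization(gpu))
finally:
    pymtml.mtmlLibraryFreeSystem(system)
    pymtml.mtmlLibraryShutDown()
```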
gpustack_runtime/detector/nvidia.py
CHANGED

```diff
@@ -2,6 +2,8 @@ from __future__ import annotations
 
 import contextlib
 import logging
+import math
+import time
 from _ctypes import byref
 from functools import lru_cache
 from math import ceil
@@ -76,7 +78,7 @@
     def __init__(self):
        super().__init__(ManufacturerEnum.NVIDIA)
 
-    def detect(self) -> Devices | None:
+    def detect(self) -> Devices | None:  # noqa: PLR0915
        """
        Detect NVIDIA GPUs using pynvml.
 
@@ -125,103 +127,110 @@
         for dev_idx in range(dev_count):
             dev = pynvml.nvmlDeviceGetHandleByIndex(dev_idx)
 
-            dev_index = dev_idx
-            if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
-                if dev_files is None:
-                    dev_files = get_device_files(pattern=r"nvidia(?P<number>\d+)")
-                if len(dev_files) >= dev_count:
-                    dev_file = dev_files[dev_idx]
-                    if dev_file.number is not None:
-                        dev_index = dev_file.number
-            dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
-
-            dev_cores = None
-            if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
-                with contextlib.suppress(pycuda.CUDAError):
-                    dev_gpudev = pycuda.cuDeviceGet(dev_idx)
-                    dev_cores = pycuda.cuDeviceGetAttribute(
-                        dev_gpudev,
-                        pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-                    )
+            dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
+            dev_cc = ".".join(map(str, dev_cc_t))
 
-            dev_mem = 0
-            dev_mem_used = 0
+            dev_bdf = None
             with contextlib.suppress(pynvml.NVMLError):
-                dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
-                dev_mem = byte_to_mebibyte(  # byte to MiB
-                    dev_mem_info.total,
-                )
-                dev_mem_used = byte_to_mebibyte(  # byte to MiB
-                    dev_mem_info.used,
-                )
-            if dev_mem == 0:
-                dev_mem, dev_mem_used = get_memory()
+                dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
+                dev_bdf = str(dev_pci_info.busIdLegacy).lower()
 
-            dev_cores_util = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
-                dev_cores_util = dev_util_rates.gpu
-            if dev_cores_util is None:
-                debug_log_warning(
-                    logger,
-                    "Failed to get device %d cores utilization, setting to 0",
-                    dev_index,
-                )
-                dev_cores_util = 0
-
-            dev_temp = None
+            dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
             with contextlib.suppress(pynvml.NVMLError):
-                dev_temp = pynvml.nvmlDeviceGetTemperature(
-                    dev,
-                    pynvml.NVML_TEMPERATURE_GPU,
-                )
+                dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
 
-            dev_power = None
-            dev_power_used = None
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
-                dev_power = dev_power // 1000  # mW to W
-                dev_power_used = (
-                    pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
-                )  # mW to W
+            # With MIG disabled, treat as a single device.
+            if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
+                dev_index = dev_idx
+                if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+                    if dev_files is None:
+                        dev_files = get_device_files(
+                            pattern=r"nvidia(?P<number>\d+)",
+                        )
+                    if len(dev_files) >= dev_count:
+                        dev_file = dev_files[dev_idx]
+                        if dev_file.number is not None:
+                            dev_index = dev_file.number
 
-            dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
-            dev_cc = ".".join(map(str, dev_cc_t))
+                dev_name = pynvml.nvmlDeviceGetName(dev)
 
-
-
-
-            if
-
-
+                dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
+
+                dev_cores = None
+                if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
+                    with contextlib.suppress(pycuda.CUDAError):
+                        dev_gpudev = pycuda.cuDeviceGet(dev_idx)
+                        dev_cores = pycuda.cuDeviceGetAttribute(
+                            dev_gpudev,
+                            pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                        )
+
+                dev_cores_util = _get_sm_util_from_gpm_metrics(dev)
+                if dev_cores_util is None:
+                    with contextlib.suppress(pynvml.NVMLError):
+                        dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
+                        dev_cores_util = dev_util_rates.gpu
+                if dev_cores_util is None:
+                    debug_log_warning(
+                        logger,
+                        "Failed to get device %d cores utilization, setting to 0",
+                        dev_index,
+                    )
+                    dev_cores_util = 0
 
-
-
-
-
-
+                dev_mem = 0
+                dev_mem_used = 0
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
+                    dev_mem = byte_to_mebibyte(  # byte to MiB
+                        dev_mem_info.total,
+                    )
+                    dev_mem_used = byte_to_mebibyte(  # byte to MiB
+                        dev_mem_info.used,
+                    )
+                if dev_mem == 0:
+                    dev_mem, dev_mem_used = get_memory()
 
-            with contextlib.suppress(pynvml.NVMLError):
-                dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
-                r = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
-                if r != pynvml.NVML_SUCCESS:
-                    dev_fabric = None
-                if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
-                    dev_fabric = None
-                if dev_fabric:
-                    dev_appendix["fabric_cluster_uuid"] = stringify_uuid(
-                        bytes(dev_fabric.clusterUuid),
+                dev_temp = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_temp = pynvml.nvmlDeviceGetTemperature(
+                        dev,
+                        pynvml.NVML_TEMPERATURE_GPU,
                     )
-                    dev_appendix["fabric_clique_id"] = dev_fabric.cliqueId
 
-
-
-
+                dev_power = None
+                dev_power_used = None
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
+                    dev_power = dev_power // 1000  # mW to W
+                    dev_power_used = (
+                        pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
+                    )  # mW to W
+
+                dev_is_vgpu = False
+                if dev_bdf and dev_bdf in pci_devs:
+                    dev_is_vgpu = _is_vgpu(pci_devs[dev_bdf].config)
+
+                dev_appendix = {
+                    "arch_family": _get_arch_family(dev_cc_t),
+                    "vgpu": dev_is_vgpu,
+                }
+                if dev_bdf:
+                    dev_appendix["bdf"] = dev_bdf
 
-
+                with contextlib.suppress(pynvml.NVMLError):
+                    dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
+                    r = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
+                    if r != pynvml.NVML_SUCCESS:
+                        dev_fabric = None
+                    if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
+                        dev_fabric = None
+                    if dev_fabric:
+                        dev_appendix["fabric_cluster_uuid"] = stringify_uuid(
+                            bytes(dev_fabric.clusterUuid),
+                        )
+                        dev_appendix["fabric_clique_id"] = dev_fabric.cliqueId
 
-            if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
-                dev_name = pynvml.nvmlDeviceGetName(dev)
                 ret.append(
                     Device(
                         manufacturer=self.manufacturer,
@@ -283,13 +292,20 @@
                         pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
                     )  # mW to W
 
-                    mdev_appendix =
+                    mdev_appendix = {
+                        "arch_family": _get_arch_family(dev_cc_t),
+                        "vgpu": True,
+                    }
+                    if dev_bdf:
+                        mdev_appendix["bdf"] = dev_bdf
 
                     mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
                     mdev_appendix["gpu_instance_id"] = mdev_gi_id
                     mdev_ci_id = pynvml.nvmlDeviceGetComputeInstanceId(mdev)
                     mdev_appendix["compute_instance_id"] = mdev_ci_id
 
+                    mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
+
                     if not mdev_name:
                         mdev_attrs = pynvml.nvmlDeviceGetAttributes(mdev)
 
@@ -374,6 +390,7 @@
                             runtime_version_original=sys_runtime_ver_original,
                             compute_capability=dev_cc,
                             cores=mdev_cores,
+                            cores_utilization=mdev_cores_util,
                             memory=mdev_mem,
                             memory_used=mdev_mem_used,
                             memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
@@ -492,6 +509,97 @@
         return ret
 
 
+def _get_gpm_metrics(
+    metrics: list[int],
+    dev: pynvml.c_nvmlDevice_t,
+    gpu_instance_id: int | None = None,
+    interval: float = 0.1,
+) -> list[pynvml.c_nvmlGpmMetric_t] | None:
+    """
+    Get GPM metrics for a device or a MIG GPU instance.
+
+    Args:
+        metrics:
+            A list of GPM metric IDs to query.
+        dev:
+            The NVML device handle.
+        gpu_instance_id:
+            The GPU instance ID for MIG devices.
+        interval:
+            Interval in seconds between two samples.
+
+    Returns:
+        A list of GPM metric structures, or None if failed.
+
+    """
+    try:
+        dev_gpm_support = pynvml.nvmlGpmQueryDeviceSupport(dev)
+        if not bool(dev_gpm_support.isSupportedDevice):
+            return None
+    except pynvml.NVMLError:
+        debug_log_warning(logger, "Unsupported GPM query")
+        return None
+
+    dev_gpm_metrics = pynvml.c_nvmlGpmMetricsGet_t()
+    try:
+        dev_gpm_metrics.sample1 = pynvml.nvmlGpmSampleAlloc()
+        dev_gpm_metrics.sample2 = pynvml.nvmlGpmSampleAlloc()
+        if gpu_instance_id is None:
+            pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample1)
+            time.sleep(interval)
+            pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample2)
+        else:
+            pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample1)
+            time.sleep(interval)
+            pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample2)
+        dev_gpm_metrics.version = pynvml.NVML_GPM_METRICS_GET_VERSION
+        dev_gpm_metrics.numMetrics = len(metrics)
+        for metric_idx, metric in enumerate(metrics):
+            dev_gpm_metrics.metrics[metric_idx].metricId = metric
+        pynvml.nvmlGpmMetricsGet(dev_gpm_metrics)
+    except pynvml.NVMLError:
+        debug_log_exception(logger, "Failed to get GPM metrics")
+        return None
+    finally:
+        if dev_gpm_metrics.sample1:
+            pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample1)
+        if dev_gpm_metrics.sample2:
+            pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample2)
+    return list(dev_gpm_metrics.metrics)
+
+
+def _get_sm_util_from_gpm_metrics(
+    dev: pynvml.c_nvmlDevice_t,
+    gpu_instance_id: int | None = None,
+    interval: float = 0.1,
+) -> int | None:
+    """
+    Get SM utilization from GPM metrics.
+
+    Args:
+        dev:
+            The NVML device handle.
+        gpu_instance_id:
+            The GPU instance ID for MIG devices.
+        interval:
+            Interval in seconds between two samples.
+
+    Returns:
+        The SM utilization as an integer percentage, or None if failed.
+
+    """
+    dev_gpm_metrics = _get_gpm_metrics(
+        metrics=[pynvml.NVML_GPM_METRIC_SM_UTIL],
+        dev=dev,
+        gpu_instance_id=gpu_instance_id,
+        interval=interval,
+    )
+    if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
+        return int(dev_gpm_metrics[0].value)
+
+    return None
+
+
 def _get_arch_family(dev_cc_t: list[int]) -> str:
     """
     Get the architecture family based on the CUDA compute capability.
```
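GPM (GPU Performance Monitoring) derives rates from two snapshots, which is why the new helper allocates a sample pair, waits `interval` seconds between `nvmlGpmSampleGet` calls, and lets `nvmlGpmMetricsGet` compute the SM utilization. A standalone sketch of the same pattern; it uses only NVML calls that appear in the diff plus standard init/shutdown, and index 0 is just an example device:

```python
import math
import time

import pynvml

pynvml.nvmlInit()
try:
    dev = pynvml.nvmlDeviceGetHandleByIndex(0)
    # GPM is only available on supported (Hopper-era and newer) devices.
    if bool(pynvml.nvmlGpmQueryDeviceSupport(dev).isSupportedDevice):
        m = pynvml.c_nvmlGpmMetricsGet_t()
        m.sample1 = pynvml.nvmlGpmSampleAlloc()
        m.sample2 = pynvml.nvmlGpmSampleAlloc()
        try:
            # Two samples taken 100 ms apart; the metric is the rate between them.
            pynvml.nvmlGpmSampleGet(dev, m.sample1)
            time.sleep(0.1)
            pynvml.nvmlGpmSampleGet(dev, m.sample2)
            m.version = pynvml.NVML_GPM_METRICS_GET_VERSION
            m.numMetrics = 1
            m.metrics[0].metricId = pynvml.NVML_GPM_METRIC_SM_UTIL
            pynvml.nvmlGpmMetricsGet(m)
            value = m.metrics[0].value
            if not math.isnan(value):
                print(f"SM utilization: {int(value)}%")
        finally:
            pynvml.nvmlGpmSampleFree(m.sample1)
            pynvml.nvmlGpmSampleFree(m.sample2)
finally:
    pynvml.nvmlShutdown()
```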
gpustack_runtime/detector/pyrocmsmi/__init__.py
CHANGED

```diff
@@ -223,15 +223,9 @@ def rsmi_dev_target_graphics_version_get(device=0):
         c_version = c_uint64()
         ret = rocmsmiLib.rsmi_dev_target_graphics_version_get(device, byref(c_version))
         _rocmsmiCheckReturn(ret)
-
-
-
-        if "Instinct MI2" in dev_name:
-            hex_part = str(hex(int(version[2:]))).replace("0x", "")
-            version = version[:2] + hex_part
-        else:
-            version = str(c_version.value // 10 + c_version.value % 10)
-        return "gfx" + version
+        if c_version.value < 2000:
+            return "gfx" + str(c_version.value)
+        return "gfx" + hex(c_version.value)[2:]
     except AttributeError:
         return None
 
```
gpustack_runtime/envs.py
CHANGED

```diff
@@ -476,7 +476,7 @@ variables: dict[str, Callable[[], Any]] = {
         "hygon.com/devices=HIP_VISIBLE_DEVICES;"
         "iluvatar.ai/devices=CUDA_VISIBLE_DEVICES;"
         "metax-tech.com/devices=CUDA_VISIBLE_DEVICES;"
-        "mthreads.com/devices=CUDA_VISIBLE_DEVICES;"
+        "mthreads.com/devices=CUDA_VISIBLE_DEVICES,MUSA_VISIBLE_DEVICES;"
         "nvidia.com/devices=CUDA_VISIBLE_DEVICES;",
     ),
     list_sep=",",
```
{gpustack_runtime-0.1.39.post2.dist-info → gpustack_runtime-0.1.39.post3.dist-info}/METADATA
RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpustack-runtime
-Version: 0.1.39.post2
+Version: 0.1.39.post3
 Summary: GPUStack Runtime is library for detecting GPU resources and launching GPU workloads.
 Project-URL: Homepage, https://github.com/gpustack/runtime
 Project-URL: Bug Tracker, https://github.com/gpustack/gpustack/issues
@@ -15,8 +15,9 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: >=3.10
 Requires-Dist: argcomplete>=3.6.3
 Requires-Dist: docker>=7.1.0
-Requires-Dist: gpustack-runner>=0.1.23.
+Requires-Dist: gpustack-runner>=0.1.23.post5
 Requires-Dist: kubernetes>=33.1.0
+Requires-Dist: mthreads-ml-py>=2.2.10
 Requires-Dist: nvidia-ml-py>=13.580.65
 Requires-Dist: podman==5.6.0
 Requires-Dist: pyyaml
```