gpustack-runtime 0.1.39.post2__py3-none-any.whl → 0.1.39.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,8 +27,8 @@ version_tuple: VERSION_TUPLE
  __commit_id__: COMMIT_ID
  commit_id: COMMIT_ID

- __version__ = version = '0.1.39.post2'
- __version_tuple__ = version_tuple = (0, 1, 39, 'post2')
+ __version__ = version = '0.1.39.post3'
+ __version_tuple__ = version_tuple = (0, 1, 39, 'post3')
  try:
  from ._version_appendix import git_commit
  __commit_id__ = commit_id = git_commit
@@ -1 +1 @@
- git_commit = "e044bab"
+ git_commit = "d65920e"
@@ -1213,8 +1213,12 @@ class DockerDeployer(EndoscopicDeployer):
  self_container_envs: dict[str, str] = dict(
  item.split("=", 1) for item in self_container.attrs["Config"].get("Env", [])
  )
- self_image_envs: dict[str, str] = dict(
- item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
+ self_image_envs: dict[str, str] = (
+ dict(
+ item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
+ )
+ if self_image.attrs["Config"]
+ else {}
  )
  mirrored_envs: dict[str, str] = {
  # Filter out gpustack-internal envs and same-as-image envs.
@@ -1189,8 +1189,12 @@ class PodmanDeployer(EndoscopicDeployer):
  self_container_envs: dict[str, str] = dict(
  item.split("=", 1) for item in self_container.attrs["Config"].get("Env", [])
  )
- self_image_envs: dict[str, str] = dict(
- item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
+ self_image_envs: dict[str, str] = (
+ dict(
+ item.split("=", 1) for item in self_image.attrs["Config"].get("Env", [])
+ )
+ if self_image.attrs["Config"]
+ else {}
  )
  mirrored_envs: dict[str, str] = {
  # Filter out gpustack-internal envs and same-as-image envs.
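Both the Docker and Podman deployer hunks above apply the same fix: the inspected image's "Config" section can be null, so building the env dict from it unconditionally raises, and the new conditional expression falls back to an empty dict. A minimal sketch of the guarded pattern; the `attrs` dict below is a hypothetical stand-in for the inspect payload, not the deployers' actual image objects:

# Sketch only: "Config" (and its "Env" list) may be missing or None in an inspect payload.
def image_envs(attrs: dict) -> dict[str, str]:
    config = attrs.get("Config")
    if not config:
        return {}
    return dict(item.split("=", 1) for item in config.get("Env", []) or [])

print(image_envs({"Config": None}))                     # {}
print(image_envs({"Config": {"Env": ["A=1", "B=2"]}}))  # {'A': '1', 'B': '2'}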
@@ -951,3 +951,26 @@ def bitmask_to_str(bitmask_list: list) -> str:
  offset += get_bits_size()

  return list_to_range_str(sorted(bits_lists))
+
+
+ def get_physical_function_by_bdf(bdf: str) -> str:
+ """
+ Get the physical function BDF for a given PCI device BDF address.
+
+ Args:
+ bdf:
+ The PCI device BDF address (e.g., "0000:00:1f.0").
+
+ Returns:
+ The physical function BDF if found, otherwise returns the original BDF.
+
+ """
+ if bdf:
+ with contextlib.suppress(Exception):
+ dev_path = Path(f"/sys/bus/pci/devices/{bdf}")
+ if dev_path.exists():
+ physfn_path = dev_path / "physfn"
+ if physfn_path.exists():
+ physfn_realpath = physfn_path.resolve()
+ return physfn_realpath.name
+ return bdf
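The new get_physical_function_by_bdf helper relies on the Linux sysfs convention that an SR-IOV virtual function exposes a "physfn" symlink pointing at its physical function; resolving that symlink yields the PF's BDF, and the detector hunks below mark a device as vGPU when that BDF differs from the device's own. A standalone sketch of the same check, assuming a Linux host with sysfs mounted (the helper name below is illustrative, not part of the package):

from pathlib import Path

def is_virtual_function(bdf: str) -> bool:
    # SR-IOV virtual functions carry a "physfn" symlink that points at their physical function.
    physfn = Path(f"/sys/bus/pci/devices/{bdf}/physfn")
    return physfn.exists() and physfn.resolve().name != bdf

The detectors below apply the same idea via get_physical_function_by_bdf(dev_bdf) != dev_bdf.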
@@ -16,6 +16,7 @@ from .__utils__ import (
  get_brief_version,
  get_numa_node_by_bdf,
  get_pci_devices,
+ get_physical_function_by_bdf,
  get_utilization,
  map_numa_node_to_cpu_affinity,
  )
@@ -107,7 +108,11 @@ class AMDDetector(Detector):
  asic_serial = dev_gpu_asic_info.get("asic_serial")
  dev_uuid = f"GPU-{(asic_serial[2:]).lower()}"
  else:
- dev_uuid = f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
+ dev_uuid = ""
+ with contextlib.suppress(pyrocmsmi.ROCMSMIError):
+ dev_uuid = (
+ f"GPU-{pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)[2:]}"
+ )
  dev_hsa_agent = hsa_agents.get(dev_uuid, pyhsa.Agent())

  dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
@@ -119,8 +124,13 @@ class AMDDetector(Detector):

  dev_cc = dev_hsa_agent.compute_capability
  if not dev_cc:
- with contextlib.suppress(pyrocmsmi.ROCMSMIError):
- dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
+ if "target_graphics_version" in dev_gpu_asic_info:
+ dev_cc = dev_gpu_asic_info.get("target_graphics_version")
+ else:
+ with contextlib.suppress(pyrocmsmi.ROCMSMIError):
+ dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(
+ dev_idx,
+ )

  dev_bdf = None
  dev_card_id = None
@@ -195,15 +205,13 @@ class AMDDetector(Detector):
  dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
  dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)

- dev_compute_partition = None
- with contextlib.suppress(pyamdsmi.AmdSmiException):
- dev_compute_partition = pyamdsmi.amdsmi_get_gpu_compute_partition(
- dev,
- )
+ dev_is_vgpu = False
+ if dev_bdf:
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf

  dev_appendix = {
  "arch_family": _get_arch_family(dev_asic_family_id),
- "vgpu": dev_compute_partition is not None,
+ "vgpu": dev_is_vgpu,
  }
  if dev_bdf:
  dev_appendix["bdf"] = dev_bdf
@@ -16,6 +16,7 @@ from .__utils__ import (
  get_brief_version,
  get_numa_node_by_bdf,
  get_pci_devices,
+ get_physical_function_by_bdf,
  get_utilization,
  map_numa_node_to_cpu_affinity,
  )
@@ -156,8 +157,12 @@ class HygonDetector(Detector):
  dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
  dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx)

+ dev_is_vgpu = False
+ if dev_bdf:
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf
+
  dev_appendix = {
- "vgpu": False,
+ "vgpu": dev_is_vgpu,
  }
  if dev_bdf is not None:
  dev_appendix["bdf"] = dev_bdf
@@ -23,6 +23,7 @@ from .__utils__ import (
  get_numa_node_by_bdf,
  get_numa_nodeset_size,
  get_pci_devices,
+ get_physical_function_by_bdf,
  get_utilization,
  map_numa_node_to_cpu_affinity,
  support_command,
@@ -165,13 +166,20 @@ class IluvatarDetector(Detector):
  if dev_cc_t:
  dev_cc = ".".join(map(str, dev_cc_t))

+ dev_bdf = None
+ with contextlib.suppress(pyixml.NVMLError):
+ dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
+ dev_bdf = str(dev_pci_info.busIdLegacy).lower()
+
  dev_is_vgpu = False
- dev_pci_info = pyixml.nvmlDeviceGetPciInfo(dev)
+ if dev_bdf:
+ dev_is_vgpu = get_physical_function_by_bdf(dev_bdf) != dev_bdf

  dev_appendix = {
  "vgpu": dev_is_vgpu,
- "bdf": str(dev_pci_info.busIdLegacy).lower(),
  }
+ if dev_bdf:
+ dev_appendix["bdf"] = dev_bdf

  ret.append(
  Device(
@@ -3,9 +3,10 @@ from __future__ import annotations
  import logging
  from functools import lru_cache

+ import pymtml
+
  from .. import envs
  from ..logging import debug_log_exception, debug_log_warning
- from . import pymtml
  from .__types__ import (
  Detector,
  Device,
@@ -105,9 +106,8 @@ class MThreadsDetector(Detector):

  try:
  pymtml.mtmlLibraryInit()
-
- sys_driver_ver = pymtml.mtmlSystemGetDriverVersion()
-
+ system = pymtml.mtmlLibraryInitSystem()
+ sys_driver_ver = pymtml.mtmlSystemGetDriverVersion(system)
  dev_count = pymtml.mtmlLibraryCountDevice()
  for dev_idx in range(dev_count):
  dev_index = dev_idx
@@ -139,25 +139,20 @@ class MThreadsDetector(Detector):

  dev_mem = 0
  dev_mem_used = 0
- devmem = pymtml.mtmlDeviceInitMemory(dev)
- try:
+ with pymtml.mtmlMemoryContext(dev) as devmem:
  dev_mem = byte_to_mebibyte( # byte to MiB
  pymtml.mtmlMemoryGetTotal(devmem),
  )
  dev_mem_used = byte_to_mebibyte( # byte to MiB
  pymtml.mtmlMemoryGetUsed(devmem),
  )
- finally:
- pymtml.mtmlDeviceFreeMemory(devmem)

  dev_cores_util = None
  dev_temp = None
- devgpu = pymtml.mtmlDeviceInitGpu(dev)
- try:
+ with pymtml.mtmlGpuContext(dev) as devgpu:
  dev_cores_util = pymtml.mtmlGpuGetUtilization(devgpu)
  dev_temp = pymtml.mtmlGpuGetTemperature(devgpu)
- finally:
- pymtml.mtmlDeviceFreeGpu(devgpu)
+
  if dev_cores_util is None:
  debug_log_warning(
  logger,
@@ -198,6 +193,7 @@ class MThreadsDetector(Detector):
  debug_log_exception(logger, "Failed to process devices fetching")
  raise
  finally:
+ pymtml.mtmlLibraryFreeSystem(system)
  pymtml.mtmlLibraryShutDown()

  return ret
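The MThreads hunks above switch pymtml from a vendored module to the external mthreads-ml-py package (added as a dependency in METADATA below), pass an explicit system handle to mtmlSystemGetDriverVersion, and manage memory/GPU sub-handles with the mtmlMemoryContext/mtmlGpuContext context managers instead of manual init/free pairs. A minimal sketch limited to the calls visible in this diff; the device-handle acquisition used inside the detector loop is not shown here:

import pymtml

# Sketch only: initialize the library, query driver version and device count, then clean up.
pymtml.mtmlLibraryInit()
system = None
try:
    system = pymtml.mtmlLibraryInitSystem()                     # system handle is now explicit
    print("driver:", pymtml.mtmlSystemGetDriverVersion(system))
    print("devices:", pymtml.mtmlLibraryCountDevice())
finally:
    if system is not None:
        pymtml.mtmlLibraryFreeSystem(system)
    pymtml.mtmlLibraryShutDown()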
@@ -2,6 +2,8 @@ from __future__ import annotations

  import contextlib
  import logging
+ import math
+ import time
  from _ctypes import byref
  from functools import lru_cache
  from math import ceil
@@ -76,7 +78,7 @@ class NVIDIADetector(Detector):
  def __init__(self):
  super().__init__(ManufacturerEnum.NVIDIA)

- def detect(self) -> Devices | None:
+ def detect(self) -> Devices | None: # noqa: PLR0915
  """
  Detect NVIDIA GPUs using pynvml.

@@ -125,103 +127,110 @@ class NVIDIADetector(Detector):
  for dev_idx in range(dev_count):
  dev = pynvml.nvmlDeviceGetHandleByIndex(dev_idx)

- dev_index = dev_idx
- if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
- if dev_files is None:
- dev_files = get_device_files(pattern=r"nvidia(?P<number>\d+)")
- if len(dev_files) >= dev_count:
- dev_file = dev_files[dev_idx]
- if dev_file.number is not None:
- dev_index = dev_file.number
- dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
-
- dev_cores = None
- if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
- with contextlib.suppress(pycuda.CUDAError):
- dev_gpudev = pycuda.cuDeviceGet(dev_idx)
- dev_cores = pycuda.cuDeviceGetAttribute(
- dev_gpudev,
- pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
- )
+ dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
+ dev_cc = ".".join(map(str, dev_cc_t))

- dev_mem = 0
- dev_mem_used = 0
+ dev_bdf = None
  with contextlib.suppress(pynvml.NVMLError):
- dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
- dev_mem = byte_to_mebibyte( # byte to MiB
- dev_mem_info.total,
- )
- dev_mem_used = byte_to_mebibyte( # byte to MiB
- dev_mem_info.used,
- )
- if dev_mem == 0:
- dev_mem, dev_mem_used = get_memory()
+ dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
+ dev_bdf = str(dev_pci_info.busIdLegacy).lower()

- dev_cores_util = None
- with contextlib.suppress(pynvml.NVMLError):
- dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
- dev_cores_util = dev_util_rates.gpu
- if dev_cores_util is None:
- debug_log_warning(
- logger,
- "Failed to get device %d cores utilization, setting to 0",
- dev_index,
- )
- dev_cores_util = 0
-
- dev_temp = None
+ dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
  with contextlib.suppress(pynvml.NVMLError):
- dev_temp = pynvml.nvmlDeviceGetTemperature(
- dev,
- pynvml.NVML_TEMPERATURE_GPU,
- )
+ dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)

- dev_power = None
- dev_power_used = None
- with contextlib.suppress(pynvml.NVMLError):
- dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
- dev_power = dev_power // 1000 # mW to W
- dev_power_used = (
- pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
- ) # mW to W
+ # With MIG disabled, treat as a single device.
+ if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
+ dev_index = dev_idx
+ if envs.GPUSTACK_RUNTIME_DETECT_PHYSICAL_INDEX_PRIORITY:
+ if dev_files is None:
+ dev_files = get_device_files(
+ pattern=r"nvidia(?P<number>\d+)",
+ )
+ if len(dev_files) >= dev_count:
+ dev_file = dev_files[dev_idx]
+ if dev_file.number is not None:
+ dev_index = dev_file.number

- dev_cc_t = pynvml.nvmlDeviceGetCudaComputeCapability(dev)
- dev_cc = ".".join(map(str, dev_cc_t))
+ dev_name = pynvml.nvmlDeviceGetName(dev)

- dev_is_vgpu = False
- dev_pci_info = pynvml.nvmlDeviceGetPciInfo(dev)
- for addr in [dev_pci_info.busIdLegacy, dev_pci_info.busId]:
- if addr in pci_devs:
- dev_is_vgpu = _is_vgpu(pci_devs[addr].config)
- break
+ dev_uuid = pynvml.nvmlDeviceGetUUID(dev)
+
+ dev_cores = None
+ if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
+ with contextlib.suppress(pycuda.CUDAError):
+ dev_gpudev = pycuda.cuDeviceGet(dev_idx)
+ dev_cores = pycuda.cuDeviceGetAttribute(
+ dev_gpudev,
+ pycuda.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+ )
+
+ dev_cores_util = _get_sm_util_from_gpm_metrics(dev)
+ if dev_cores_util is None:
+ with contextlib.suppress(pynvml.NVMLError):
+ dev_util_rates = pynvml.nvmlDeviceGetUtilizationRates(dev)
+ dev_cores_util = dev_util_rates.gpu
+ if dev_cores_util is None:
+ debug_log_warning(
+ logger,
+ "Failed to get device %d cores utilization, setting to 0",
+ dev_index,
+ )
+ dev_cores_util = 0

- dev_appendix = {
- "arch_family": _get_arch_family(dev_cc_t),
- "vgpu": dev_is_vgpu,
- "bdf": str(dev_pci_info.busIdLegacy).lower(),
- }
+ dev_mem = 0
+ dev_mem_used = 0
+ with contextlib.suppress(pynvml.NVMLError):
+ dev_mem_info = pynvml.nvmlDeviceGetMemoryInfo(dev)
+ dev_mem = byte_to_mebibyte( # byte to MiB
+ dev_mem_info.total,
+ )
+ dev_mem_used = byte_to_mebibyte( # byte to MiB
+ dev_mem_info.used,
+ )
+ if dev_mem == 0:
+ dev_mem, dev_mem_used = get_memory()

- with contextlib.suppress(pynvml.NVMLError):
- dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
- r = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
- if r != pynvml.NVML_SUCCESS:
- dev_fabric = None
- if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
- dev_fabric = None
- if dev_fabric:
- dev_appendix["fabric_cluster_uuid"] = stringify_uuid(
- bytes(dev_fabric.clusterUuid),
+ dev_temp = None
+ with contextlib.suppress(pynvml.NVMLError):
+ dev_temp = pynvml.nvmlDeviceGetTemperature(
+ dev,
+ pynvml.NVML_TEMPERATURE_GPU,
  )
- dev_appendix["fabric_clique_id"] = dev_fabric.cliqueId

- dev_mig_mode = pynvml.NVML_DEVICE_MIG_DISABLE
- with contextlib.suppress(pynvml.NVMLError):
- dev_mig_mode, _ = pynvml.nvmlDeviceGetMigMode(dev)
+ dev_power = None
+ dev_power_used = None
+ with contextlib.suppress(pynvml.NVMLError):
+ dev_power = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(dev)
+ dev_power = dev_power // 1000 # mW to W
+ dev_power_used = (
+ pynvml.nvmlDeviceGetPowerUsage(dev) // 1000
+ ) # mW to W
+
+ dev_is_vgpu = False
+ if dev_bdf and dev_bdf in pci_devs:
+ dev_is_vgpu = _is_vgpu(pci_devs[dev_bdf].config)
+
+ dev_appendix = {
+ "arch_family": _get_arch_family(dev_cc_t),
+ "vgpu": dev_is_vgpu,
+ }
+ if dev_bdf:
+ dev_appendix["bdf"] = dev_bdf

- # If MIG is not enabled, return the GPU itself.
+ with contextlib.suppress(pynvml.NVMLError):
+ dev_fabric = pynvml.c_nvmlGpuFabricInfoV_t()
+ r = pynvml.nvmlDeviceGetGpuFabricInfoV(dev, byref(dev_fabric))
+ if r != pynvml.NVML_SUCCESS:
+ dev_fabric = None
+ if dev_fabric.state != pynvml.NVML_GPU_FABRIC_STATE_COMPLETED:
+ dev_fabric = None
+ if dev_fabric:
+ dev_appendix["fabric_cluster_uuid"] = stringify_uuid(
+ bytes(dev_fabric.clusterUuid),
+ )
+ dev_appendix["fabric_clique_id"] = dev_fabric.cliqueId

- if dev_mig_mode == pynvml.NVML_DEVICE_MIG_DISABLE:
- dev_name = pynvml.nvmlDeviceGetName(dev)
  ret.append(
  Device(
  manufacturer=self.manufacturer,
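The restructured loop above now reads the MIG mode right after grabbing the device handle and only runs the full per-GPU probing when MIG is disabled; an NVML error while reading the mode is treated as MIG disabled. A small sketch of that gate; the helper name is hypothetical and not part of the package:

import contextlib
import pynvml

def mig_enabled(handle) -> bool:
    # Default to "disabled" and keep it if NVML refuses the query (as the detector does).
    mode = pynvml.NVML_DEVICE_MIG_DISABLE
    with contextlib.suppress(pynvml.NVMLError):
        mode, _pending = pynvml.nvmlDeviceGetMigMode(handle)
    return mode == pynvml.NVML_DEVICE_MIG_ENABLE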
@@ -283,13 +292,20 @@ class NVIDIADetector(Detector):
  pynvml.nvmlDeviceGetPowerUsage(mdev) // 1000
  ) # mW to W

- mdev_appendix = dev_appendix.copy()
+ mdev_appendix = {
+ "arch_family": _get_arch_family(dev_cc_t),
+ "vgpu": True,
+ }
+ if dev_bdf:
+ mdev_appendix["bdf"] = dev_bdf

  mdev_gi_id = pynvml.nvmlDeviceGetGpuInstanceId(mdev)
  mdev_appendix["gpu_instance_id"] = mdev_gi_id
  mdev_ci_id = pynvml.nvmlDeviceGetComputeInstanceId(mdev)
  mdev_appendix["compute_instance_id"] = mdev_ci_id

+ mdev_cores_util = _get_sm_util_from_gpm_metrics(dev, mdev_gi_id)
+
  if not mdev_name:
  mdev_attrs = pynvml.nvmlDeviceGetAttributes(mdev)

@@ -374,6 +390,7 @@ class NVIDIADetector(Detector):
  runtime_version_original=sys_runtime_ver_original,
  compute_capability=dev_cc,
  cores=mdev_cores,
+ cores_utilization=mdev_cores_util,
  memory=mdev_mem,
  memory_used=mdev_mem_used,
  memory_utilization=get_utilization(mdev_mem_used, mdev_mem),
@@ -492,6 +509,97 @@ class NVIDIADetector(Detector):
  return ret


+ def _get_gpm_metrics(
+ metrics: list[int],
+ dev: pynvml.c_nvmlDevice_t,
+ gpu_instance_id: int | None = None,
+ interval: float = 0.1,
+ ) -> list[pynvml.c_nvmlGpmMetric_t] | None:
+ """
+ Get GPM metrics for a device or a MIG GPU instance.
+
+ Args:
+ metrics:
+ A list of GPM metric IDs to query.
+ dev:
+ The NVML device handle.
+ gpu_instance_id:
+ The GPU instance ID for MIG devices.
+ interval:
+ Interval in seconds between two samples.
+
+ Returns:
+ A list of GPM metric structures, or None if failed.
+
+ """
+ try:
+ dev_gpm_support = pynvml.nvmlGpmQueryDeviceSupport(dev)
+ if not bool(dev_gpm_support.isSupportedDevice):
+ return None
+ except pynvml.NVMLError:
+ debug_log_warning(logger, "Unsupported GPM query")
+ return None
+
+ dev_gpm_metrics = pynvml.c_nvmlGpmMetricsGet_t()
+ try:
+ dev_gpm_metrics.sample1 = pynvml.nvmlGpmSampleAlloc()
+ dev_gpm_metrics.sample2 = pynvml.nvmlGpmSampleAlloc()
+ if gpu_instance_id is None:
+ pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample1)
+ time.sleep(interval)
+ pynvml.nvmlGpmSampleGet(dev, dev_gpm_metrics.sample2)
+ else:
+ pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample1)
+ time.sleep(interval)
+ pynvml.nvmlGpmMigSampleGet(dev, gpu_instance_id, dev_gpm_metrics.sample2)
+ dev_gpm_metrics.version = pynvml.NVML_GPM_METRICS_GET_VERSION
+ dev_gpm_metrics.numMetrics = len(metrics)
+ for metric_idx, metric in enumerate(metrics):
+ dev_gpm_metrics.metrics[metric_idx].metricId = metric
+ pynvml.nvmlGpmMetricsGet(dev_gpm_metrics)
+ except pynvml.NVMLError:
+ debug_log_exception(logger, "Failed to get GPM metrics")
+ return None
+ finally:
+ if dev_gpm_metrics.sample1:
+ pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample1)
+ if dev_gpm_metrics.sample2:
+ pynvml.nvmlGpmSampleFree(dev_gpm_metrics.sample2)
+ return list(dev_gpm_metrics.metrics)
+
+
+ def _get_sm_util_from_gpm_metrics(
+ dev: pynvml.c_nvmlDevice_t,
+ gpu_instance_id: int | None = None,
+ interval: float = 0.1,
+ ) -> int | None:
+ """
+ Get SM utilization from GPM metrics.
+
+ Args:
+ dev:
+ The NVML device handle.
+ gpu_instance_id:
+ The GPU instance ID for MIG devices.
+ interval:
+ Interval in seconds between two samples.
+
+ Returns:
+ The SM utilization as an integer percentage, or None if failed.
+
+ """
+ dev_gpm_metrics = _get_gpm_metrics(
+ metrics=[pynvml.NVML_GPM_METRIC_SM_UTIL],
+ dev=dev,
+ gpu_instance_id=gpu_instance_id,
+ interval=interval,
+ )
+ if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
+ return int(dev_gpm_metrics[0].value)
+
+ return None
+
+
  def _get_arch_family(dev_cc_t: list[int]) -> str:
  """
  Get the architecture family based on the CUDA compute capability.
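The two helpers above sample NVML GPM (GPU Performance Monitoring) twice, interval seconds apart, and let NVML compute SM utilization over that window; the detector prefers this value and falls back to nvmlDeviceGetUtilizationRates when GPM is unsupported (GPM is only available on recent NVIDIA architectures). A hedged usage sketch, assuming _get_sm_util_from_gpm_metrics is imported from the detector module:

import pynvml

pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    # New GPM-based path; returns None on GPUs or drivers without GPM support.
    sm_util = _get_sm_util_from_gpm_metrics(handle, interval=0.1)
    if sm_util is None:
        # Same fallback the detector uses when GPM is unavailable.
        sm_util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
    print(f"SM utilization: {sm_util}%")
finally:
    pynvml.nvmlShutdown()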
@@ -223,15 +223,9 @@ def rsmi_dev_target_graphics_version_get(device=0):
  c_version = c_uint64()
  ret = rocmsmiLib.rsmi_dev_target_graphics_version_get(device, byref(c_version))
  _rocmsmiCheckReturn(ret)
- version = str(c_version.value)
- if len(version) == 4:
- dev_name = rsmi_dev_name_get(device)
- if "Instinct MI2" in dev_name:
- hex_part = str(hex(int(version[2:]))).replace("0x", "")
- version = version[:2] + hex_part
- else:
- version = str(c_version.value // 10 + c_version.value % 10)
- return "gfx" + version
+ if c_version.value < 2000:
+ return "gfx" + str(c_version.value)
+ return "gfx" + hex(c_version.value)[2:]
  except AttributeError:
  return None

gpustack_runtime/envs.py CHANGED
@@ -476,7 +476,7 @@ variables: dict[str, Callable[[], Any]] = {
  "hygon.com/devices=HIP_VISIBLE_DEVICES;"
  "iluvatar.ai/devices=CUDA_VISIBLE_DEVICES;"
  "metax-tech.com/devices=CUDA_VISIBLE_DEVICES;"
- "mthreads.com/devices=CUDA_VISIBLE_DEVICES;"
+ "mthreads.com/devices=CUDA_VISIBLE_DEVICES,MUSA_VISIBLE_DEVICES;"
  "nvidia.com/devices=CUDA_VISIBLE_DEVICES;",
  ),
  list_sep=",",
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: gpustack-runtime
- Version: 0.1.39.post2
+ Version: 0.1.39.post3
  Summary: GPUStack Runtime is library for detecting GPU resources and launching GPU workloads.
  Project-URL: Homepage, https://github.com/gpustack/runtime
  Project-URL: Bug Tracker, https://github.com/gpustack/gpustack/issues
@@ -15,8 +15,9 @@ Classifier: Programming Language :: Python :: 3.13
  Requires-Python: >=3.10
  Requires-Dist: argcomplete>=3.6.3
  Requires-Dist: docker>=7.1.0
- Requires-Dist: gpustack-runner>=0.1.23.post4
+ Requires-Dist: gpustack-runner>=0.1.23.post5
  Requires-Dist: kubernetes>=33.1.0
+ Requires-Dist: mthreads-ml-py>=2.2.10
  Requires-Dist: nvidia-ml-py>=13.580.65
  Requires-Dist: podman==5.6.0
  Requires-Dist: pyyaml