xpustat 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpustat/__init__.py +77 -0
- xpustat/_process.py +59 -0
- xpustat/amd.py +359 -0
- xpustat/cambricon.py +153 -0
- xpustat/cli.py +443 -0
- xpustat/gaudi.py +103 -0
- xpustat/huawei.py +333 -0
- xpustat/hygon.py +142 -0
- xpustat/intel_gpu.py +150 -0
- xpustat/moorethreads.py +120 -0
- xpustat/nvidia.py +336 -0
- xpustat/xpu.py +311 -0
- xpustat-0.1.0.dist-info/METADATA +143 -0
- xpustat-0.1.0.dist-info/RECORD +18 -0
- xpustat-0.1.0.dist-info/WHEEL +5 -0
- xpustat-0.1.0.dist-info/entry_points.txt +2 -0
- xpustat-0.1.0.dist-info/licenses/LICENSE +21 -0
- xpustat-0.1.0.dist-info/top_level.txt +1 -0
xpustat/__init__.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""
|
|
2
|
+
xpustat — unified accelerator stat tool.
|
|
3
|
+
|
|
4
|
+
Supports NVIDIA, AMD, Intel Gaudi, Intel Arc/GPU,
|
|
5
|
+
Huawei Ascend, Hygon DCU, Cambricon MLU, and Moore Threads.
|
|
6
|
+
|
|
7
|
+
Quick start::
|
|
8
|
+
|
|
9
|
+
import xpustat
|
|
10
|
+
|
|
11
|
+
# Query all vendors at once
|
|
12
|
+
stats = xpustat.query_all()
|
|
13
|
+
|
|
14
|
+
# Iterate every device across all vendors
|
|
15
|
+
for dev in stats:
|
|
16
|
+
print(dev.name, dev.mem_used_mb, "/", dev.mem_total_mb, "MB")
|
|
17
|
+
|
|
18
|
+
# Vendor-specific access
|
|
19
|
+
for gpu in stats.nvidia:
|
|
20
|
+
print(gpu.name, gpu.util_pct, "%", gpu.temp_c, "°C")
|
|
21
|
+
|
|
22
|
+
# Subscript by vendor key
|
|
23
|
+
for gpu in stats["amd"]:
|
|
24
|
+
print(gpu.power_w, "W")
|
|
25
|
+
|
|
26
|
+
# Query a single vendor
|
|
27
|
+
gpus = xpustat.query("nvidia")
|
|
28
|
+
|
|
29
|
+
# JSON string (for HTTP responses / files)
|
|
30
|
+
print(stats.to_json(indent=2))
|
|
31
|
+
|
|
32
|
+
# JSON with metadata wrapper
|
|
33
|
+
print(stats.to_json(metadata=True))
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
__version__ = "0.1.0"
|
|
37
|
+
__version_tuple__ = (0, 1, 0)
|
|
38
|
+
|
|
39
|
+
from ._process import ProcessInfo
|
|
40
|
+
from .xpu import AnyDeviceStat, XPUStatCollection, query, query_all
|
|
41
|
+
from .nvidia import NvidiaGPUStat, NvidiaGPUStatCollection
|
|
42
|
+
from .amd import AMDGPUStat, AMDGPUStatCollection
|
|
43
|
+
from .gaudi import GaudiAIPStat, GaudiAIPStatCollection
|
|
44
|
+
from .intel_gpu import IntelGPUStat, IntelGPUStatCollection
|
|
45
|
+
from .huawei import HuaweiNPUStat, HuaweiNPUStatCollection
|
|
46
|
+
from .hygon import HygonDCUStat, HygonDCUStatCollection
|
|
47
|
+
from .cambricon import CambriconMLUStat, CambriconMLUStatCollection
|
|
48
|
+
from .moorethreads import MooreThreadsGPUStat, MooreThreadsGPUStatCollection
|
|
49
|
+
|
|
50
|
+
__all__ = [
|
|
51
|
+
"__version__",
|
|
52
|
+
"__version_tuple__",
|
|
53
|
+
# Core API
|
|
54
|
+
"query_all",
|
|
55
|
+
"query",
|
|
56
|
+
"XPUStatCollection",
|
|
57
|
+
"AnyDeviceStat",
|
|
58
|
+
# Process info
|
|
59
|
+
"ProcessInfo",
|
|
60
|
+
# Per-vendor stat objects
|
|
61
|
+
"NvidiaGPUStat",
|
|
62
|
+
"NvidiaGPUStatCollection",
|
|
63
|
+
"AMDGPUStat",
|
|
64
|
+
"AMDGPUStatCollection",
|
|
65
|
+
"GaudiAIPStat",
|
|
66
|
+
"GaudiAIPStatCollection",
|
|
67
|
+
"IntelGPUStat",
|
|
68
|
+
"IntelGPUStatCollection",
|
|
69
|
+
"HuaweiNPUStat",
|
|
70
|
+
"HuaweiNPUStatCollection",
|
|
71
|
+
"HygonDCUStat",
|
|
72
|
+
"HygonDCUStatCollection",
|
|
73
|
+
"CambriconMLUStat",
|
|
74
|
+
"CambriconMLUStatCollection",
|
|
75
|
+
"MooreThreadsGPUStat",
|
|
76
|
+
"MooreThreadsGPUStatCollection",
|
|
77
|
+
]
|
xpustat/_process.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared process info helpers across all vendors.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
import psutil # type: ignore[import-untyped]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ProcessInfo:
    """Record describing one process that uses accelerator memory.

    The four fields are declared in ``__slots__``, so instances carry no
    per-instance ``__dict__``.  ``gpu_mem_mb`` may be ``None``; ``repr``
    renders that case as ``?``.
    """

    __slots__ = ("pid", "username", "command", "gpu_mem_mb")

    def __init__(
        self,
        pid: int,
        username: str,
        command: str,
        gpu_mem_mb: Optional[int],
    ) -> None:
        """Store the given values on the instance unchanged."""
        self.pid, self.username = pid, username
        self.command, self.gpu_mem_mb = command, gpu_mem_mb

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, keyed in ``__slots__`` order."""
        return {field: getattr(self, field) for field in ProcessInfo.__slots__}

    def __repr__(self) -> str:
        """Return ``user:command/pid(mem)`` with ``?`` for a missing memory value."""
        if self.gpu_mem_mb is None:
            mem_text = "?"
        else:
            mem_text = f"{self.gpu_mem_mb}M"
        return f"{self.username}:{self.command}/{self.pid}({mem_text})"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def enrich_process(pid: int, fallback_cmd: str) -> tuple[str, str]:
    """
    Look up the owner and executable name of process *pid* via psutil.

    Returns ``(username, command)`` where *command* is the basename of the
    first command-line argument.  *fallback_cmd* is used as the command when
    the command line is empty or reading it is denied.  If the process lookup
    itself fails (no such process, access denied, or an ``OSError``), the
    result is ``("?", fallback_cmd)``.
    """
    try:
        proc = psutil.Process(pid)
        owner = proc.username()
        try:
            argv = proc.cmdline()
        except psutil.AccessDenied:
            # The owner is known but the command line is not readable.
            argv = None
        if argv:
            return owner, os.path.basename(argv[0])
        return owner, fallback_cmd
    except (psutil.NoSuchProcess, psutil.AccessDenied, OSError):
        return "?", fallback_cmd
|
xpustat/amd.py
ADDED
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
from ._process import ProcessInfo, enrich_process
|
|
9
|
+
|
|
10
|
+
# ---------------------------------------------------------------------------
|
|
11
|
+
# Optional fast-path: amdsmi (Python bindings shipped with ROCm)
|
|
12
|
+
# ---------------------------------------------------------------------------
|
|
13
|
+
try:
|
|
14
|
+
import amdsmi as _amdsmi # type: ignore[import-untyped]
|
|
15
|
+
_HAS_AMDSMI = True
|
|
16
|
+
except ImportError:
|
|
17
|
+
_HAS_AMDSMI = False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AMDGPUStat:
    """Value holder for the statistics of one AMD GPU."""

    def __init__(
        self,
        gpu_index: int,
        name: str,
        mem_used_mb: int,
        mem_total_mb: int,
        util_pct: Optional[int] = None,
        temp_c: Optional[int] = None,
        power_w: Optional[float] = None,
        gfx_target: Optional[str] = None,
        processes: Optional[List[ProcessInfo]] = None,
    ) -> None:
        """Store the given values; a missing/empty *processes* becomes a new list."""
        self.gpu_index = gpu_index
        self.name = name
        #: ROCm GFX target, e.g. ``"gfx1100"`` (RDNA3) or ``"gfx942"`` (MI300X).
        self.gfx_target = gfx_target
        self.mem_used_mb = mem_used_mb
        self.mem_total_mb = mem_total_mb
        self.util_pct = util_pct
        self.temp_c = temp_c
        self.power_w = power_w
        self.processes: List[ProcessInfo] = processes if processes else []

    def to_dict(self) -> dict:
        """Return all fields as a dict; each process is serialised via its ``to_dict()``."""
        scalar_fields = (
            "gpu_index",
            "name",
            "gfx_target",
            "mem_used_mb",
            "mem_total_mb",
            "util_pct",
            "temp_c",
            "power_w",
        )
        payload = {key: getattr(self, key) for key in scalar_fields}
        payload["processes"] = [proc.to_dict() for proc in self.processes]
        return payload

    def __repr__(self) -> str:
        """Return a short summary with the index, name and memory figures."""
        parts = (
            f"index={self.gpu_index}",
            f"name={self.name!r}",
            f"mem_used={self.mem_used_mb}MB",
            f"mem_total={self.mem_total_mb}MB",
        )
        return "AMDGPUStat(" + ", ".join(parts) + ")"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class AMDGPUStatCollection:
    """
    Collects stats for all AMD GPUs.

    Query priority:
      1. ``amdsmi``  — Python bindings shipped with ROCm (fast, no subprocess).
                       Available after installing ROCm; also: pip install amdsmi
      2. ``amd-smi`` — subprocess fallback (always available with ROCm drivers).

    The collection is sized and iterable.  ``to_dict()`` serialises the GPUs
    only; the ``backend`` attribute is not part of the dictionary.
    """

    #: Which backend was used: ``"amdsmi"``, ``"subprocess"``, or ``"none"``.
    backend: str = "none"

    def __init__(self, gpus: List[AMDGPUStat], backend: str = "none") -> None:
        self.gpus = gpus
        self.backend = backend

    def to_dict(self) -> dict:
        """Return ``{"gpus": [...]}`` with one dict per GPU."""
        return {"gpus": [gpu.to_dict() for gpu in self.gpus]}

    def __len__(self) -> int:
        return len(self.gpus)

    def __iter__(self):
        return iter(self.gpus)

    def __repr__(self) -> str:
        return f"AMDGPUStatCollection({self.gpus!r})"

    # ------------------------------------------------------------------
    # Public factory
    # ------------------------------------------------------------------

    @classmethod
    def new_query(cls) -> "AMDGPUStatCollection":
        """
        Query AMD GPUs.  Uses ``amdsmi`` Python bindings when available,
        falls back to ``amd-smi`` subprocess.  Returns an empty collection
        if neither is present or the query fails.
        """
        if _HAS_AMDSMI:
            try:
                return cls._query_amdsmi()
            except Exception:
                # Deliberate best-effort: any failure in the bindings means
                # "try the CLI instead", not "abort the query".
                pass

        return cls._query_subprocess()

    # ------------------------------------------------------------------
    # Small parsing helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _bytes_to_mb(value) -> Optional[int]:
        """Convert a byte count to whole MiB; ``None`` for falsy or non-numeric input."""
        if not value:
            return None
        try:
            return int(value) // (1024 * 1024)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _gpu_entries(payload) -> List[dict]:
        """
        Return the per-GPU dict entries of a parsed ``amd-smi --json`` payload.

        Accepts both a bare list of entries and an object that wraps the list
        under ``"gpu_data"``; anything else yields an empty list.  Entries
        that are not dicts are dropped.
        """
        # NOTE(review): amd-smi releases appear to differ in whether the JSON
        # root is a list or a {"gpu_data": [...]} object — confirm against the
        # ROCm versions this tool targets.
        if isinstance(payload, dict):
            payload = payload.get("gpu_data", [])
        if not isinstance(payload, list):
            return []
        return [entry for entry in payload if isinstance(entry, dict)]

    @staticmethod
    def _section(entry, key: str) -> dict:
        """Return ``entry[key]`` when it is a dict, otherwise an empty dict."""
        value = entry.get(key) if isinstance(entry, dict) else None
        return value if isinstance(value, dict) else {}

    # ------------------------------------------------------------------
    # Fast path — amdsmi Python bindings (ships with ROCm)
    # ------------------------------------------------------------------

    @classmethod
    def _query_amdsmi(cls) -> "AMDGPUStatCollection":
        """
        Build the collection through the ``amdsmi`` bindings.

        Each metric group is read inside its own ``try`` block, so a failing
        call leaves that metric at its default instead of losing the GPU.
        The library is shut down in ``finally``; errors from ``amdsmi_init``
        and from listing the handles propagate to the caller.
        """
        _amdsmi.amdsmi_init()
        try:
            handles = _amdsmi.amdsmi_get_processor_handles()
            gpus: List[AMDGPUStat] = []

            for i, handle in enumerate(handles):
                # ── name + GFX target ────────────────────────────────────────
                name = "Unknown AMD GPU"
                gfx_target: Optional[str] = None
                try:
                    asic = _amdsmi.amdsmi_get_gpu_asic_info(handle)
                    name = (
                        asic.get("market_name")
                        or asic.get("asic_name")
                        or name
                    )
                    gfx_target = asic.get("target_graphics_version") or None
                except Exception:
                    pass

                # ── VRAM ────────────────────────────────────────────────────
                mem_used_mb = 0
                mem_total_mb = 0
                try:
                    vram_type = _amdsmi.AmdSmiMemoryType.VRAM
                    mem_used_mb = (
                        _amdsmi.amdsmi_get_gpu_memory_usage(handle, vram_type)
                        // (1024 * 1024)
                    )
                    mem_total_mb = (
                        _amdsmi.amdsmi_get_gpu_memory_total(handle, vram_type)
                        // (1024 * 1024)
                    )
                except Exception:
                    pass

                # ── utilization ─────────────────────────────────────────────
                util_pct: Optional[int] = None
                try:
                    activity = _amdsmi.amdsmi_get_gpu_activity(handle)
                    val = activity.get("gfx_activity")
                    if val is not None:
                        util_pct = int(val)
                except Exception:
                    pass

                # ── temperature ─────────────────────────────────────────────
                temp_c: Optional[int] = None
                try:
                    temp_c = int(
                        _amdsmi.amdsmi_get_temp_metric(
                            handle,
                            _amdsmi.AmdSmiTemperatureType.EDGE,
                            _amdsmi.AmdSmiTemperatureMetric.CURRENT,
                        )
                    )
                except Exception:
                    pass

                # ── power ───────────────────────────────────────────────────
                power_w: Optional[float] = None
                try:
                    pwr = _amdsmi.amdsmi_get_power_info(handle)
                    raw = pwr.get("average_socket_power") or pwr.get("current_socket_power")
                    if raw is not None:
                        power_w = float(raw)
                except Exception:
                    pass

                # ── processes ───────────────────────────────────────────────
                processes: List[ProcessInfo] = []
                try:
                    procs = _amdsmi.amdsmi_get_gpu_process_list(handle)
                    for p in procs:
                        pid = p.get("pid")
                        if pid is None:
                            continue
                        mem_val = p.get("memory_usage", {}).get("vram_mem", 0)
                        username, command = enrich_process(pid, "")
                        processes.append(
                            ProcessInfo(
                                pid=pid,
                                username=username,
                                command=command,
                                gpu_mem_mb=cls._bytes_to_mb(mem_val),
                            )
                        )
                except Exception:
                    pass

                gpus.append(
                    AMDGPUStat(
                        gpu_index=i,
                        name=name,
                        gfx_target=gfx_target,
                        mem_used_mb=mem_used_mb,
                        mem_total_mb=mem_total_mb,
                        util_pct=util_pct,
                        temp_c=temp_c,
                        power_w=power_w,
                        processes=processes,
                    )
                )

            return cls(gpus, backend="amdsmi")
        finally:
            try:
                _amdsmi.amdsmi_shut_down()
            except Exception:
                pass

    # ------------------------------------------------------------------
    # Fallback — amd-smi subprocess
    # ------------------------------------------------------------------

    @staticmethod
    def _run_amd_smi(args: List[str]) -> Optional[object]:
        """
        Run ``amd-smi <args> --json`` and return the parsed JSON document.

        Returns ``None`` when the command cannot be started, times out (10 s),
        exits non-zero, prints nothing, or prints invalid JSON.  The parsed
        root may be a dict or a list.
        """
        try:
            result = subprocess.run(
                ["amd-smi"] + args + ["--json"],
                capture_output=True, text=True, timeout=10,
            )
            if result.returncode != 0 or not result.stdout.strip():
                return None
            return json.loads(result.stdout)
        except (OSError, subprocess.TimeoutExpired, json.JSONDecodeError):
            # OSError covers FileNotFoundError as well as e.g. PermissionError.
            return None

    @classmethod
    def _query_subprocess(cls) -> "AMDGPUStatCollection":
        """
        Build the collection from ``amd-smi static|metric|process --json``.

        Returns an empty collection with backend ``"none"`` when ``amd-smi``
        is not on ``PATH`` or the static/metric queries fail.  Process data
        is optional.  Malformed or unexpected sections are skipped rather
        than raising.
        """
        if shutil.which("amd-smi") is None:
            return cls([], backend="none")

        static_data = cls._run_amd_smi(["static"])
        metric_data = cls._run_amd_smi(["metric"])
        process_data = cls._run_amd_smi(["process"])

        if static_data is None or metric_data is None:
            return cls([], backend="none")

        static_by_index = {
            e["gpu"]: e for e in cls._gpu_entries(static_data) if "gpu" in e
        }
        metric_by_index = {
            e["gpu"]: e for e in cls._gpu_entries(metric_data) if "gpu" in e
        }
        process_by_index = {}
        for p_info in cls._gpu_entries(process_data):
            idx = p_info.get("gpu")
            if idx is not None:
                p_list = p_info.get("process_list", [])
                process_by_index[idx] = p_list if isinstance(p_list, list) else []

        gpus: List[AMDGPUStat] = []
        for idx, static in static_by_index.items():
            metric = metric_by_index.get(idx, {})
            asic = cls._section(static, "asic")

            name = (
                asic.get("market_name")
                or cls._section(static, "board").get("model_number")
                or "Unknown AMD GPU"
            )
            gfx_target = asic.get("target_graphics_version") or None

            mem = cls._section(metric, "mem_usage")
            usage = cls._section(metric, "usage")
            pwr = cls._section(metric, "power")
            temp = cls._section(metric, "temperature")

            # Prefer the edge sensor, but fall back to hotspot whenever the
            # edge reading is missing OR present-but-unusable (e.g. "N/A").
            edge = temp.get("edge")
            temp_c = _int(edge) if edge else None
            if temp_c is None:
                temp_c = _int(temp.get("hotspot"))

            processes: List[ProcessInfo] = []
            for p_item in process_by_index.get(idx, []):
                p = cls._section(p_item, "process_info")
                pid = p.get("pid")
                if pid is None:
                    continue
                vram_mem = cls._section(
                    cls._section(p, "memory_usage"), "vram_mem"
                ).get("value", 0)
                cmd = p.get("name", "")
                username, command = enrich_process(pid, cmd)
                processes.append(
                    ProcessInfo(
                        pid=pid,
                        username=username,
                        command=command,
                        gpu_mem_mb=cls._bytes_to_mb(vram_mem),
                    )
                )

            gpus.append(AMDGPUStat(
                gpu_index=idx,
                name=name,
                gfx_target=gfx_target,
                mem_used_mb=_mb(mem.get("used_vram")),
                mem_total_mb=_mb(mem.get("total_vram")),
                util_pct=_int(usage.get("gfx_activity")),
                temp_c=temp_c,
                power_w=_float(pwr.get("socket_power")),
                processes=processes,
            ))

        return cls(gpus, backend="subprocess")
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# ---------------------------------------------------------------------------
|
|
326
|
+
# Helpers
|
|
327
|
+
# ---------------------------------------------------------------------------
|
|
328
|
+
|
|
329
|
+
def _mb(field) -> int:
|
|
330
|
+
if isinstance(field, dict):
|
|
331
|
+
try:
|
|
332
|
+
return int(field.get("value", 0))
|
|
333
|
+
except (TypeError, ValueError):
|
|
334
|
+
return 0
|
|
335
|
+
return 0
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _int(field) -> Optional[int]:
|
|
339
|
+
if isinstance(field, dict):
|
|
340
|
+
try:
|
|
341
|
+
return int(field.get("value", 0))
|
|
342
|
+
except (TypeError, ValueError):
|
|
343
|
+
return None
|
|
344
|
+
return None
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _float(field) -> Optional[float]:
|
|
348
|
+
if isinstance(field, dict):
|
|
349
|
+
try:
|
|
350
|
+
return float(field.get("value", 0))
|
|
351
|
+
except (TypeError, ValueError):
|
|
352
|
+
return None
|
|
353
|
+
return None
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
if __name__ == "__main__":
    # Script entry point: run one query, then print the backend that was
    # used followed by the collection as indented JSON.
    collection = AMDGPUStatCollection.new_query()
    print(f"backend: {collection.backend}")
    print(json.dumps(collection.to_dict(), indent=2))
|
xpustat/cambricon.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import shutil
|
|
3
|
+
import subprocess
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CambriconMLUStat:
    """Value holder for one Cambricon MLU accelerator as reported by cnmon."""

    def __init__(self, card_id: int, chip_id: int, name: str, mem_used_mb: int, mem_total_mb: int):
        """Store the given identifiers, model name and memory figures unchanged."""
        self.card_id, self.chip_id = card_id, chip_id
        self.name = name
        self.mem_used_mb, self.mem_total_mb = mem_used_mb, mem_total_mb

    def to_dict(self) -> dict:
        """Return the five fields as a plain dict."""
        field_names = ("card_id", "chip_id", "name", "mem_used_mb", "mem_total_mb")
        return {key: getattr(self, key) for key in field_names}

    def __repr__(self) -> str:
        """Return a summary listing the ids, the name and the memory figures."""
        parts = (
            f"card_id={self.card_id}",
            f"chip_id={self.chip_id}",
            f"name={self.name!r}",
            f"mem_used={self.mem_used_mb}MB",
            f"mem_total={self.mem_total_mb}MB",
        )
        return "CambriconMLUStat(" + ", ".join(parts) + ")"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class CambriconMLUStatCollection:
    """Sized, iterable container of the Cambricon MLU stats gathered via cnmon.

    The data comes from the ASCII table printed by ``cnmon info``, which has
    two data rows per device entry (similar layout to npu-smi):

        Row 1:  | CardID  Model        | Health   | Power(W)  Temp(C)         |
        Row 2:  | ChipID  Device       | Bus-Id   | Util(%)   MemUsed/Total   |

    Memory values are in MB.
    """

    def __init__(self, mlus: list[CambriconMLUStat]):
        self.mlus = mlus

    def to_dict(self) -> dict:
        """Return ``{"mlus": [...]}`` with one dict per device."""
        serialised = [device.to_dict() for device in self.mlus]
        return {"mlus": serialised}

    def __len__(self) -> int:
        return len(self.mlus)

    def __iter__(self):
        return iter(self.mlus)

    def __repr__(self) -> str:
        return f"CambriconMLUStatCollection({self.mlus!r})"

    @classmethod
    def new_query(cls) -> "CambriconMLUStatCollection":
        """
        Run ``cnmon info`` and wrap the parsed devices in a collection.

        The binary is looked up on ``PATH`` first and then at
        ``/usr/local/neuware/bin/cnmon``.  An empty collection is returned
        when the binary is not found, cannot be started, times out (10 s),
        exits non-zero, or prints nothing.
        """
        cnmon_path = shutil.which("cnmon")
        if cnmon_path is None:
            cnmon_path = shutil.which("/usr/local/neuware/bin/cnmon")
        if cnmon_path is None:
            return cls([])

        try:
            completed = subprocess.run(
                [cnmon_path, "info"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return cls([])

        produced_output = completed.returncode == 0 and completed.stdout.strip()
        if not produced_output:
            return cls([])

        return cls(_parse_cnmon_info(completed.stdout))
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# Parsing helpers
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
_DATA_ROW = re.compile(r"^\|(.+)\|(.+)\|(.+)\|$")
|
|
90
|
+
_MEM_FIELD = re.compile(r"(\d+)\s*/\s*(\d+)")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _parse_cnmon_info(output: str) -> list[CambriconMLUStat]:
|
|
94
|
+
"""
|
|
95
|
+
Parse the ASCII table from ``cnmon info``.
|
|
96
|
+
|
|
97
|
+
Each device occupies two consecutive data rows:
|
|
98
|
+
Row 1 → card_id, model name
|
|
99
|
+
Row 2 → chip_id, mem_used / mem_total
|
|
100
|
+
"""
|
|
101
|
+
mlus: list[CambriconMLUStat] = []
|
|
102
|
+
data_rows: list[tuple[str, str, str]] = []
|
|
103
|
+
|
|
104
|
+
for raw_line in output.splitlines():
|
|
105
|
+
line = raw_line.strip()
|
|
106
|
+
m = _DATA_ROW.match(line)
|
|
107
|
+
if not m:
|
|
108
|
+
continue
|
|
109
|
+
col1 = m.group(1).strip()
|
|
110
|
+
col2 = m.group(2).strip()
|
|
111
|
+
col3 = m.group(3).strip()
|
|
112
|
+
|
|
113
|
+
first_token = col1.split()[0] if col1.split() else ""
|
|
114
|
+
if not first_token.isdigit():
|
|
115
|
+
continue
|
|
116
|
+
|
|
117
|
+
data_rows.append((col1, col2, col3))
|
|
118
|
+
|
|
119
|
+
for i in range(0, len(data_rows) - 1, 2):
|
|
120
|
+
row1_col1, _, _ = data_rows[i]
|
|
121
|
+
row2_col1, _, row2_col3 = data_rows[i + 1]
|
|
122
|
+
|
|
123
|
+
try:
|
|
124
|
+
tokens1 = row1_col1.split()
|
|
125
|
+
card_id = int(tokens1[0])
|
|
126
|
+
name = tokens1[1] if len(tokens1) > 1 else "Unknown MLU"
|
|
127
|
+
|
|
128
|
+
chip_id = int(row2_col1.split()[0])
|
|
129
|
+
|
|
130
|
+
mem_match = _MEM_FIELD.search(row2_col3)
|
|
131
|
+
if mem_match:
|
|
132
|
+
mem_used_mb = int(mem_match.group(1))
|
|
133
|
+
mem_total_mb = int(mem_match.group(2))
|
|
134
|
+
else:
|
|
135
|
+
mem_used_mb, mem_total_mb = 0, 0
|
|
136
|
+
|
|
137
|
+
mlus.append(CambriconMLUStat(
|
|
138
|
+
card_id=card_id,
|
|
139
|
+
chip_id=chip_id,
|
|
140
|
+
name=name,
|
|
141
|
+
mem_used_mb=mem_used_mb,
|
|
142
|
+
mem_total_mb=mem_total_mb,
|
|
143
|
+
))
|
|
144
|
+
except (IndexError, ValueError):
|
|
145
|
+
continue
|
|
146
|
+
|
|
147
|
+
return mlus
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
if __name__ == "__main__":
    # Script entry point: query cnmon once and print the result as
    # indented JSON.
    import json

    mlu_stats = CambriconMLUStatCollection.new_query()
    print(json.dumps(mlu_stats.to_dict(), indent=2))
|