xpustat 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpustat/__init__.py ADDED
@@ -0,0 +1,77 @@
1
+ """
2
+ xpustat — unified accelerator stat tool.
3
+
4
+ Supports NVIDIA, AMD, Intel Gaudi, Intel Arc/GPU,
5
+ Huawei Ascend, Hygon DCU, Cambricon MLU, and Moore Threads.
6
+
7
+ Quick start::
8
+
9
+ import xpustat
10
+
11
+ # Query all vendors at once
12
+ stats = xpustat.query_all()
13
+
14
+ # Iterate every device across all vendors
15
+ for dev in stats:
16
+ print(dev.name, dev.mem_used_mb, "/", dev.mem_total_mb, "MB")
17
+
18
+ # Vendor-specific access
19
+ for gpu in stats.nvidia:
20
+ print(gpu.name, gpu.util_pct, "%", gpu.temp_c, "°C")
21
+
22
+ # Subscript by vendor key
23
+ for gpu in stats["amd"]:
24
+ print(gpu.power_w, "W")
25
+
26
+ # Query a single vendor
27
+ gpus = xpustat.query("nvidia")
28
+
29
+ # JSON string (for HTTP responses / files)
30
+ print(stats.to_json(indent=2))
31
+
32
+ # JSON with metadata wrapper
33
+ print(stats.to_json(metadata=True))
34
+ """
35
+
36
+ __version__ = "0.1.0"
37
+ __version_tuple__ = (0, 1, 0)
38
+
39
+ from ._process import ProcessInfo
40
+ from .xpu import AnyDeviceStat, XPUStatCollection, query, query_all
41
+ from .nvidia import NvidiaGPUStat, NvidiaGPUStatCollection
42
+ from .amd import AMDGPUStat, AMDGPUStatCollection
43
+ from .gaudi import GaudiAIPStat, GaudiAIPStatCollection
44
+ from .intel_gpu import IntelGPUStat, IntelGPUStatCollection
45
+ from .huawei import HuaweiNPUStat, HuaweiNPUStatCollection
46
+ from .hygon import HygonDCUStat, HygonDCUStatCollection
47
+ from .cambricon import CambriconMLUStat, CambriconMLUStatCollection
48
+ from .moorethreads import MooreThreadsGPUStat, MooreThreadsGPUStatCollection
49
+
50
+ __all__ = [
51
+ "__version__",
52
+ "__version_tuple__",
53
+ # Core API
54
+ "query_all",
55
+ "query",
56
+ "XPUStatCollection",
57
+ "AnyDeviceStat",
58
+ # Process info
59
+ "ProcessInfo",
60
+ # Per-vendor stat objects
61
+ "NvidiaGPUStat",
62
+ "NvidiaGPUStatCollection",
63
+ "AMDGPUStat",
64
+ "AMDGPUStatCollection",
65
+ "GaudiAIPStat",
66
+ "GaudiAIPStatCollection",
67
+ "IntelGPUStat",
68
+ "IntelGPUStatCollection",
69
+ "HuaweiNPUStat",
70
+ "HuaweiNPUStatCollection",
71
+ "HygonDCUStat",
72
+ "HygonDCUStatCollection",
73
+ "CambriconMLUStat",
74
+ "CambriconMLUStatCollection",
75
+ "MooreThreadsGPUStat",
76
+ "MooreThreadsGPUStatCollection",
77
+ ]
xpustat/_process.py ADDED
@@ -0,0 +1,59 @@
1
+ """
2
+ Shared process info helpers across all vendors.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import os
8
+ from typing import Optional
9
+
10
+ import psutil # type: ignore[import-untyped]
11
+
12
+
13
class ProcessInfo:
    """A single running process that holds accelerator memory.

    Plain data holder with four fields: the process id, the owning user
    name, a short command name, and the accelerator memory attributed to the
    process (``None`` when that amount is unknown).
    """

    __slots__ = ("pid", "username", "command", "gpu_mem_mb")

    def __init__(
        self,
        pid: int,
        username: str,
        command: str,
        gpu_mem_mb: Optional[int],
    ) -> None:
        self.pid, self.username = pid, username
        self.command, self.gpu_mem_mb = command, gpu_mem_mb

    def to_dict(self) -> dict:
        """Return the four fields as a plain dict, in declaration order."""
        return {field: getattr(self, field) for field in ProcessInfo.__slots__}

    def __repr__(self) -> str:
        """Render as ``user:command/pid(mem)``; unknown memory shows as ``?``."""
        if self.gpu_mem_mb is None:
            mem = "?"
        else:
            mem = f"{self.gpu_mem_mb}M"
        return f"{self.username}:{self.command}/{self.pid}({mem})"
41
+
42
+
43
def enrich_process(pid: int, fallback_cmd: str) -> tuple[str, str]:
    """
    Return ``(username, command)`` for a PID.

    ``command`` is the basename of the first command-line element reported by
    psutil.  ``fallback_cmd`` is used instead when the command line is empty,
    when its first element is blank, or when it cannot be read.

    Falls back to ``('?', fallback_cmd)`` if the process has already exited,
    access is denied, or the lookup fails with an ``OSError``.
    """
    try:
        proc = psutil.Process(pid)
        username = proc.username()
    except (psutil.NoSuchProcess, psutil.AccessDenied, OSError):
        return "?", fallback_cmd

    try:
        cmdline = proc.cmdline()
    except (psutil.NoSuchProcess, psutil.AccessDenied, OSError):
        # The owner is already known at this point; only the command name is
        # unavailable (e.g. the process exited between the two calls), so
        # keep the username instead of discarding it.
        return username, fallback_cmd

    # Guard against both an empty argv and a blank argv[0]; either would
    # otherwise produce an empty command name.
    executable = cmdline[0] if cmdline else ""
    return username, os.path.basename(executable) or fallback_cmd
xpustat/amd.py ADDED
@@ -0,0 +1,359 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import shutil
5
+ import subprocess
6
+ from typing import List, Optional
7
+
8
+ from ._process import ProcessInfo, enrich_process
9
+
10
+ # ---------------------------------------------------------------------------
11
+ # Optional fast-path: amdsmi (Python bindings shipped with ROCm)
12
+ # ---------------------------------------------------------------------------
13
+ try:
14
+ import amdsmi as _amdsmi # type: ignore[import-untyped]
15
+ _HAS_AMDSMI = True
16
+ except ImportError:
17
+ _HAS_AMDSMI = False
18
+
19
+
20
class AMDGPUStat:
    """Snapshot of one AMD GPU: identity, memory, load and client processes."""

    # Scalar attributes exported by ``to_dict``, in output order.
    _SCALAR_FIELDS = (
        "gpu_index",
        "name",
        "gfx_target",
        "mem_used_mb",
        "mem_total_mb",
        "util_pct",
        "temp_c",
        "power_w",
    )

    def __init__(
        self,
        gpu_index: int,
        name: str,
        mem_used_mb: int,
        mem_total_mb: int,
        util_pct: Optional[int] = None,
        temp_c: Optional[int] = None,
        power_w: Optional[float] = None,
        gfx_target: Optional[str] = None,
        processes: Optional[List[ProcessInfo]] = None,
    ) -> None:
        self.gpu_index, self.name = gpu_index, name
        self.mem_used_mb, self.mem_total_mb = mem_used_mb, mem_total_mb
        self.util_pct, self.temp_c, self.power_w = util_pct, temp_c, power_w
        #: ROCm GFX target, e.g. ``"gfx1100"`` (RDNA3) or ``"gfx942"`` (MI300X).
        self.gfx_target = gfx_target
        # A missing or empty argument always yields a fresh list.
        self.processes: List[ProcessInfo] = processes if processes else []

    def to_dict(self) -> dict:
        """Return the stat as a plain dict; each process is expanded via ``to_dict``."""
        payload = {key: getattr(self, key) for key in AMDGPUStat._SCALAR_FIELDS}
        payload["processes"] = [proc.to_dict() for proc in self.processes]
        return payload

    def __repr__(self) -> str:
        """Short form showing index, name and memory figures only."""
        parts = ", ".join(
            (
                f"index={self.gpu_index}",
                f"name={self.name!r}",
                f"mem_used={self.mem_used_mb}MB",
                f"mem_total={self.mem_total_mb}MB",
            )
        )
        return f"AMDGPUStat({parts})"
64
+
65
+
66
class AMDGPUStatCollection:
    """
    Collects stats for all AMD GPUs.

    Query priority:
    1. ``amdsmi`` — Python bindings shipped with ROCm (fast, no subprocess).
       Available after installing ROCm; also: pip install amdsmi
    2. ``amd-smi`` — subprocess fallback (always available with ROCm drivers).

    The collection is sized and iterable; each item is an ``AMDGPUStat``.
    """

    #: Which backend was used: ``"amdsmi"``, ``"subprocess"``, or ``"none"``.
    backend: str = "none"

    #: Divisor applied to the raw memory counts of both backends to get MB.
    _BYTES_PER_MB = 1024 * 1024

    def __init__(self, gpus: "List[AMDGPUStat]", backend: str = "none") -> None:
        self.gpus = gpus
        self.backend = backend

    def to_dict(self) -> dict:
        """Return ``{"gpus": [...]}`` with one dict per GPU."""
        return {"gpus": [gpu.to_dict() for gpu in self.gpus]}

    def __len__(self) -> int:
        return len(self.gpus)

    def __iter__(self):
        return iter(self.gpus)

    def __repr__(self) -> str:
        return f"AMDGPUStatCollection({self.gpus!r})"

    # ------------------------------------------------------------------
    # Public factory
    # ------------------------------------------------------------------

    @classmethod
    def new_query(cls) -> "AMDGPUStatCollection":
        """
        Query AMD GPUs.  Uses ``amdsmi`` Python bindings when available,
        falls back to ``amd-smi`` subprocess.  Returns an empty collection
        if neither is present or the query fails.
        """
        if _HAS_AMDSMI:
            try:
                return cls._query_amdsmi()
            except Exception:
                # Deliberate best-effort: any failure in the bindings simply
                # hands over to the subprocess backend below.
                pass

        return cls._query_subprocess()

    # ------------------------------------------------------------------
    # Small parsing helpers shared by both backends
    # ------------------------------------------------------------------

    @staticmethod
    def _as_float(value) -> Optional[float]:
        """Return *value* as a float, or ``None`` when missing or non-numeric."""
        if value is None or isinstance(value, bool):
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _section(mapping, key: str) -> dict:
        """Return ``mapping[key]`` when it is a dict, otherwise an empty dict."""
        value = mapping.get(key) if isinstance(mapping, dict) else None
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _gpu_entries(payload) -> "List[dict]":
        """
        Normalise parsed ``amd-smi --json`` output to a list of per-GPU dicts.

        Accepts either a bare list of entries or a dict that wraps the list
        under ``"gpu_data"``.  Anything else (including ``None``) yields an
        empty list, and non-dict items inside the list are dropped.
        """
        if isinstance(payload, dict):
            payload = payload.get("gpu_data", [])
        if not isinstance(payload, list):
            return []
        return [entry for entry in payload if isinstance(entry, dict)]

    # ------------------------------------------------------------------
    # Fast path — amdsmi Python bindings (ships with ROCm)
    # ------------------------------------------------------------------

    @classmethod
    def _query_amdsmi(cls) -> "AMDGPUStatCollection":
        """
        Query every processor handle through the ``amdsmi`` bindings.

        Errors from initialisation or handle enumeration propagate to the
        caller (``new_query`` turns them into the subprocess fallback).  The
        library is always shut down again, and shutdown errors are ignored.
        """
        _amdsmi.amdsmi_init()
        try:
            handles = _amdsmi.amdsmi_get_processor_handles()
            gpus = [
                cls._stat_from_handle(index, handle)
                for index, handle in enumerate(handles)
            ]
            return cls(gpus, backend="amdsmi")
        finally:
            try:
                _amdsmi.amdsmi_shut_down()
            except Exception:
                pass

    @classmethod
    def _stat_from_handle(cls, index: int, handle) -> "AMDGPUStat":
        """
        Build one ``AMDGPUStat`` from an ``amdsmi`` processor handle.

        Every metric is read independently and best-effort: a failing or
        unsupported call leaves that field at its default instead of
        aborting the whole query.
        """
        # ── name + GFX target ────────────────────────────────────────────
        name = "Unknown AMD GPU"
        gfx_target: Optional[str] = None
        try:
            asic = _amdsmi.amdsmi_get_gpu_asic_info(handle)
            name = asic.get("market_name") or asic.get("asic_name") or name
            gfx_target = asic.get("target_graphics_version") or None
        except Exception:
            pass

        # ── VRAM (raw counts are divided down to MB) ─────────────────────
        mem_used_mb = 0
        mem_total_mb = 0
        try:
            vram_type = _amdsmi.AmdSmiMemoryType.VRAM
            mem_used_mb = (
                _amdsmi.amdsmi_get_gpu_memory_usage(handle, vram_type)
                // cls._BYTES_PER_MB
            )
            mem_total_mb = (
                _amdsmi.amdsmi_get_gpu_memory_total(handle, vram_type)
                // cls._BYTES_PER_MB
            )
        except Exception:
            pass

        # ── utilization ──────────────────────────────────────────────────
        util_pct: Optional[int] = None
        try:
            activity = _amdsmi.amdsmi_get_gpu_activity(handle)
            val = activity.get("gfx_activity")
            if val is not None:
                util_pct = int(val)
        except Exception:
            pass

        # ── temperature ──────────────────────────────────────────────────
        # Try the edge sensor first and fall back to hotspot, mirroring the
        # subprocess backend, so a device without a usable edge reading
        # still reports a temperature.
        temp_c: Optional[int] = None
        for sensor_name in ("EDGE", "HOTSPOT"):
            try:
                sensor = getattr(_amdsmi.AmdSmiTemperatureType, sensor_name)
                temp_c = int(
                    _amdsmi.amdsmi_get_temp_metric(
                        handle,
                        sensor,
                        _amdsmi.AmdSmiTemperatureMetric.CURRENT,
                    )
                )
                break
            except Exception:
                continue

        # ── power ────────────────────────────────────────────────────────
        # Each field is parsed on its own: a non-numeric first reading
        # (e.g. "N/A") must not mask a valid second one.  A zero reading is
        # kept only when no non-zero reading is available.
        power_w: Optional[float] = None
        try:
            pwr = _amdsmi.amdsmi_get_power_info(handle)
            for key in ("average_socket_power", "current_socket_power"):
                reading = cls._as_float(pwr.get(key))
                if reading:
                    power_w = reading
                    break
                if reading is not None and power_w is None:
                    power_w = reading
        except Exception:
            pass

        # ── processes ────────────────────────────────────────────────────
        processes: List[ProcessInfo] = []
        try:
            proc_entries = list(_amdsmi.amdsmi_get_gpu_process_list(handle))
        except Exception:
            proc_entries = []
        for entry in proc_entries:
            # Handled per entry so one malformed record does not discard the
            # records that follow it.
            try:
                pid = entry.get("pid")
                if pid is None:
                    continue
                mem_val = entry.get("memory_usage", {}).get("vram_mem", 0)
                gpu_mem_mb = (
                    int(mem_val) // cls._BYTES_PER_MB if mem_val else None
                )
                # Use the reported process name as the command fallback, as
                # the subprocess backend does.
                username, command = enrich_process(
                    pid, str(entry.get("name") or "")
                )
                processes.append(
                    ProcessInfo(
                        pid=pid,
                        username=username,
                        command=command,
                        gpu_mem_mb=gpu_mem_mb,
                    )
                )
            except Exception:
                continue

        return AMDGPUStat(
            gpu_index=index,
            name=name,
            gfx_target=gfx_target,
            mem_used_mb=mem_used_mb,
            mem_total_mb=mem_total_mb,
            util_pct=util_pct,
            temp_c=temp_c,
            power_w=power_w,
            processes=processes,
        )

    # ------------------------------------------------------------------
    # Fallback — amd-smi subprocess
    # ------------------------------------------------------------------

    @staticmethod
    def _run_amd_smi(args: "List[str]") -> Optional[object]:
        """
        Run ``amd-smi <args> --json`` and return the parsed JSON.

        Returns ``None`` when the command cannot be started, times out
        (10 s), exits non-zero, prints nothing, or prints output that is not
        valid JSON.  The parsed value may be a dict or a list.
        """
        try:
            result = subprocess.run(
                ["amd-smi"] + args + ["--json"],
                capture_output=True, text=True, timeout=10,
            )
            if result.returncode != 0 or not result.stdout.strip():
                return None
            return json.loads(result.stdout)
        except (OSError, subprocess.SubprocessError, ValueError):
            # OSError covers a missing or non-executable binary;
            # SubprocessError covers the timeout; ValueError covers both
            # JSON and text-decoding failures.
            return None

    @classmethod
    def _processes_from_smi(cls, entries) -> "List[ProcessInfo]":
        """Convert one GPU's ``process_list`` items into ``ProcessInfo`` objects."""
        processes: List[ProcessInfo] = []
        for item in entries:
            # Entries that are not dicts, or whose "process_info" is not a
            # dict, come out as {} here and are skipped for lack of a pid.
            info = cls._section(item, "process_info")
            pid = info.get("pid")
            if pid is None:
                continue
            vram = cls._section(cls._section(info, "memory_usage"), "vram_mem")
            vram_raw = cls._as_float(vram.get("value"))
            mem_mb = int(vram_raw) // cls._BYTES_PER_MB if vram_raw else None
            username, command = enrich_process(pid, str(info.get("name") or ""))
            processes.append(
                ProcessInfo(
                    pid=pid,
                    username=username,
                    command=command,
                    gpu_mem_mb=mem_mb,
                )
            )
        return processes

    @classmethod
    def _query_subprocess(cls) -> "AMDGPUStatCollection":
        """
        Query GPUs by running ``amd-smi static``, ``metric`` and ``process``.

        Returns an empty collection with backend ``"none"`` when ``amd-smi``
        is not on ``PATH`` or when the static or metric query yields nothing.
        Process information is optional.
        """
        if shutil.which("amd-smi") is None:
            return cls([], backend="none")

        static_data = cls._run_amd_smi(["static"])
        metric_data = cls._run_amd_smi(["metric"])
        process_data = cls._run_amd_smi(["process"])

        if static_data is None or metric_data is None:
            return cls([], backend="none")

        # All three payloads go through the same normaliser so that both the
        # wrapped ({"gpu_data": [...]}) and the bare-list layout are accepted.
        static_by_index = {
            e["gpu"]: e for e in cls._gpu_entries(static_data) if "gpu" in e
        }
        metric_by_index = {
            e["gpu"]: e for e in cls._gpu_entries(metric_data) if "gpu" in e
        }
        process_by_index = {}
        for p_info in cls._gpu_entries(process_data):
            idx = p_info.get("gpu")
            process_list = p_info.get("process_list")
            if idx is not None and isinstance(process_list, list):
                process_by_index[idx] = process_list

        gpus: List[AMDGPUStat] = []
        for idx, static in static_by_index.items():
            metric = metric_by_index.get(idx, {})
            asic = cls._section(static, "asic")

            name = (
                asic.get("market_name")
                or cls._section(static, "board").get("model_number")
                or "Unknown AMD GPU"
            )
            gfx_target = asic.get("target_graphics_version") or None

            mem = cls._section(metric, "mem_usage")
            usage = cls._section(metric, "usage")
            pwr = cls._section(metric, "power")
            temp = cls._section(metric, "temperature")

            # Parse edge first and only then fall back to hotspot; choosing
            # between the raw fields with ``or`` would let an unparsable but
            # truthy edge entry hide a valid hotspot reading.
            temp_c = _int(temp.get("edge"))
            if temp_c is None:
                temp_c = _int(temp.get("hotspot"))

            gpus.append(AMDGPUStat(
                gpu_index=idx,
                name=name,
                gfx_target=gfx_target,
                mem_used_mb=_mb(mem.get("used_vram")),
                mem_total_mb=_mb(mem.get("total_vram")),
                util_pct=_int(usage.get("gfx_activity")),
                temp_c=temp_c,
                power_w=_float(pwr.get("socket_power")),
                processes=cls._processes_from_smi(process_by_index.get(idx, [])),
            ))

        return cls(gpus, backend="subprocess")
323
+
324
+
325
+ # ---------------------------------------------------------------------------
326
+ # Helpers
327
+ # ---------------------------------------------------------------------------
328
+
329
+ def _mb(field) -> int:
330
+ if isinstance(field, dict):
331
+ try:
332
+ return int(field.get("value", 0))
333
+ except (TypeError, ValueError):
334
+ return 0
335
+ return 0
336
+
337
+
338
+ def _int(field) -> Optional[int]:
339
+ if isinstance(field, dict):
340
+ try:
341
+ return int(field.get("value", 0))
342
+ except (TypeError, ValueError):
343
+ return None
344
+ return None
345
+
346
+
347
+ def _float(field) -> Optional[float]:
348
+ if isinstance(field, dict):
349
+ try:
350
+ return float(field.get("value", 0))
351
+ except (TypeError, ValueError):
352
+ return None
353
+ return None
354
+
355
+
356
if __name__ == "__main__":
    # Manual smoke test: report which backend answered, then dump every GPU
    # as indented JSON.
    col = AMDGPUStatCollection.new_query()
    backend_line = f"backend: {col.backend}"
    print(backend_line)
    report = json.dumps(col.to_dict(), indent=2)
    print(report)
xpustat/cambricon.py ADDED
@@ -0,0 +1,153 @@
1
+ import re
2
+ import shutil
3
+ import subprocess
4
+
5
+
6
class CambriconMLUStat:
    """Identity and memory figures for a single Cambricon MLU, parsed from cnmon."""

    def __init__(self, card_id: int, chip_id: int, name: str, mem_used_mb: int, mem_total_mb: int):
        self.card_id, self.chip_id = card_id, chip_id
        self.name = name
        self.mem_used_mb, self.mem_total_mb = mem_used_mb, mem_total_mb

    def to_dict(self) -> dict:
        """Return the five fields as a plain dict."""
        return dict(
            card_id=self.card_id,
            chip_id=self.chip_id,
            name=self.name,
            mem_used_mb=self.mem_used_mb,
            mem_total_mb=self.mem_total_mb,
        )

    def __repr__(self) -> str:
        """Constructor-like form listing ids, name and memory figures."""
        parts = ", ".join(
            (
                f"card_id={self.card_id}",
                f"chip_id={self.chip_id}",
                f"name={self.name!r}",
                f"mem_used={self.mem_used_mb}MB",
                f"mem_total={self.mem_total_mb}MB",
            )
        )
        return f"CambriconMLUStat({parts})"
30
+
31
+
32
class CambriconMLUStatCollection:
    """Collects stats for all Cambricon MLUs via cnmon.

    Parses the ASCII table from ``cnmon info``, which has two data rows per
    device entry (similar layout to npu-smi):

      Row 1: | CardID  Model   | Health | Power(W)  Temp(C)       |
      Row 2: | ChipID  Device  | Bus-Id | Util(%)   MemUsed/Total |

    Memory values are in MB.

    The collection is sized and iterable; each item is a
    ``CambriconMLUStat``.
    """

    def __init__(self, mlus: "list[CambriconMLUStat]"):
        self.mlus = mlus

    def to_dict(self) -> dict:
        """Return ``{"mlus": [...]}`` with one dict per MLU."""
        return {"mlus": [mlu.to_dict() for mlu in self.mlus]}

    def __len__(self) -> int:
        return len(self.mlus)

    def __iter__(self):
        return iter(self.mlus)

    def __repr__(self) -> str:
        return f"CambriconMLUStatCollection({self.mlus!r})"

    @classmethod
    def new_query(cls) -> "CambriconMLUStatCollection":
        """
        Query cnmon and return a CambriconMLUStatCollection.

        ``cnmon`` is looked up on ``PATH`` first and then at
        ``/usr/local/neuware/bin/cnmon``.  Returns an empty collection if
        cnmon is not installed, cannot be executed, times out (10 s), exits
        non-zero, or prints nothing.
        """
        binary = shutil.which("cnmon") or shutil.which("/usr/local/neuware/bin/cnmon")
        if binary is None:
            return cls([])

        try:
            result = subprocess.run(
                [binary, "info"],
                capture_output=True,
                text=True,
                # Undecodable bytes must not abort the query; the parser
                # only looks at digits and table separators.
                errors="replace",
                timeout=10,
            )
        except (OSError, subprocess.SubprocessError):
            # OSError covers a binary that vanished or cannot be executed;
            # SubprocessError covers the timeout.
            return cls([])

        if result.returncode != 0 or not result.stdout.strip():
            return cls([])

        return cls(_parse_cnmon_info(result.stdout))
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Parsing helpers
87
+ # ---------------------------------------------------------------------------
88
+
89
# A table row with exactly three pipe-delimited cells.
_DATA_ROW = re.compile(r"^\|(.+)\|(.+)\|(.+)\|$")
# "<used> / <total>" inside a cell.
_MEM_FIELD = re.compile(r"(\d+)\s*/\s*(\d+)")


def _parse_cnmon_info(output: str) -> list[CambriconMLUStat]:
    """
    Turn the ASCII table printed by ``cnmon info`` into MLU stat objects.

    Only three-cell rows whose first cell starts with a number are treated as
    data.  Data rows are consumed two at a time:
      first row of the pair  → card_id, model name
      second row of the pair → chip_id, mem_used / mem_total
    A trailing unpaired row is ignored, a pair that fails to parse is
    skipped, and a missing memory field is reported as 0 / 0.
    """
    # Pass 1: collect the stripped cells of every data row.
    rows: list[tuple[str, ...]] = []
    for line in output.splitlines():
        matched = _DATA_ROW.match(line.strip())
        if matched is None:
            continue
        cells = tuple(cell.strip() for cell in matched.groups())
        leading = cells[0].split()
        if leading and leading[0].isdigit():
            rows.append(cells)

    # Pass 2: walk the rows pairwise (even-indexed row with its successor).
    devices: list[CambriconMLUStat] = []
    for head, tail in zip(rows[0::2], rows[1::2]):
        try:
            head_tokens = head[0].split()
            card_id = int(head_tokens[0])
            if len(head_tokens) > 1:
                name = head_tokens[1]
            else:
                name = "Unknown MLU"

            chip_id = int(tail[0].split()[0])

            found = _MEM_FIELD.search(tail[2])
            used, total = (int(found.group(1)), int(found.group(2))) if found else (0, 0)

            devices.append(CambriconMLUStat(
                card_id=card_id,
                chip_id=chip_id,
                name=name,
                mem_used_mb=used,
                mem_total_mb=total,
            ))
        except (IndexError, ValueError):
            continue

    return devices
148
+
149
+
150
if __name__ == "__main__":
    # Manual smoke test: query cnmon and dump the result as indented JSON.
    import json
    collection = CambriconMLUStatCollection.new_query()
    report = json.dumps(collection.to_dict(), indent=2)
    print(report)