xputop 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xputop-0.1.0/LICENSE +92 -0
- xputop-0.1.0/MANIFEST.in +4 -0
- xputop-0.1.0/PKG-INFO +235 -0
- xputop-0.1.0/README.md +199 -0
- xputop-0.1.0/pyproject.toml +54 -0
- xputop-0.1.0/setup.cfg +4 -0
- xputop-0.1.0/tests/test_amd.py +76 -0
- xputop-0.1.0/tests/test_config.py +89 -0
- xputop-0.1.0/tests/test_intel.py +32 -0
- xputop-0.1.0/tests/test_monitor.py +55 -0
- xputop-0.1.0/tests/test_npu.py +113 -0
- xputop-0.1.0/tests/test_tpu.py +50 -0
- xputop-0.1.0/xputop/__init__.py +4 -0
- xputop-0.1.0/xputop/__main__.py +6 -0
- xputop-0.1.0/xputop/alert/__init__.py +0 -0
- xputop-0.1.0/xputop/alert/config.py +278 -0
- xputop-0.1.0/xputop/alert/email_alert.py +141 -0
- xputop-0.1.0/xputop/cli.py +587 -0
- xputop-0.1.0/xputop/core/__init__.py +0 -0
- xputop-0.1.0/xputop/core/backend.py +77 -0
- xputop-0.1.0/xputop/core/backends/__init__.py +131 -0
- xputop-0.1.0/xputop/core/backends/amd.py +243 -0
- xputop-0.1.0/xputop/core/backends/custom.py +157 -0
- xputop-0.1.0/xputop/core/backends/demo.py +51 -0
- xputop-0.1.0/xputop/core/backends/intel.py +110 -0
- xputop-0.1.0/xputop/core/backends/npu.py +296 -0
- xputop-0.1.0/xputop/core/backends/nvidia.py +156 -0
- xputop-0.1.0/xputop/core/backends/tpu.py +140 -0
- xputop-0.1.0/xputop/core/monitor.py +281 -0
- xputop-0.1.0/xputop/core/recorder.py +161 -0
- xputop-0.1.0/xputop/core/sysinfo.py +345 -0
- xputop-0.1.0/xputop/ui/__init__.py +0 -0
- xputop-0.1.0/xputop/ui/tui.py +643 -0
- xputop-0.1.0/xputop/ui/viewer.py +210 -0
- xputop-0.1.0/xputop.egg-info/PKG-INFO +235 -0
- xputop-0.1.0/xputop.egg-info/SOURCES.txt +38 -0
- xputop-0.1.0/xputop.egg-info/dependency_links.txt +1 -0
- xputop-0.1.0/xputop.egg-info/entry_points.txt +2 -0
- xputop-0.1.0/xputop.egg-info/requires.txt +7 -0
- xputop-0.1.0/xputop.egg-info/top_level.txt +1 -0
xputop-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity.
|
|
18
|
+
|
|
19
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
20
|
+
exercising permissions granted by this License.
|
|
21
|
+
|
|
22
|
+
"Source" form shall mean the preferred form for making modifications.
|
|
23
|
+
|
|
24
|
+
"Object" form shall mean any form resulting from mechanical
|
|
25
|
+
transformation or translation of a Source form.
|
|
26
|
+
|
|
27
|
+
"Work" shall mean the work of authorship made available under the License.
|
|
28
|
+
|
|
29
|
+
"Contribution" shall mean any work of authorship submitted to the Licensor
|
|
30
|
+
for inclusion in the Work.
|
|
31
|
+
|
|
32
|
+
"Contributor" shall mean Licensor and any Legal Entity on behalf of whom
|
|
33
|
+
a Contribution has been received by the Licensor.
|
|
34
|
+
|
|
35
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
36
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
37
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
38
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
39
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
40
|
+
Work and such Derivative Works in Source or Object form.
|
|
41
|
+
|
|
42
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
43
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
44
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
45
|
+
patent license to make, have made, use, offer to sell, sell,
|
|
46
|
+
import, and otherwise transfer the Work.
|
|
47
|
+
|
|
48
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
49
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
50
|
+
modifications, and in Source or Object form, provided that You
|
|
51
|
+
meet the following conditions:
|
|
52
|
+
|
|
53
|
+
(a) You must give any other recipients of the Work or
|
|
54
|
+
Derivative Works a copy of this License; and
|
|
55
|
+
|
|
56
|
+
(b) You must cause any modified files to carry prominent notices
|
|
57
|
+
stating that You changed the files; and
|
|
58
|
+
|
|
59
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
60
|
+
that You distribute, all copyright, patent, trademark, and
|
|
61
|
+
attribution notices from the Source form of the Work; and
|
|
62
|
+
|
|
63
|
+
(d) If the Work includes a "NOTICE" text file, You must include
|
|
64
|
+
a readable copy of the attribution notices contained within
|
|
65
|
+
such NOTICE file.
|
|
66
|
+
|
|
67
|
+
5. Submission of Contributions.
|
|
68
|
+
|
|
69
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
70
|
+
names, trademarks, service marks, or product names of the Licensor.
|
|
71
|
+
|
|
72
|
+
7. Disclaimer of Warranty. The Work is provided on an "AS IS" BASIS,
|
|
73
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
|
|
74
|
+
|
|
75
|
+
8. Limitation of Liability. In no event shall any Contributor be
|
|
76
|
+
liable to You for damages.
|
|
77
|
+
|
|
78
|
+
9. Accepting Warranty or Additional Liability.
|
|
79
|
+
|
|
80
|
+
Copyright 2025 nputop contributors
|
|
81
|
+
|
|
82
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
83
|
+
you may not use this file except in compliance with the License.
|
|
84
|
+
You may obtain a copy of the License at
|
|
85
|
+
|
|
86
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
87
|
+
|
|
88
|
+
Unless required by applicable law or agreed to in writing, software
|
|
89
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
90
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
91
|
+
See the License for the specific language governing permissions and
|
|
92
|
+
limitations under the License.
|
xputop-0.1.0/MANIFEST.in
ADDED
xputop-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xputop
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An interactive Huawei Ascend NPU process viewer and monitor, inspired by nvitop.
|
|
5
|
+
Author-email: Zander Zhao <zhaozhaongrui@mails.ucas.ac.cn>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/ZanderZhao/xputop
|
|
8
|
+
Project-URL: Repository, https://github.com/ZanderZhao/xputop
|
|
9
|
+
Project-URL: Issues, https://github.com/ZanderZhao/xputop/issues
|
|
10
|
+
Keywords: huawei,ascend,npu,monitor,top,npu-smi,gpu
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Environment :: Console :: Curses
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: System Administrators
|
|
16
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
17
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Topic :: System :: Monitoring
|
|
25
|
+
Classifier: Topic :: Utilities
|
|
26
|
+
Requires-Python: >=3.8
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: rich>=13.0.0
|
|
30
|
+
Requires-Dist: psutil>=5.9.0
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
33
|
+
Requires-Dist: build; extra == "dev"
|
|
34
|
+
Requires-Dist: twine; extra == "dev"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# xputop
|
|
38
|
+
|
|
39
|
+
An interactive Huawei Ascend NPU process viewer and monitor, inspired by [nvitop](https://github.com/XuehaiPan/nvitop).
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- Real-time monitoring of Huawei Ascend NPU devices (HBM memory, AI Core utilization, temperature, power)
|
|
44
|
+
- Rich terminal UI with per-device cards, usage bars, and process lists
|
|
45
|
+
- **Fixed process row count** per NPU card to prevent UI jitter (`-p N`, default 3)
|
|
46
|
+
- **Sparkline history curves** (like nvtop) for NPU, CPU, memory, disk, and network metrics
|
|
47
|
+
- **Detail mode** (`-d`) for combined chart + CPU + memory + disk view
|
|
48
|
+
- **Full detail mode** (`-a`) adds network I/O monitoring and system process tree
|
|
49
|
+
- **CPU per-core utilization** panel with load averages
|
|
50
|
+
- **System memory** (RAM + Swap) panel
|
|
51
|
+
- **Disk usage** panel with configurable mount points
|
|
52
|
+
- **Network I/O** panel with rate history (auto-skips if root privileges unavailable)
|
|
53
|
+
- Multi-card support with summary table
|
|
54
|
+
- Configurable heartbeat / refresh interval
|
|
55
|
+
- **Threshold-based email alerting** for both NPU and system metrics
|
|
56
|
+
- Configuration stored in `~/.config/xputop/config.toml` (XDG-compliant)
|
|
57
|
+
- **Chinese help** via `--zh`
|
|
58
|
+
- JSON output mode for scripting and automation
|
|
59
|
+
- Demo mode for development without hardware
|
|
60
|
+
- **Lightweight design** — non-blocking CPU sampling, no extra subprocesses
|
|
61
|
+
|
|
62
|
+
## Installation
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install xputop
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Quick Start
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Launch the interactive TUI
|
|
72
|
+
xputop
|
|
73
|
+
|
|
74
|
+
# Detail mode: chart + CPU + memory + disk
|
|
75
|
+
xputop -d
|
|
76
|
+
|
|
77
|
+
# Full detail: + network I/O + process tree
|
|
78
|
+
xputop -a
|
|
79
|
+
|
|
80
|
+
# Show with sparkline curves + CPU + memory + disk
|
|
81
|
+
xputop -C --cpu -m -D
|
|
82
|
+
|
|
83
|
+
# Control process display rows (default 3, 0=hide, -1=show all)
|
|
84
|
+
xputop -p 5 -C
|
|
85
|
+
xputop -p 0 # hide all processes
|
|
86
|
+
xputop -p -1 # show all (may jitter)
|
|
87
|
+
|
|
88
|
+
# Monitor specific disk paths
|
|
89
|
+
xputop -C -D /data /home
|
|
90
|
+
|
|
91
|
+
# Run in demo mode (no Ascend hardware needed)
|
|
92
|
+
xputop --demo -d
|
|
93
|
+
xputop --demo -a
|
|
94
|
+
|
|
95
|
+
# Set custom refresh interval (seconds)
|
|
96
|
+
xputop -i 5
|
|
97
|
+
|
|
98
|
+
# Print a single snapshot and exit
|
|
99
|
+
xputop once --cpu --mem --disk
|
|
100
|
+
|
|
101
|
+
# Print snapshot as JSON
|
|
102
|
+
xputop once --json --cpu --mem --disk /data
|
|
103
|
+
|
|
104
|
+
# Show Chinese help
|
|
105
|
+
xputop --zh
|
|
106
|
+
|
|
107
|
+
# Generate a default configuration file
|
|
108
|
+
xputop config --generate
|
|
109
|
+
|
|
110
|
+
# Send a test alert email
|
|
111
|
+
xputop alert-test
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Command-Line Options
|
|
115
|
+
|
|
116
|
+
| Short | Long | Default | Description |
|
|
117
|
+
|-------|------------------|---------|--------------------------------------------------|
|
|
118
|
+
| `-V` | `--version` | — | Show version and exit |
|
|
119
|
+
| `-i` | `--interval` | 2.0 | Refresh interval in seconds |
|
|
120
|
+
| `-c` | `--config` | — | Path to configuration file |
|
|
121
|
+
| `-C` | `--chart` | off | Enable nvtop-style sparkline history curves |
|
|
122
|
+
| `-l` | `--chart-length` | 120 | Number of history points for sparklines |
|
|
123
|
+
| `-p` | `--processes` | 3 | Process rows per NPU card (0=hide, -1=all) |
|
|
124
|
+
| | `--cpu` | off | Show CPU per-core utilization panel |
|
|
125
|
+
| `-m` | `--mem` | off | Show system memory panel |
|
|
126
|
+
| `-D` | `--disk` | off | Show disk usage panel (optionally specify paths) |
|
|
127
|
+
| `-d` | `--detail` | off | Detail mode = `--chart --cpu --mem --disk` |
|
|
128
|
+
| `-a` | `--detail-all` | off | Full detail = `-d` + network + process tree |
|
|
129
|
+
| | `--demo` | off | Demo mode with fake NPU data |
|
|
130
|
+
| | `--demo-devices` | 4 | Number of simulated NPU devices |
|
|
131
|
+
| | `--zh` | — | Show Chinese help |
|
|
132
|
+
|
|
133
|
+
## Configuration
|
|
134
|
+
|
|
135
|
+
Configuration is stored at `~/.config/xputop/config.toml`. Generate a default:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
xputop config --generate
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
You can also set `XPUTOP_CONFIG_DIR` to use a custom directory.
|
|
142
|
+
|
|
143
|
+
### Example config.toml
|
|
144
|
+
|
|
145
|
+
```ini
|
|
146
|
+
[general]
|
|
147
|
+
interval = 2.0
|
|
148
|
+
demo = false
|
|
149
|
+
demo_devices = 4
|
|
150
|
+
|
|
151
|
+
[display]
|
|
152
|
+
chart = true
|
|
153
|
+
chart_length = 120
|
|
154
|
+
cpu = true
|
|
155
|
+
mem = true
|
|
156
|
+
disk = true
|
|
157
|
+
disk_paths = /, /data
|
|
158
|
+
process_rows = 3
|
|
159
|
+
|
|
160
|
+
[email]
|
|
161
|
+
enabled = true
|
|
162
|
+
smtp_host = smtp.gmail.com
|
|
163
|
+
smtp_port = 587
|
|
164
|
+
use_tls = true
|
|
165
|
+
username = you@gmail.com
|
|
166
|
+
password = your_app_password
|
|
167
|
+
sender = xputop@yourdomain.com
|
|
168
|
+
recipients = admin@yourdomain.com, ops@yourdomain.com
|
|
169
|
+
subject_prefix = [xputop]
|
|
170
|
+
|
|
171
|
+
# NPU alert rules
|
|
172
|
+
[rule:0]
|
|
173
|
+
metric = temperature
|
|
174
|
+
limit = 80.0
|
|
175
|
+
cooldown = 300
|
|
176
|
+
|
|
177
|
+
[rule:1]
|
|
178
|
+
metric = hbm_usage_percent
|
|
179
|
+
limit = 95.0
|
|
180
|
+
cooldown = 300
|
|
181
|
+
|
|
182
|
+
# System alert rules
|
|
183
|
+
[rule:2]
|
|
184
|
+
metric = cpu_percent
|
|
185
|
+
limit = 95.0
|
|
186
|
+
cooldown = 120
|
|
187
|
+
|
|
188
|
+
[rule:3]
|
|
189
|
+
metric = mem_percent
|
|
190
|
+
limit = 90.0
|
|
191
|
+
cooldown = 300
|
|
192
|
+
|
|
193
|
+
[rule:4]
|
|
194
|
+
metric = disk_percent
|
|
195
|
+
limit = 95.0
|
|
196
|
+
cooldown = 600
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Alert Rule Metrics
|
|
200
|
+
|
|
201
|
+
| Metric | Type | Description | Unit |
|
|
202
|
+
|---------------------|--------|----------------------------------|------|
|
|
203
|
+
| `temperature` | NPU | Device temperature | °C |
|
|
204
|
+
| `power` | NPU | Power consumption | W |
|
|
205
|
+
| `aicore_rate` | NPU | AI Core utilization | % |
|
|
206
|
+
| `hbm_usage_percent` | NPU | HBM memory usage percentage | % |
|
|
207
|
+
| `hbm_used` | NPU | HBM memory used | MiB |
|
|
208
|
+
| `cpu_percent` | System | Overall CPU utilization | % |
|
|
209
|
+
| `mem_percent` | System | RAM usage percentage | % |
|
|
210
|
+
| `disk_percent` | System | Disk usage percentage | % |
|
|
211
|
+
| `swap_percent` | System | Swap usage percentage | % |
|
|
212
|
+
|
|
213
|
+
## Lightweight Design
|
|
214
|
+
|
|
215
|
+
Designed to run alongside model training with minimal overhead:
|
|
216
|
+
|
|
217
|
+
- CPU sampling uses psutil non-blocking mode (`cpu_interval=None`), adding zero extra latency
|
|
218
|
+
- All system metrics are collected in a single call — no extra subprocesses
|
|
219
|
+
- Network I/O detection auto-skips if root privileges are unavailable (no retry)
|
|
220
|
+
- Default 2-second heartbeat; recommended 5–10 seconds for heavy training workloads
|
|
221
|
+
- `npu-smi` calls use a 10-second timeout to prevent hangs
|
|
222
|
+
|
|
223
|
+
## Build & Release
|
|
224
|
+
|
|
225
|
+
See [BUILD.md](BUILD.md) for development setup, building, and publishing instructions.
|
|
226
|
+
|
|
227
|
+
## Requirements
|
|
228
|
+
|
|
229
|
+
- Python >= 3.8
|
|
230
|
+
- Huawei Ascend driver with `npu-smi` in PATH (or use `--demo` mode)
|
|
231
|
+
- `rich` and `psutil` (installed automatically)
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
Apache License 2.0
|
xputop-0.1.0/README.md
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# xputop
|
|
2
|
+
|
|
3
|
+
An interactive Huawei Ascend NPU process viewer and monitor, inspired by [nvitop](https://github.com/XuehaiPan/nvitop).
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Real-time monitoring of Huawei Ascend NPU devices (HBM memory, AI Core utilization, temperature, power)
|
|
8
|
+
- Rich terminal UI with per-device cards, usage bars, and process lists
|
|
9
|
+
- **Fixed process row count** per NPU card to prevent UI jitter (`-p N`, default 3)
|
|
10
|
+
- **Sparkline history curves** (like nvtop) for NPU, CPU, memory, disk, and network metrics
|
|
11
|
+
- **Detail mode** (`-d`) for combined chart + CPU + memory + disk view
|
|
12
|
+
- **Full detail mode** (`-a`) adds network I/O monitoring and system process tree
|
|
13
|
+
- **CPU per-core utilization** panel with load averages
|
|
14
|
+
- **System memory** (RAM + Swap) panel
|
|
15
|
+
- **Disk usage** panel with configurable mount points
|
|
16
|
+
- **Network I/O** panel with rate history (auto-skips if root privileges unavailable)
|
|
17
|
+
- Multi-card support with summary table
|
|
18
|
+
- Configurable heartbeat / refresh interval
|
|
19
|
+
- **Threshold-based email alerting** for both NPU and system metrics
|
|
20
|
+
- Configuration stored in `~/.config/xputop/config.toml` (XDG-compliant)
|
|
21
|
+
- **Chinese help** via `--zh`
|
|
22
|
+
- JSON output mode for scripting and automation
|
|
23
|
+
- Demo mode for development without hardware
|
|
24
|
+
- **Lightweight design** — non-blocking CPU sampling, no extra subprocesses
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install xputop
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Quick Start
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Launch the interactive TUI
|
|
36
|
+
xputop
|
|
37
|
+
|
|
38
|
+
# Detail mode: chart + CPU + memory + disk
|
|
39
|
+
xputop -d
|
|
40
|
+
|
|
41
|
+
# Full detail: + network I/O + process tree
|
|
42
|
+
xputop -a
|
|
43
|
+
|
|
44
|
+
# Show with sparkline curves + CPU + memory + disk
|
|
45
|
+
xputop -C --cpu -m -D
|
|
46
|
+
|
|
47
|
+
# Control process display rows (default 3, 0=hide, -1=show all)
|
|
48
|
+
xputop -p 5 -C
|
|
49
|
+
xputop -p 0 # hide all processes
|
|
50
|
+
xputop -p -1 # show all (may jitter)
|
|
51
|
+
|
|
52
|
+
# Monitor specific disk paths
|
|
53
|
+
xputop -C -D /data /home
|
|
54
|
+
|
|
55
|
+
# Run in demo mode (no Ascend hardware needed)
|
|
56
|
+
xputop --demo -d
|
|
57
|
+
xputop --demo -a
|
|
58
|
+
|
|
59
|
+
# Set custom refresh interval (seconds)
|
|
60
|
+
xputop -i 5
|
|
61
|
+
|
|
62
|
+
# Print a single snapshot and exit
|
|
63
|
+
xputop once --cpu --mem --disk
|
|
64
|
+
|
|
65
|
+
# Print snapshot as JSON
|
|
66
|
+
xputop once --json --cpu --mem --disk /data
|
|
67
|
+
|
|
68
|
+
# Show Chinese help
|
|
69
|
+
xputop --zh
|
|
70
|
+
|
|
71
|
+
# Generate a default configuration file
|
|
72
|
+
xputop config --generate
|
|
73
|
+
|
|
74
|
+
# Send a test alert email
|
|
75
|
+
xputop alert-test
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Command-Line Options
|
|
79
|
+
|
|
80
|
+
| Short | Long | Default | Description |
|
|
81
|
+
|-------|------------------|---------|--------------------------------------------------|
|
|
82
|
+
| `-V` | `--version` | — | Show version and exit |
|
|
83
|
+
| `-i` | `--interval` | 2.0 | Refresh interval in seconds |
|
|
84
|
+
| `-c` | `--config` | — | Path to configuration file |
|
|
85
|
+
| `-C` | `--chart` | off | Enable nvtop-style sparkline history curves |
|
|
86
|
+
| `-l` | `--chart-length` | 120 | Number of history points for sparklines |
|
|
87
|
+
| `-p` | `--processes` | 3 | Process rows per NPU card (0=hide, -1=all) |
|
|
88
|
+
| | `--cpu` | off | Show CPU per-core utilization panel |
|
|
89
|
+
| `-m` | `--mem` | off | Show system memory panel |
|
|
90
|
+
| `-D` | `--disk` | off | Show disk usage panel (optionally specify paths) |
|
|
91
|
+
| `-d` | `--detail` | off | Detail mode = `--chart --cpu --mem --disk` |
|
|
92
|
+
| `-a` | `--detail-all` | off | Full detail = `-d` + network + process tree |
|
|
93
|
+
| | `--demo` | off | Demo mode with fake NPU data |
|
|
94
|
+
| | `--demo-devices` | 4 | Number of simulated NPU devices |
|
|
95
|
+
| | `--zh` | — | Show Chinese help |
|
|
96
|
+
|
|
97
|
+
## Configuration
|
|
98
|
+
|
|
99
|
+
Configuration is stored at `~/.config/xputop/config.toml`. Generate a default:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
xputop config --generate
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
You can also set `XPUTOP_CONFIG_DIR` to use a custom directory.
|
|
106
|
+
|
|
107
|
+
### Example config.toml
|
|
108
|
+
|
|
109
|
+
```ini
|
|
110
|
+
[general]
|
|
111
|
+
interval = 2.0
|
|
112
|
+
demo = false
|
|
113
|
+
demo_devices = 4
|
|
114
|
+
|
|
115
|
+
[display]
|
|
116
|
+
chart = true
|
|
117
|
+
chart_length = 120
|
|
118
|
+
cpu = true
|
|
119
|
+
mem = true
|
|
120
|
+
disk = true
|
|
121
|
+
disk_paths = /, /data
|
|
122
|
+
process_rows = 3
|
|
123
|
+
|
|
124
|
+
[email]
|
|
125
|
+
enabled = true
|
|
126
|
+
smtp_host = smtp.gmail.com
|
|
127
|
+
smtp_port = 587
|
|
128
|
+
use_tls = true
|
|
129
|
+
username = you@gmail.com
|
|
130
|
+
password = your_app_password
|
|
131
|
+
sender = xputop@yourdomain.com
|
|
132
|
+
recipients = admin@yourdomain.com, ops@yourdomain.com
|
|
133
|
+
subject_prefix = [xputop]
|
|
134
|
+
|
|
135
|
+
# NPU alert rules
|
|
136
|
+
[rule:0]
|
|
137
|
+
metric = temperature
|
|
138
|
+
limit = 80.0
|
|
139
|
+
cooldown = 300
|
|
140
|
+
|
|
141
|
+
[rule:1]
|
|
142
|
+
metric = hbm_usage_percent
|
|
143
|
+
limit = 95.0
|
|
144
|
+
cooldown = 300
|
|
145
|
+
|
|
146
|
+
# System alert rules
|
|
147
|
+
[rule:2]
|
|
148
|
+
metric = cpu_percent
|
|
149
|
+
limit = 95.0
|
|
150
|
+
cooldown = 120
|
|
151
|
+
|
|
152
|
+
[rule:3]
|
|
153
|
+
metric = mem_percent
|
|
154
|
+
limit = 90.0
|
|
155
|
+
cooldown = 300
|
|
156
|
+
|
|
157
|
+
[rule:4]
|
|
158
|
+
metric = disk_percent
|
|
159
|
+
limit = 95.0
|
|
160
|
+
cooldown = 600
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Alert Rule Metrics
|
|
164
|
+
|
|
165
|
+
| Metric | Type | Description | Unit |
|
|
166
|
+
|---------------------|--------|----------------------------------|------|
|
|
167
|
+
| `temperature` | NPU | Device temperature | °C |
|
|
168
|
+
| `power` | NPU | Power consumption | W |
|
|
169
|
+
| `aicore_rate` | NPU | AI Core utilization | % |
|
|
170
|
+
| `hbm_usage_percent` | NPU | HBM memory usage percentage | % |
|
|
171
|
+
| `hbm_used` | NPU | HBM memory used | MiB |
|
|
172
|
+
| `cpu_percent` | System | Overall CPU utilization | % |
|
|
173
|
+
| `mem_percent` | System | RAM usage percentage | % |
|
|
174
|
+
| `disk_percent` | System | Disk usage percentage | % |
|
|
175
|
+
| `swap_percent` | System | Swap usage percentage | % |
|
|
176
|
+
|
|
177
|
+
## Lightweight Design
|
|
178
|
+
|
|
179
|
+
Designed to run alongside model training with minimal overhead:
|
|
180
|
+
|
|
181
|
+
- CPU sampling uses psutil non-blocking mode (`cpu_interval=None`), adding zero extra latency
|
|
182
|
+
- All system metrics are collected in a single call — no extra subprocesses
|
|
183
|
+
- Network I/O detection auto-skips if root privileges are unavailable (no retry)
|
|
184
|
+
- Default 2-second heartbeat; recommended 5–10 seconds for heavy training workloads
|
|
185
|
+
- `npu-smi` calls use a 10-second timeout to prevent hangs
|
|
186
|
+
|
|
187
|
+
## Build & Release
|
|
188
|
+
|
|
189
|
+
See [BUILD.md](BUILD.md) for development setup, building, and publishing instructions.
|
|
190
|
+
|
|
191
|
+
## Requirements
|
|
192
|
+
|
|
193
|
+
- Python >= 3.8
|
|
194
|
+
- Huawei Ascend driver with `npu-smi` in PATH (or use `--demo` mode)
|
|
195
|
+
- `rich` and `psutil` (installed automatically)
|
|
196
|
+
|
|
197
|
+
## License
|
|
198
|
+
|
|
199
|
+
Apache License 2.0
|
xputop-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "xputop"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "An interactive Huawei Ascend NPU process viewer and monitor, inspired by nvitop."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "Apache-2.0"}
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Zander Zhao", email = "zhaozhaongrui@mails.ucas.ac.cn"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["huawei", "ascend", "npu", "monitor", "top", "npu-smi", "gpu"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Environment :: Console",
|
|
19
|
+
"Environment :: Console :: Curses",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Intended Audience :: System Administrators",
|
|
22
|
+
"License :: OSI Approved :: Apache Software License",
|
|
23
|
+
"Operating System :: POSIX :: Linux",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.8",
|
|
26
|
+
"Programming Language :: Python :: 3.9",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Topic :: System :: Monitoring",
|
|
31
|
+
"Topic :: Utilities",
|
|
32
|
+
]
|
|
33
|
+
dependencies = [
|
|
34
|
+
"rich>=13.0.0",
|
|
35
|
+
"psutil>=5.9.0",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=7.0",
|
|
41
|
+
"build",
|
|
42
|
+
"twine",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
Homepage = "https://github.com/ZanderZhao/xputop"
|
|
47
|
+
Repository = "https://github.com/ZanderZhao/xputop"
|
|
48
|
+
Issues = "https://github.com/ZanderZhao/xputop/issues"
|
|
49
|
+
|
|
50
|
+
[project.scripts]
|
|
51
|
+
xputop = "xputop.cli:main"
|
|
52
|
+
|
|
53
|
+
[tool.setuptools.packages.find]
|
|
54
|
+
include = ["xputop*"]
|
xputop-0.1.0/setup.cfg
ADDED
xputop-0.1.0/tests/test_amd.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Tests for the AMD rocm-smi / amd-smi backend parsing."""
|
|
2
|
+
|
|
3
|
+
from xputop.core.backends.amd import AmdBackend
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_rocm_smi_regex(monkeypatch):
    """Feed canned plain-text ``rocm-smi`` output to the backend and check GPU 0."""
    amd = AmdBackend()
    # Select the rocm-smi command explicitly for this test.
    amd._cmd = "rocm-smi"
    amd._is_amd_smi = False

    # Two-GPU report in the "GPU[n]: <label>: <value>" layout.
    fake_output = """
============================ ROCm System Management Interface ============================
====================================== System Info =======================================
GPU[0]: Temperature (Sensor edge) (C): 65.0
GPU[0]: GPU use (%): 99
GPU[0]: Average Graphics Package Power (W): 300.5
GPU[0]: VRAM Total Memory (B): 17163091968
GPU[0]: VRAM Total Used Memory (B): 8581545984
GPU[1]: Temperature (Sensor edge) (C): 62.0
GPU[1]: GPU use (%): 45
GPU[1]: Average Graphics Package Power (W): 150.2
GPU[1]: VRAM Total Memory (B): 17163091968
GPU[1]: VRAM Total Used Memory (B): 2000000000
=========================================================================================
"""

    class _FakeCompleted:
        # Only the two attributes this test supplies: exit status and stdout.
        returncode = 0
        stdout = fake_output

    def fake_run(*args, **kwargs):
        # Accept any call signature and always hand back the canned report.
        return _FakeCompleted()

    monkeypatch.setattr("subprocess.run", fake_run)

    devices, _driver, err = amd._collect_rocm_smi()
    assert err == ""
    assert len(devices) == 2

    first = devices[0]
    assert first.device_id == 0
    assert first.temperature == 65.0
    assert first.power == 300.5
    assert first.utilization_rate == 99.0
    # 17163091968 B / 2**20 = 16368 and 8581545984 B / 2**20 = 8184.
    assert abs(first.mem_total - 16368.0) < 1.0
    assert abs(first.mem_used - 8184.0) < 1.0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_amd_smi_csv(monkeypatch):
    """Feed canned CSV ``amd-smi`` output to the backend and check GPU 1."""
    amd = AmdBackend()
    # Select the amd-smi command explicitly for this test.
    amd._cmd = "amd-smi"
    amd._is_amd_smi = True

    # Header row followed by one CSV row per GPU.
    fake_output = """gpu,temperature (c),power (w),usage (%),vram total (mb),vram used (mb)
0,75.0,250.0,100,16384,16384
1,45.0,80.0,0,16384,1024
"""

    class _FakeCompleted:
        # Only the two attributes this test supplies: exit status and stdout.
        returncode = 0
        stdout = fake_output

    def fake_run(*args, **kwargs):
        # Accept any call signature and always hand back the canned CSV.
        return _FakeCompleted()

    monkeypatch.setattr("subprocess.run", fake_run)

    devices, _driver, err = amd._collect_amd_smi()
    assert err == ""
    assert len(devices) == 2

    second = devices[1]
    assert second.device_id == 1
    assert second.temperature == 45.0
    assert second.power == 80.0
    assert second.utilization_rate == 0.0
    assert second.mem_total == 16384.0
    assert second.mem_used == 1024.0
|