vllmpytop 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllmpytop-0.1.0/LICENSE +21 -0
- vllmpytop-0.1.0/PKG-INFO +156 -0
- vllmpytop-0.1.0/README.md +122 -0
- vllmpytop-0.1.0/pyproject.toml +50 -0
- vllmpytop-0.1.0/setup.cfg +4 -0
- vllmpytop-0.1.0/tests/test_prometheus.py +42 -0
- vllmpytop-0.1.0/tests/test_state.py +86 -0
- vllmpytop-0.1.0/vllmpytop.egg-info/PKG-INFO +156 -0
- vllmpytop-0.1.0/vllmpytop.egg-info/SOURCES.txt +25 -0
- vllmpytop-0.1.0/vllmpytop.egg-info/dependency_links.txt +1 -0
- vllmpytop-0.1.0/vllmpytop.egg-info/entry_points.txt +2 -0
- vllmpytop-0.1.0/vllmpytop.egg-info/requires.txt +7 -0
- vllmpytop-0.1.0/vllmpytop.egg-info/top_level.txt +1 -0
- vllmpytop-0.1.0/vllmtop/__init__.py +3 -0
- vllmpytop-0.1.0/vllmtop/__main__.py +4 -0
- vllmpytop-0.1.0/vllmtop/cli.py +132 -0
- vllmpytop-0.1.0/vllmtop/collectors/__init__.py +0 -0
- vllmpytop-0.1.0/vllmtop/collectors/gpu.py +95 -0
- vllmpytop-0.1.0/vllmtop/collectors/vllm.py +127 -0
- vllmpytop-0.1.0/vllmtop/config.py +34 -0
- vllmpytop-0.1.0/vllmtop/state.py +298 -0
- vllmpytop-0.1.0/vllmtop/ui/__init__.py +0 -0
- vllmpytop-0.1.0/vllmtop/ui/app.py +232 -0
- vllmpytop-0.1.0/vllmtop/ui/layout.py +85 -0
- vllmpytop-0.1.0/vllmtop/ui/panels.py +352 -0
- vllmpytop-0.1.0/vllmtop/ui/theme.py +254 -0
- vllmpytop-0.1.0/vllmtop/ui/widgets.py +150 -0
vllmpytop-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Theo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
vllmpytop-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vllmpytop
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A btop-style terminal UI for monitoring a vLLM instance and its GPU in real time.
|
|
5
|
+
Author-email: Theodore Kirby <theo@kirby.dev>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/theo-kirby/vllmtop
|
|
8
|
+
Project-URL: Repository, https://github.com/theo-kirby/vllmtop
|
|
9
|
+
Project-URL: Issues, https://github.com/theo-kirby/vllmtop/issues
|
|
10
|
+
Keywords: vllm,gpu,monitoring,tui,terminal,btop,nvml,prometheus,llm
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console :: Curses
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: System Administrators
|
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: System :: Monitoring
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: nvidia-ml-py>=12.0
|
|
28
|
+
Requires-Dist: prometheus-client>=0.20
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
31
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
32
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# vllmtop
|
|
36
|
+
|
|
37
|
+
A **btop-style terminal UI** for monitoring a running [vLLM](https://github.com/vllm-project/vllm)
|
|
38
|
+
instance and its GPU in real time. Hand-rolled braille charts, a responsive
|
|
39
|
+
curses layout, and a non-blocking background poller so the UI never stalls on
|
|
40
|
+
network or NVML latency.
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
╭─┐¹ gpu ┌──────────────────────────────────────────────────────────────────╮
|
|
44
|
+
│ NVIDIA GeForce RTX 5090 util 86% 50°C 319/600W SM 2857MHz fan 31% │
|
|
45
|
+
│ ⣿⣿⣿⣿ … braille utilisation chart … │
|
|
46
|
+
│ VRAM 27.3GB/31.8GB ████████████████████░░░ 86% │
|
|
47
|
+
│ PWR ████████████░░░░░░░░░░░░░░░░░░░░░░░░░░ 53% │
|
|
48
|
+
╰─────────────────────────────────────────────────────────────────────────────╯
|
|
49
|
+
╭─┐² throughput ┌────────────╮╭─┐³ requests ┌──────────────╮
|
|
50
|
+
│ gen 149 tok/s ⣀⣤⣶⣿ ││ running ████░░ 1 │
|
|
51
|
+
│ prompt 3.5k tok/s ││ waiting ██████ 2 │
|
|
52
|
+
╰─┘ tok/s └──────────────────╯╰─┘ queue └───────────────────╯
|
|
53
|
+
╭─┐⁴ latency ┌───────────────╮╭─┐⁵ cache ┌──────────────────╮
|
|
54
|
+
│ TTFT 964ms TPOT 6ms … ││ KV █░░░ 3% prefix 0.0% │
|
|
55
|
+
╰─┘ recent avg └─────────────╯╰─────────────────────────────╯
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Rounded corners, superscript panel numbers in the title tabs, and a secondary
|
|
59
|
+
label on the bottom edge — matching [btop](https://github.com/aristocratos/btop)'s box style.
|
|
60
|
+
|
|
61
|
+
## What it shows
|
|
62
|
+
|
|
63
|
+
- **GPU** (via NVML / `pynvml`): utilisation %, VRAM used/total, temperature,
|
|
64
|
+
power draw vs. limit, SM clock, fan — with green/yellow/red thresholds.
|
|
65
|
+
- **Throughput**: generation tok/s and prompt tok/s (rates derived from vLLM
|
|
66
|
+
counters), as big numbers + braille charts.
|
|
67
|
+
- **Requests / Queue**: running vs. waiting requests and preemptions.
|
|
68
|
+
- **Latency** (recent average over the last poll interval — far more useful live
|
|
69
|
+
than the cumulative average): TTFT, inter-token (TPOT), end-to-end, queue time.
|
|
70
|
+
- **Cache**: KV-cache usage % and prefix-cache hit rate.
|
|
71
|
+
|
|
72
|
+
Data comes from vLLM's Prometheus `/metrics` endpoint plus in-process NVML
|
|
73
|
+
polling. If vLLM goes away (e.g. a container restart) the UI shows a disconnect
|
|
74
|
+
banner and keeps the GPU panel live, then reconnects automatically.
|
|
75
|
+
|
|
76
|
+
## Install
|
|
77
|
+
|
|
78
|
+
Requires Python 3.10+ on Linux (curses is stdlib). A working NVIDIA driver is
|
|
79
|
+
needed for the GPU panel.
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install .
|
|
83
|
+
# or, for development:
|
|
84
|
+
pip install -e ".[dev]"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Dependencies: `nvidia-ml-py` (NVML bindings) and `prometheus-client` (exposition
|
|
88
|
+
parser). The `/metrics` fetch uses stdlib `urllib`.
|
|
89
|
+
|
|
90
|
+
## Usage
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
vllmtop # monitor http://localhost:8000
|
|
94
|
+
vllmtop --url http://host:8000 # a remote vLLM server
|
|
95
|
+
vllmtop --interval 0.5 # poll twice a second
|
|
96
|
+
vllmtop --no-gpu # skip the GPU panel
|
|
97
|
+
python -m vllmtop # same thing, without the entry point
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
The server URL can also be set via the `VLLMTOP_URL` environment variable.
|
|
101
|
+
|
|
102
|
+
### Options
|
|
103
|
+
|
|
104
|
+
| Flag | Default | Description |
|
|
105
|
+
|------|---------|-------------|
|
|
106
|
+
| `--url` | `http://localhost:8000` | vLLM base URL (env `VLLMTOP_URL`) |
|
|
107
|
+
| `--interval` | `1.0` | poll interval in seconds |
|
|
108
|
+
| `--gpu-index` | `0` | NVML GPU index |
|
|
109
|
+
| `--no-gpu` | off | disable the GPU panel |
|
|
110
|
+
| `--dump-json` | off | collect two snapshots an interval apart (so rates are populated), print JSON, exit (no TTY) |
|
|
111
|
+
|
|
112
|
+
### Keybindings
|
|
113
|
+
|
|
114
|
+
| Key | Action |
|
|
115
|
+
|-----|--------|
|
|
116
|
+
| `q` / `Esc` | quit |
|
|
117
|
+
| `+` / `-` | faster / slower refresh |
|
|
118
|
+
| `p` | pause / resume polling |
|
|
119
|
+
| `1`–`5` | toggle a panel on/off (¹gpu ²throughput ³requests ⁴latency ⁵cache) |
|
|
120
|
+
| `h` / `?` | toggle help overlay |
|
|
121
|
+
|
|
122
|
+
Each panel's title carries a superscript number (btop-style) showing the key
|
|
123
|
+
that toggles it. Hiding panels reflows the rest to fill the freed space.
|
|
124
|
+
|
|
125
|
+
### Headless smoke test
|
|
126
|
+
|
|
127
|
+
`--dump-json` collects two snapshots an interval apart (so rates are populated),
|
|
128
|
+
prints the result as JSON, and exits. Works without a TTY — handy for CI or
|
|
129
|
+
verifying connectivity:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
python -m vllmtop --dump-json --url http://localhost:8000
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## How it works
|
|
136
|
+
|
|
137
|
+
- A **background poller thread** scrapes `/metrics` and polls NVML every
|
|
138
|
+
`interval` seconds, storing the latest combined snapshot under a lock. This
|
|
139
|
+
keeps all I/O latency off the render path.
|
|
140
|
+
- The **UI loop** wakes on a short tick (250 ms), reads the latest snapshot,
|
|
141
|
+
appends derived values (rates, recent-average latencies) to per-series ring
|
|
142
|
+
buffers, and redraws — so render cadence is independent of poll cadence.
|
|
143
|
+
- **Counters → rates**: `Δvalue / Δt`, guarded against `Δt ≤ 0` and counter
|
|
144
|
+
resets. **Histograms → recent average**: `Δsum / Δcount` between polls.
|
|
145
|
+
- **Braille charts**: each cell is a 2×4 Unicode braille dot matrix, giving
|
|
146
|
+
`2w × 4h`-dot resolution for the smooth btop look.
|
|
147
|
+
|
|
148
|
+
## Development
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
pytest # parser-against-fixture, rate math, braille rendering
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# vllmtop
|
|
2
|
+
|
|
3
|
+
A **btop-style terminal UI** for monitoring a running [vLLM](https://github.com/vllm-project/vllm)
|
|
4
|
+
instance and its GPU in real time. Hand-rolled braille charts, a responsive
|
|
5
|
+
curses layout, and a non-blocking background poller so the UI never stalls on
|
|
6
|
+
network or NVML latency.
|
|
7
|
+
|
|
8
|
+
```
|
|
9
|
+
╭─┐¹ gpu ┌──────────────────────────────────────────────────────────────────╮
|
|
10
|
+
│ NVIDIA GeForce RTX 5090 util 86% 50°C 319/600W SM 2857MHz fan 31% │
|
|
11
|
+
│ ⣿⣿⣿⣿ … braille utilisation chart … │
|
|
12
|
+
│ VRAM 27.3GB/31.8GB ████████████████████░░░ 86% │
|
|
13
|
+
│ PWR ████████████░░░░░░░░░░░░░░░░░░░░░░░░░░ 53% │
|
|
14
|
+
╰─────────────────────────────────────────────────────────────────────────────╯
|
|
15
|
+
╭─┐² throughput ┌────────────╮╭─┐³ requests ┌──────────────╮
|
|
16
|
+
│ gen 149 tok/s ⣀⣤⣶⣿ ││ running ████░░ 1 │
|
|
17
|
+
│ prompt 3.5k tok/s ││ waiting ██████ 2 │
|
|
18
|
+
╰─┘ tok/s └──────────────────╯╰─┘ queue └───────────────────╯
|
|
19
|
+
╭─┐⁴ latency ┌───────────────╮╭─┐⁵ cache ┌──────────────────╮
|
|
20
|
+
│ TTFT 964ms TPOT 6ms … ││ KV █░░░ 3% prefix 0.0% │
|
|
21
|
+
╰─┘ recent avg └─────────────╯╰─────────────────────────────╯
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Rounded corners, superscript panel numbers in the title tabs, and a secondary
|
|
25
|
+
label on the bottom edge — matching [btop](https://github.com/aristocratos/btop)'s box style.
|
|
26
|
+
|
|
27
|
+
## What it shows
|
|
28
|
+
|
|
29
|
+
- **GPU** (via NVML / `pynvml`): utilisation %, VRAM used/total, temperature,
|
|
30
|
+
power draw vs. limit, SM clock, fan — with green/yellow/red thresholds.
|
|
31
|
+
- **Throughput**: generation tok/s and prompt tok/s (rates derived from vLLM
|
|
32
|
+
counters), as big numbers + braille charts.
|
|
33
|
+
- **Requests / Queue**: running vs. waiting requests and preemptions.
|
|
34
|
+
- **Latency** (recent average over the last poll interval — far more useful live
|
|
35
|
+
than the cumulative average): TTFT, inter-token (TPOT), end-to-end, queue time.
|
|
36
|
+
- **Cache**: KV-cache usage % and prefix-cache hit rate.
|
|
37
|
+
|
|
38
|
+
Data comes from vLLM's Prometheus `/metrics` endpoint plus in-process NVML
|
|
39
|
+
polling. If vLLM goes away (e.g. a container restart) the UI shows a disconnect
|
|
40
|
+
banner and keeps the GPU panel live, then reconnects automatically.
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
Requires Python 3.10+ on Linux (curses is stdlib). A working NVIDIA driver is
|
|
45
|
+
needed for the GPU panel.
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install .
|
|
49
|
+
# or, for development:
|
|
50
|
+
pip install -e ".[dev]"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Dependencies: `nvidia-ml-py` (NVML bindings) and `prometheus-client` (exposition
|
|
54
|
+
parser). The `/metrics` fetch uses stdlib `urllib`.
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
vllmtop # monitor http://localhost:8000
|
|
60
|
+
vllmtop --url http://host:8000 # a remote vLLM server
|
|
61
|
+
vllmtop --interval 0.5 # poll twice a second
|
|
62
|
+
vllmtop --no-gpu # skip the GPU panel
|
|
63
|
+
python -m vllmtop # same thing, without the entry point
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
The server URL can also be set via the `VLLMTOP_URL` environment variable.
|
|
67
|
+
|
|
68
|
+
### Options
|
|
69
|
+
|
|
70
|
+
| Flag | Default | Description |
|
|
71
|
+
|------|---------|-------------|
|
|
72
|
+
| `--url` | `http://localhost:8000` | vLLM base URL (env `VLLMTOP_URL`) |
|
|
73
|
+
| `--interval` | `1.0` | poll interval in seconds |
|
|
74
|
+
| `--gpu-index` | `0` | NVML GPU index |
|
|
75
|
+
| `--no-gpu` | off | disable the GPU panel |
|
|
76
|
+
| `--dump-json` | off | collect two snapshots an interval apart (so rates are populated), print JSON, exit (no TTY) |
|
|
77
|
+
|
|
78
|
+
### Keybindings
|
|
79
|
+
|
|
80
|
+
| Key | Action |
|
|
81
|
+
|-----|--------|
|
|
82
|
+
| `q` / `Esc` | quit |
|
|
83
|
+
| `+` / `-` | faster / slower refresh |
|
|
84
|
+
| `p` | pause / resume polling |
|
|
85
|
+
| `1`–`5` | toggle a panel on/off (¹gpu ²throughput ³requests ⁴latency ⁵cache) |
|
|
86
|
+
| `h` / `?` | toggle help overlay |
|
|
87
|
+
|
|
88
|
+
Each panel's title carries a superscript number (btop-style) showing the key
|
|
89
|
+
that toggles it. Hiding panels reflows the rest to fill the freed space.
|
|
90
|
+
|
|
91
|
+
### Headless smoke test
|
|
92
|
+
|
|
93
|
+
`--dump-json` collects two snapshots an interval apart (so rates are populated),
|
|
94
|
+
prints the result as JSON, and exits. Works without a TTY — handy for CI or
|
|
95
|
+
verifying connectivity:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
python -m vllmtop --dump-json --url http://localhost:8000
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## How it works
|
|
102
|
+
|
|
103
|
+
- A **background poller thread** scrapes `/metrics` and polls NVML every
|
|
104
|
+
`interval` seconds, storing the latest combined snapshot under a lock. This
|
|
105
|
+
keeps all I/O latency off the render path.
|
|
106
|
+
- The **UI loop** wakes on a short tick (250 ms), reads the latest snapshot,
|
|
107
|
+
appends derived values (rates, recent-average latencies) to per-series ring
|
|
108
|
+
buffers, and redraws — so render cadence is independent of poll cadence.
|
|
109
|
+
- **Counters → rates**: `Δvalue / Δt`, guarded against `Δt ≤ 0` and counter
|
|
110
|
+
resets. **Histograms → recent average**: `Δsum / Δcount` between polls.
|
|
111
|
+
- **Braille charts**: each cell is a 2×4 Unicode braille dot matrix, giving
|
|
112
|
+
`2w × 4h`-dot resolution for the smooth btop look.
|
|
113
|
+
|
|
114
|
+
## Development
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
pytest # parser-against-fixture, rate math, braille rendering
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## License
|
|
121
|
+
|
|
122
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vllmpytop"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A btop-style terminal UI for monitoring a vLLM instance and its GPU in real time."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
license-files = ["LICENSE"]
|
|
12
|
+
requires-python = ">=3.10"
|
|
13
|
+
authors = [{ name = "Theodore Kirby", email = "theo@kirby.dev" }]
|
|
14
|
+
keywords = ["vllm", "gpu", "monitoring", "tui", "terminal", "btop", "nvml", "prometheus", "llm"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Environment :: Console :: Curses",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: System Administrators",
|
|
20
|
+
"Operating System :: POSIX :: Linux",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Programming Language :: Python :: 3.13",
|
|
27
|
+
"Topic :: System :: Monitoring",
|
|
28
|
+
"Topic :: Utilities",
|
|
29
|
+
]
|
|
30
|
+
dependencies = [
|
|
31
|
+
"nvidia-ml-py>=12.0",
|
|
32
|
+
"prometheus-client>=0.20",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
dev = ["pytest>=8.0", "build>=1.0", "twine>=5.0"]
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/theo-kirby/vllmtop"
|
|
40
|
+
Repository = "https://github.com/theo-kirby/vllmtop"
|
|
41
|
+
Issues = "https://github.com/theo-kirby/vllmtop/issues"
|
|
42
|
+
|
|
43
|
+
[project.scripts]
|
|
44
|
+
vllmtop = "vllmtop.cli:main"
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
include = ["vllmtop*"]
|
|
48
|
+
|
|
49
|
+
[tool.pytest.ini_options]
|
|
50
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import math
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from vllmtop.collectors.vllm import parse_metrics
|
|
5
|
+
|
|
6
|
+
FIXTURE = Path(__file__).parent / "metrics_fixture.txt"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _snap():
|
|
10
|
+
return parse_metrics(FIXTURE.read_text())
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_reachable_and_model():
|
|
14
|
+
snap = _snap()
|
|
15
|
+
assert snap.reachable is True
|
|
16
|
+
assert snap.model_name == "Qwen/Qwen3.6-35B-A3B"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_scalar_values():
|
|
20
|
+
snap = _snap()
|
|
21
|
+
assert snap.num_requests_running == 0.0
|
|
22
|
+
assert snap.num_requests_waiting == 0.0
|
|
23
|
+
assert snap.kv_cache_usage_perc == 0.0
|
|
24
|
+
# 5.7351109e+07 from the fixture.
|
|
25
|
+
assert snap.prompt_tokens_total == 57351109.0
|
|
26
|
+
assert snap.generation_tokens_total == 1799939.0
|
|
27
|
+
assert snap.num_preemptions_total == 0.0
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_histograms_parsed():
|
|
31
|
+
snap = _snap()
|
|
32
|
+
# Histograms exist with a +Inf bucket and consistent count.
|
|
33
|
+
for hist in (snap.ttft, snap.inter_token, snap.e2e, snap.queue_time):
|
|
34
|
+
assert math.inf in hist.buckets
|
|
35
|
+
# The +Inf bucket count equals the total observation count.
|
|
36
|
+
assert hist.buckets[math.inf] == hist.count
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_prefix_cache_hit_rate_guarded():
|
|
40
|
+
snap = _snap()
|
|
41
|
+
# queries_total is 0 in the fixture -> guarded to 0, no ZeroDivisionError.
|
|
42
|
+
assert snap.prefix_cache_hit_rate == 0.0
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import math
|
|
2
|
+
|
|
3
|
+
from vllmtop.config import HISTORY_LEN
|
|
4
|
+
from vllmtop.state import (
|
|
5
|
+
GpuSnapshot,
|
|
6
|
+
Histogram,
|
|
7
|
+
History,
|
|
8
|
+
Series,
|
|
9
|
+
Snapshot,
|
|
10
|
+
VllmSnapshot,
|
|
11
|
+
compute_rate,
|
|
12
|
+
histogram_quantile,
|
|
13
|
+
histogram_recent_avg,
|
|
14
|
+
)
|
|
15
|
+
from vllmtop.ui.widgets import braille_chart
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_compute_rate_basic():
|
|
19
|
+
assert compute_rate(100.0, 10.0, 200.0, 12.0) == 50.0
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_compute_rate_guards():
|
|
23
|
+
# Non-positive dt -> 0.
|
|
24
|
+
assert compute_rate(100.0, 10.0, 200.0, 10.0) == 0.0
|
|
25
|
+
# Counter reset (value decreased) -> 0, not negative.
|
|
26
|
+
assert compute_rate(200.0, 10.0, 50.0, 12.0) == 0.0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_histogram_recent_avg():
|
|
30
|
+
prev = Histogram(count=10.0, sum=5.0)
|
|
31
|
+
cur = Histogram(count=14.0, sum=13.0)
|
|
32
|
+
# (13-5)/(14-10) = 2.0
|
|
33
|
+
assert histogram_recent_avg(prev, cur) == 2.0
|
|
34
|
+
# No new observations -> 0.
|
|
35
|
+
assert histogram_recent_avg(cur, cur) == 0.0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_histogram_quantile():
|
|
39
|
+
# Buckets le=1 -> 0, le=2 -> 0, le=5 -> 10 new obs in window.
|
|
40
|
+
prev = Histogram(count=0.0, buckets={1.0: 0.0, 2.0: 0.0, 5.0: 0.0, math.inf: 0.0})
|
|
41
|
+
cur = Histogram(count=10.0, buckets={1.0: 0.0, 2.0: 0.0, 5.0: 10.0, math.inf: 10.0})
|
|
42
|
+
q = histogram_quantile(prev, cur, 0.5)
|
|
43
|
+
# All mass in (2, 5]; median interpolates inside that bucket.
|
|
44
|
+
assert 2.0 <= q <= 5.0
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_series_ring_buffer():
|
|
48
|
+
s = Series(maxlen=3)
|
|
49
|
+
for i in range(5):
|
|
50
|
+
s.append(i)
|
|
51
|
+
assert s.values() == [2, 3, 4]
|
|
52
|
+
assert s.last == 4
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _snap(t, gen_total, running, gpu_util):
|
|
56
|
+
return Snapshot(
|
|
57
|
+
monotonic=t,
|
|
58
|
+
vllm=VllmSnapshot(
|
|
59
|
+
reachable=True,
|
|
60
|
+
generation_tokens_total=gen_total,
|
|
61
|
+
num_requests_running=running,
|
|
62
|
+
),
|
|
63
|
+
gpu=GpuSnapshot(available=True, util_gpu=gpu_util),
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_history_rate_from_two_samples():
|
|
68
|
+
h = History(HISTORY_LEN)
|
|
69
|
+
h.update(_snap(0.0, 1000.0, 1, 10.0))
|
|
70
|
+
h.update(_snap(2.0, 1200.0, 2, 20.0))
|
|
71
|
+
# (1200-1000)/2 = 100 tok/s
|
|
72
|
+
assert h.derived["gen_tok_s"] == 100.0
|
|
73
|
+
assert h.derived["running"] == 2.0
|
|
74
|
+
assert h.derived["gpu_util"] == 20.0
|
|
75
|
+
assert h.series["gen_tok_s"].values() == [0.0, 100.0]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def test_braille_chart_shape():
|
|
79
|
+
series = [math.sin(i / 5.0) for i in range(100)]
|
|
80
|
+
w, height = 20, 4
|
|
81
|
+
rows = braille_chart(series, w, height, 0.0, 1.0)
|
|
82
|
+
assert len(rows) == height
|
|
83
|
+
for row in rows:
|
|
84
|
+
assert len(row) == w
|
|
85
|
+
# Every cell is a braille glyph in the U+2800 block.
|
|
86
|
+
assert all(0x2800 <= ord(ch) <= 0x28FF for ch in row)
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vllmpytop
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A btop-style terminal UI for monitoring a vLLM instance and its GPU in real time.
|
|
5
|
+
Author-email: Theodore Kirby <theo@kirby.dev>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/theo-kirby/vllmtop
|
|
8
|
+
Project-URL: Repository, https://github.com/theo-kirby/vllmtop
|
|
9
|
+
Project-URL: Issues, https://github.com/theo-kirby/vllmtop/issues
|
|
10
|
+
Keywords: vllm,gpu,monitoring,tui,terminal,btop,nvml,prometheus,llm
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console :: Curses
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: System Administrators
|
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: System :: Monitoring
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: nvidia-ml-py>=12.0
|
|
28
|
+
Requires-Dist: prometheus-client>=0.20
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
31
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
32
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# vllmtop
|
|
36
|
+
|
|
37
|
+
A **btop-style terminal UI** for monitoring a running [vLLM](https://github.com/vllm-project/vllm)
|
|
38
|
+
instance and its GPU in real time. Hand-rolled braille charts, a responsive
|
|
39
|
+
curses layout, and a non-blocking background poller so the UI never stalls on
|
|
40
|
+
network or NVML latency.
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
╭─┐¹ gpu ┌──────────────────────────────────────────────────────────────────╮
|
|
44
|
+
│ NVIDIA GeForce RTX 5090 util 86% 50°C 319/600W SM 2857MHz fan 31% │
|
|
45
|
+
│ ⣿⣿⣿⣿ … braille utilisation chart … │
|
|
46
|
+
│ VRAM 27.3GB/31.8GB ████████████████████░░░ 86% │
|
|
47
|
+
│ PWR ████████████░░░░░░░░░░░░░░░░░░░░░░░░░░ 53% │
|
|
48
|
+
╰─────────────────────────────────────────────────────────────────────────────╯
|
|
49
|
+
╭─┐² throughput ┌────────────╮╭─┐³ requests ┌──────────────╮
|
|
50
|
+
│ gen 149 tok/s ⣀⣤⣶⣿ ││ running ████░░ 1 │
|
|
51
|
+
│ prompt 3.5k tok/s ││ waiting ██████ 2 │
|
|
52
|
+
╰─┘ tok/s └──────────────────╯╰─┘ queue └───────────────────╯
|
|
53
|
+
╭─┐⁴ latency ┌───────────────╮╭─┐⁵ cache ┌──────────────────╮
|
|
54
|
+
│ TTFT 964ms TPOT 6ms … ││ KV █░░░ 3% prefix 0.0% │
|
|
55
|
+
╰─┘ recent avg └─────────────╯╰─────────────────────────────╯
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Rounded corners, superscript panel numbers in the title tabs, and a secondary
|
|
59
|
+
label on the bottom edge — matching [btop](https://github.com/aristocratos/btop)'s box style.
|
|
60
|
+
|
|
61
|
+
## What it shows
|
|
62
|
+
|
|
63
|
+
- **GPU** (via NVML / `pynvml`): utilisation %, VRAM used/total, temperature,
|
|
64
|
+
power draw vs. limit, SM clock, fan — with green/yellow/red thresholds.
|
|
65
|
+
- **Throughput**: generation tok/s and prompt tok/s (rates derived from vLLM
|
|
66
|
+
counters), as big numbers + braille charts.
|
|
67
|
+
- **Requests / Queue**: running vs. waiting requests and preemptions.
|
|
68
|
+
- **Latency** (recent average over the last poll interval — far more useful live
|
|
69
|
+
than the cumulative average): TTFT, inter-token (TPOT), end-to-end, queue time.
|
|
70
|
+
- **Cache**: KV-cache usage % and prefix-cache hit rate.
|
|
71
|
+
|
|
72
|
+
Data comes from vLLM's Prometheus `/metrics` endpoint plus in-process NVML
|
|
73
|
+
polling. If vLLM goes away (e.g. a container restart) the UI shows a disconnect
|
|
74
|
+
banner and keeps the GPU panel live, then reconnects automatically.
|
|
75
|
+
|
|
76
|
+
## Install
|
|
77
|
+
|
|
78
|
+
Requires Python 3.10+ on Linux (curses is stdlib). A working NVIDIA driver is
|
|
79
|
+
needed for the GPU panel.
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install .
|
|
83
|
+
# or, for development:
|
|
84
|
+
pip install -e ".[dev]"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Dependencies: `nvidia-ml-py` (NVML bindings) and `prometheus-client` (exposition
|
|
88
|
+
parser). The `/metrics` fetch uses stdlib `urllib`.
|
|
89
|
+
|
|
90
|
+
## Usage
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
vllmtop # monitor http://localhost:8000
|
|
94
|
+
vllmtop --url http://host:8000 # a remote vLLM server
|
|
95
|
+
vllmtop --interval 0.5 # poll twice a second
|
|
96
|
+
vllmtop --no-gpu # skip the GPU panel
|
|
97
|
+
python -m vllmtop # same thing, without the entry point
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
The server URL can also be set via the `VLLMTOP_URL` environment variable.
|
|
101
|
+
|
|
102
|
+
### Options
|
|
103
|
+
|
|
104
|
+
| Flag | Default | Description |
|
|
105
|
+
|------|---------|-------------|
|
|
106
|
+
| `--url` | `http://localhost:8000` | vLLM base URL (env `VLLMTOP_URL`) |
|
|
107
|
+
| `--interval` | `1.0` | poll interval in seconds |
|
|
108
|
+
| `--gpu-index` | `0` | NVML GPU index |
|
|
109
|
+
| `--no-gpu` | off | disable the GPU panel |
|
|
110
|
+
| `--dump-json` | off | collect two snapshots an interval apart (so rates are populated), print JSON, exit (no TTY) |
|
|
111
|
+
|
|
112
|
+
### Keybindings
|
|
113
|
+
|
|
114
|
+
| Key | Action |
|
|
115
|
+
|-----|--------|
|
|
116
|
+
| `q` / `Esc` | quit |
|
|
117
|
+
| `+` / `-` | faster / slower refresh |
|
|
118
|
+
| `p` | pause / resume polling |
|
|
119
|
+
| `1`–`5` | toggle a panel on/off (¹gpu ²throughput ³requests ⁴latency ⁵cache) |
|
|
120
|
+
| `h` / `?` | toggle help overlay |
|
|
121
|
+
|
|
122
|
+
Each panel's title carries a superscript number (btop-style) showing the key
|
|
123
|
+
that toggles it. Hiding panels reflows the rest to fill the freed space.
|
|
124
|
+
|
|
125
|
+
### Headless smoke test
|
|
126
|
+
|
|
127
|
+
`--dump-json` collects two snapshots an interval apart (so rates are populated),
|
|
128
|
+
prints the result as JSON, and exits. Works without a TTY — handy for CI or
|
|
129
|
+
verifying connectivity:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
python -m vllmtop --dump-json --url http://localhost:8000
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## How it works
|
|
136
|
+
|
|
137
|
+
- A **background poller thread** scrapes `/metrics` and polls NVML every
|
|
138
|
+
`interval` seconds, storing the latest combined snapshot under a lock. This
|
|
139
|
+
keeps all I/O latency off the render path.
|
|
140
|
+
- The **UI loop** wakes on a short tick (250 ms), reads the latest snapshot,
|
|
141
|
+
appends derived values (rates, recent-average latencies) to per-series ring
|
|
142
|
+
buffers, and redraws — so render cadence is independent of poll cadence.
|
|
143
|
+
- **Counters → rates**: `Δvalue / Δt`, guarded against `Δt ≤ 0` and counter
|
|
144
|
+
resets. **Histograms → recent average**: `Δsum / Δcount` between polls.
|
|
145
|
+
- **Braille charts**: each cell is a 2×4 Unicode braille dot matrix, giving
|
|
146
|
+
`2w × 4h`-dot resolution for the smooth btop look.
|
|
147
|
+
|
|
148
|
+
## Development
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
pytest # parser-against-fixture, rate math, braille rendering
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
tests/test_prometheus.py
|
|
5
|
+
tests/test_state.py
|
|
6
|
+
vllmpytop.egg-info/PKG-INFO
|
|
7
|
+
vllmpytop.egg-info/SOURCES.txt
|
|
8
|
+
vllmpytop.egg-info/dependency_links.txt
|
|
9
|
+
vllmpytop.egg-info/entry_points.txt
|
|
10
|
+
vllmpytop.egg-info/requires.txt
|
|
11
|
+
vllmpytop.egg-info/top_level.txt
|
|
12
|
+
vllmtop/__init__.py
|
|
13
|
+
vllmtop/__main__.py
|
|
14
|
+
vllmtop/cli.py
|
|
15
|
+
vllmtop/config.py
|
|
16
|
+
vllmtop/state.py
|
|
17
|
+
vllmtop/collectors/__init__.py
|
|
18
|
+
vllmtop/collectors/gpu.py
|
|
19
|
+
vllmtop/collectors/vllm.py
|
|
20
|
+
vllmtop/ui/__init__.py
|
|
21
|
+
vllmtop/ui/app.py
|
|
22
|
+
vllmtop/ui/layout.py
|
|
23
|
+
vllmtop/ui/panels.py
|
|
24
|
+
vllmtop/ui/theme.py
|
|
25
|
+
vllmtop/ui/widgets.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|