vllmstat 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllmstat/__init__.py +3 -0
- vllmstat/__main__.py +4 -0
- vllmstat/app.py +182 -0
- vllmstat/cli.py +56 -0
- vllmstat/config.py +47 -0
- vllmstat/core/__init__.py +0 -0
- vllmstat/core/histogram.py +38 -0
- vllmstat/core/history.py +25 -0
- vllmstat/core/kv.py +86 -0
- vllmstat/core/metrics.py +181 -0
- vllmstat/core/parse.py +51 -0
- vllmstat/core/rates.py +26 -0
- vllmstat/core/state.py +89 -0
- vllmstat/format.py +51 -0
- vllmstat/model_dims.py +40 -0
- vllmstat/providers/__init__.py +0 -0
- vllmstat/providers/gpu.py +132 -0
- vllmstat/providers/mock.py +127 -0
- vllmstat/providers/vllm.py +59 -0
- vllmstat/render.py +121 -0
- vllmstat/snapshot_json.py +21 -0
- vllmstat/widgets.py +20 -0
- vllmstat-0.1.0.dist-info/METADATA +147 -0
- vllmstat-0.1.0.dist-info/RECORD +27 -0
- vllmstat-0.1.0.dist-info/WHEEL +4 -0
- vllmstat-0.1.0.dist-info/entry_points.txt +2 -0
- vllmstat-0.1.0.dist-info/licenses/LICENSE +202 -0
vllmstat/__init__.py
ADDED
vllmstat/__main__.py
ADDED
vllmstat/app.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
from textual.app import App, ComposeResult
|
|
6
|
+
from textual.containers import Horizontal
|
|
7
|
+
from textual.timer import Timer
|
|
8
|
+
from textual.widgets import Footer
|
|
9
|
+
|
|
10
|
+
from vllmstat import render
|
|
11
|
+
from vllmstat.config import Config
|
|
12
|
+
from vllmstat.core.history import History
|
|
13
|
+
from vllmstat.core.metrics import MetricsEngine
|
|
14
|
+
from vllmstat.core.parse import parse_metrics
|
|
15
|
+
from vllmstat.core.state import Snapshot
|
|
16
|
+
from vllmstat.model_dims import load_model_dims
|
|
17
|
+
from vllmstat.providers.gpu import GpuProvider
|
|
18
|
+
from vllmstat.providers.mock import MockProvider, mock_gpu_snapshot
|
|
19
|
+
from vllmstat.providers.vllm import VllmProvider
|
|
20
|
+
from vllmstat.widgets import Panel
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class VllmStatApp(App):
    """Textual dashboard that samples vLLM metrics on a timer and renders panels.

    In mock mode (``cfg.mock``) samples come from ``MockProvider``; otherwise
    they are fetched through ``VllmProvider``. Each tick derives a ``Snapshot``
    via ``MetricsEngine``, records a few series in ``History`` and refreshes
    every panel.
    """

    CSS = """
    Panel { border: round $primary; padding: 0 1; height: auto; }
    #row1 { height: auto; }
    #row1 Panel { width: 1fr; }
    #gpu { height: auto; }
    """
    BINDINGS = [
        ("q", "quit", "Quit"),
        ("p", "toggle_pause", "Pause"),
        ("g", "toggle_gpu", "GPU"),
        ("plus,equals_sign", "faster", "Faster"),
        ("minus", "slower", "Slower"),
    ]

    def __init__(self, cfg: Config) -> None:
        """Create providers and bookkeeping state from ``cfg``; no I/O happens here."""
        super().__init__()
        self.cfg = cfg
        self.paused = False
        self.snapshot: Snapshot | None = None
        self._history = History()
        self._engine = MetricsEngine()
        self._gpu = GpuProvider(enabled=cfg.gpu)
        self._mock = MockProvider() if cfg.mock else None
        self._vllm = (
            None
            if cfg.mock
            else VllmProvider(base_url=cfg.url, metrics_path=cfg.metrics_path, api_key=cfg.api_key)
        )
        self._model_names: list[str] = []
        self._dims_loaded = cfg.mock  # mock keeps the plain engine; nothing to fetch
        self._start = time.monotonic()
        self._tick_n = 0
        self._timer: Timer | None = None
        self._in_tick = False  # re-entrancy guard: a slow tick must not overlap the next

    def compose(self) -> ComposeResult:
        """Build the panel layout: header, one horizontal row, then stacked panels."""
        self.p_header = Panel(id="hdr")
        self.p_conc = Panel(id="conc")
        self.p_tput = Panel(id="tput")
        self.p_lat = Panel(id="lat")
        self.p_cache = Panel(id="cache")
        self.p_eff = Panel(id="eff")
        self.p_spec = Panel(id="spec")
        self.p_gpu = Panel(id="gpu")
        yield self.p_header
        with Horizontal(id="row1"):
            yield self.p_conc
            yield self.p_tput
            yield self.p_lat
        yield self.p_cache
        yield self.p_eff
        yield self.p_spec
        yield self.p_gpu
        yield Footer()

    def on_mount(self) -> None:
        """Start the periodic timer and schedule one immediate tick."""
        self._timer = self.set_interval(self.cfg.interval, self.tick)
        self.call_later(self.tick)

    async def on_unmount(self) -> None:
        """Close the vLLM provider on shutdown so its connection is not leaked."""
        if self._vllm is not None:
            await self._vllm.aclose()

    async def _ensure_dims(self) -> None:
        """Once, in vLLM mode, fetch model info and rebuild the engine with KV dims."""
        if self._dims_loaded or self._vllm is None:
            return
        self._dims_loaded = True  # set before await so we only attempt once
        info = await self._vllm.fetch_model_info()
        md = load_model_dims(info.root, info.max_model_len)
        self._engine = MetricsEngine(dims=md.dims, max_model_len=md.max_model_len)
        self._model_names = info.model_names

    async def _sample_text(self) -> tuple[str, bool, str | None]:
        """Return ``(metrics_text, fetched_ok, error)`` from the mock or the vLLM provider."""
        if self._mock is not None:
            return self._mock.metrics_text(), True, None
        assert self._vllm is not None
        raw = await self._vllm.fetch_metrics()
        return raw.text, raw.fetched_ok, raw.error

    async def tick(self) -> None:
        """Run one sampling cycle unless paused or a previous cycle is still running."""
        if self.paused or self._in_tick:
            return
        self._in_tick = True
        try:
            await self._tick_body()
        finally:
            self._in_tick = False

    async def _tick_body(self) -> None:
        """Sample, derive a snapshot, attach GPU data, record history and redraw."""
        await self._ensure_dims()
        self._tick_n += 1
        text, ok, err = await self._sample_text()
        now = time.monotonic()
        if ok and text:
            fam = parse_metrics(text)
            snap = self._engine.derive(fam, now=now)
        else:
            # fetch failed: keep showing the last snapshot (or a blank one), flagged disconnected
            snap = self.snapshot or Snapshot(ts=now, connected=False, error=err)
            snap.connected = False
            snap.error = err
        if self._mock is not None and self._gpu.enabled:
            snap.gpu = mock_gpu_snapshot(self._tick_n)
        else:
            snap.gpu = self._gpu.sample()
        self._push_history(snap)
        self.snapshot = snap
        self._refresh(snap)

    def _push_history(self, s: Snapshot) -> None:
        """Append the snapshot's plotted values to their history series."""
        self._history.push("running", s.running)
        self._history.push("waiting", s.waiting)
        self._history.push("gen_tps", s.gen_tps)
        self._history.push("prompt_tps", s.prompt_tps)
        if s.prefix_hit_window is not None:
            self._history.push("prefix_hit", s.prefix_hit_window)

    def _uptime(self) -> str:
        """Return time since app construction formatted as ``<h>h<mm>m``."""
        secs = int(time.monotonic() - self._start)
        h, rem = divmod(secs, 3600)
        m, _ = divmod(rem, 60)
        return f"{h}h{m:02d}m"

    def _refresh(self, s: Snapshot) -> None:
        """Re-render every panel from ``s``; hide panels whose renderer returns nothing."""
        self.p_header.update(
            render.header(s, url=self.cfg.url, interval=self.cfg.interval, uptime=self._uptime())
        )
        self.p_conc.update(render.concurrency(s, self._history))
        self.p_tput.update(render.throughput(s, self._history))
        self.p_lat.update(render.latency(s))
        self.p_cache.update(render.cache_kv(s, self._history))
        eff = render.efficiency(s)
        self.p_eff.display = bool(eff)
        self.p_eff.update(eff)
        spec = render.specdecode(s)
        self.p_spec.display = bool(spec)
        self.p_spec.update(spec)
        self.p_gpu.update(render.gpu(s))

    def action_toggle_pause(self) -> None:
        """Toggle sampling; while paused, ticks return immediately."""
        self.paused = not self.paused

    def action_toggle_gpu(self) -> None:
        """Toggle the GPU provider's ``enabled`` flag."""
        self._gpu.enabled = not self._gpu.enabled

    def action_faster(self) -> None:
        """Halve the refresh interval (floor 0.1 s) and restart the timer."""
        self.cfg.interval = max(0.1, self.cfg.interval / 2)
        self._reschedule()

    def action_slower(self) -> None:
        """Double the refresh interval (ceiling 10 s) and restart the timer."""
        self.cfg.interval = min(10.0, self.cfg.interval * 2)
        self._reschedule()

    def _reschedule(self) -> None:
        """Replace the interval timer with one running at ``cfg.interval``."""
        # restart the interval timer at the new cadence (Textual >=8: Timer.stop + recreate)
        if self._timer is not None:
            self._timer.stop()
        self._timer = self.set_interval(self.cfg.interval, self.tick)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def run_app(cfg: Config) -> int:
    """Run the Textual UI until it exits and report exit status 0."""
    app = VllmStatApp(cfg)
    app.run()
    return 0
|
vllmstat/cli.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from vllmstat.config import Config
|
|
9
|
+
from vllmstat.core.metrics import MetricsEngine
|
|
10
|
+
from vllmstat.core.parse import parse_metrics
|
|
11
|
+
from vllmstat.providers.mock import MockProvider
|
|
12
|
+
from vllmstat.snapshot_json import snapshot_to_dict
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def run_once_json(argv: list[str]) -> int:
    """Take two metric samples, derive one snapshot and print it as JSON on stdout.

    Two samples are needed so the engine has a baseline for rate fields. In
    mock mode both samples come from ``MockProvider`` one synthetic second
    apart. Otherwise they are fetched from the server ``min(interval, 1.0)``
    seconds apart and the measured gap is used as the rate window.

    Returns 0 on success, or 1 when the second fetch failed (the error is
    printed as JSON on stderr).
    """
    cfg = Config.from_sources(argv, dict(os.environ))
    if cfg.mock:
        eng = MetricsEngine(dims=None, max_model_len=None)
        mp = MockProvider()
        eng.derive(parse_metrics(mp.metrics_text()), now=0.0)
        snap = eng.derive(parse_metrics(mp.metrics_text()), now=1.0)
    else:
        import asyncio

        from vllmstat.model_dims import load_model_dims
        from vllmstat.providers.vllm import VllmProvider

        async def _go():
            p = VllmProvider(base_url=cfg.url, metrics_path=cfg.metrics_path, api_key=cfg.api_key)
            try:
                info = await p.fetch_model_info()
                r0 = await p.fetch_metrics()
                t0 = time.monotonic()
                # yield to the loop instead of blocking it with time.sleep
                await asyncio.sleep(min(cfg.interval, 1.0))
                r1 = await p.fetch_metrics()
                t1 = time.monotonic()
            finally:
                # always release the provider, even when a fetch raises
                await p.aclose()
            return info, r0, r1, t1 - t0

        info, r0, r1, elapsed = asyncio.run(_go())
        if not r1.fetched_ok:
            print(json.dumps({"error": r1.error}), file=sys.stderr)
            return 1
        md = load_model_dims(info.root, info.max_model_len)
        eng = MetricsEngine(dims=md.dims, max_model_len=md.max_model_len)
        # only use the first sample as a baseline if it was actually fetched
        if r0.fetched_ok and r0.text:
            eng.derive(parse_metrics(r0.text), now=0.0)
        # use the real gap between samples; fall back to 1.0 if the clock did not advance
        snap = eng.derive(parse_metrics(r1.text), now=elapsed if elapsed > 0 else 1.0)
    print(json.dumps(snapshot_to_dict(snap), default=str))
    return 0
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: print one JSON snapshot for ``--once --json``, else run the TUI.

    ``argv`` defaults to ``sys.argv[1:]``. The return value is the process
    exit status produced by whichever mode ran.
    """
    if argv is None:
        argv = list(sys.argv[1:])
    else:
        argv = list(argv)
    cfg = Config.from_sources(argv, dict(os.environ))
    json_snapshot_requested = cfg.once and cfg.json
    if not json_snapshot_requested:
        from vllmstat.app import run_app  # imported lazily so --once/--json needs no Textual

        return run_app(cfg)
    return run_once_json(argv)
|
vllmstat/config.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
from vllmstat import __version__
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class Config:
    """Runtime settings for vllmstat, normally built by :meth:`from_sources`."""

    url: str = "http://localhost:8000"
    metrics_path: str = "/metrics"
    interval: float = 1.0  # seconds between samples
    api_key: str | None = None
    gpu: bool = True
    mock: bool = False
    once: bool = False
    json: bool = False

    @staticmethod
    def _interval_arg(raw: str) -> float:
        """Parse ``--interval``; reject values that are not finite and positive.

        Raises:
            argparse.ArgumentTypeError: if ``raw`` is not a number, or is
                zero, negative, NaN or infinite.
        """
        try:
            value = float(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid interval: {raw!r}") from None
        # NaN fails every comparison, so this single test also rejects it
        if not (0 < value < float("inf")):
            raise argparse.ArgumentTypeError(
                f"interval must be a positive number of seconds, got {raw!r}"
            )
        return value

    @staticmethod
    def build_parser() -> argparse.ArgumentParser:
        """Return the command-line parser; its defaults mirror the dataclass defaults."""
        p = argparse.ArgumentParser(prog="vllmstat", description="nvtop for vLLM")
        p.add_argument("-u", "--url", default="http://localhost:8000")
        p.add_argument("--metrics-path", default="/metrics")
        p.add_argument("-i", "--interval", type=Config._interval_arg, default=1.0)
        p.add_argument("--api-key", default=None)
        p.add_argument("--no-gpu", dest="gpu", action="store_false", default=True)
        p.add_argument("--mock", action="store_true", default=False)
        p.add_argument("--once", action="store_true", default=False)
        p.add_argument("--json", action="store_true", default=False)
        p.add_argument("--version", action="version", version=f"vllmstat {__version__}")
        return p

    @classmethod
    def from_sources(cls, argv: list[str], env: dict[str, str]) -> Config:
        """Build a config from CLI arguments, taking the API key from ``env`` if needed.

        ``--api-key`` wins when given and non-empty; otherwise ``VLLM_API_KEY``
        from ``env`` is used (``None`` if absent).
        """
        ns = cls.build_parser().parse_args(argv)
        api_key = ns.api_key or env.get("VLLM_API_KEY")
        return cls(
            url=ns.url,
            metrics_path=ns.metrics_path,
            interval=ns.interval,
            api_key=api_key,
            gpu=ns.gpu,
            mock=ns.mock,
            once=ns.once,
            json=ns.json,
        )
|
|
File without changes
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def histogram_quantile(buckets: list[tuple[float, float]], q: float) -> float | None:
    """Estimate quantile ``q`` from cumulative buckets ``[(le, cum_count), ...]`` sorted asc.

    The first bucket whose cumulative count reaches ``q * total`` is located
    and the result is linearly interpolated between that bucket's lower and
    upper bound (the lower bound of the first bucket is taken as 0.0).
    Returns ``None`` when there are no buckets or no observations.
    """
    if not buckets:
        return None
    total = buckets[-1][1]
    if total <= 0:
        return None

    rank = q * total
    hit = next((i for i, (_, cum) in enumerate(buckets) if cum >= rank), None)
    if hit is None:
        # rank exceeds every cumulative count: report the last upper bound
        return buckets[-1][0]

    upper, cum = buckets[hit]
    lower, below = buckets[hit - 1] if hit else (0.0, 0.0)
    if upper == float("inf"):
        # cannot interpolate into the open-ended bucket; use its lower edge
        return lower
    in_bucket = cum - below
    if in_bucket <= 0:
        return upper
    return lower + (rank - below) / in_bucket * (upper - lower)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def windowed_buckets(
    prev: list[tuple[float, float]], cur: list[tuple[float, float]]
) -> list[tuple[float, float]]:
    """Return per-``le`` increases from ``prev`` to ``cur``.

    A bucket missing from ``prev`` counts from 0.0. If any bucket decreased
    (counter reset), ``cur`` itself is returned unchanged.
    """
    baseline = dict(prev)
    deltas = [(le, count - baseline.get(le, 0.0)) for le, count in cur]
    if any(change < 0 for _, change in deltas):
        return cur
    return deltas
|
vllmstat/core/history.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import deque
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Series:
    """Bounded buffer of recent values; the oldest is dropped once ``maxlen`` is reached."""

    def __init__(self, maxlen: int = 120) -> None:
        """Start empty, retaining at most ``maxlen`` values."""
        self.values: deque[float] = deque([], maxlen)

    def push(self, v: float) -> None:
        """Append ``v`` as the newest value."""
        self.values.append(v)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class History:
    """Named collection of ``Series``, each created on first use with a shared ``maxlen``."""

    def __init__(self, maxlen: int = 120) -> None:
        """Remember the ``maxlen`` to apply to every series created later."""
        self._maxlen = maxlen
        self._series: dict[str, Series] = {}

    def series(self, name: str) -> Series:
        """Return the series called ``name``, creating it if it does not exist yet."""
        found = self._series.get(name)
        if found is None:
            found = self._series[name] = Series(self._maxlen)
        return found

    def push(self, name: str, value: float) -> None:
        """Append ``value`` to the series called ``name``."""
        self.series(name).push(value)
|
vllmstat/core/kv.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
# dtype names that parse_kv_bits maps to 16-bit keys and values
_FP16 = {"auto", "fp16", "float16", "bf16", "bfloat16", "half"}


def parse_kv_bits(dtype: str | None) -> tuple[int, int] | None:
    """Map a KV-cache dtype string to ``(key_bits, value_bits)``.

    Matching is case-insensitive. Names in ``_FP16`` give ``(16, 16)``; any
    name containing ``fp8``, ``int8``, ``e4m3`` or ``e5m2`` gives ``(8, 8)``;
    a ``k<N>v<M>`` pattern gives ``(N, M)``. Returns ``None`` for empty or
    unrecognised input.
    """
    if not dtype:
        return None
    lowered = dtype.lower()
    if lowered in _FP16:
        return (16, 16)
    if any(tag in lowered for tag in ("fp8", "int8", "e4m3", "e5m2")):
        return (8, 8)
    found = re.search(r"k(\d+)v(\d+)", lowered)
    if found is None:
        return None
    key_bits, value_bits = found.groups()
    return (int(key_bits), int(value_bits))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def nominal_ratio(dtype: str | None) -> float | None:
    """Return 32 divided by the dtype's key+value bit total, or ``None`` if unparseable.

    For example a dtype parsed as ``(8, 8)`` yields 2.0.
    """
    bits = parse_kv_bits(dtype)
    if bits:
        key_bits, value_bits = bits
        return (16 + 16) / (key_bits + value_bits)
    return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def fp16_bytes_per_token(layers: int, kv_heads: int, head_dim: int) -> int:
    """Return ``2 * layers * kv_heads * head_dim * 2``.

    NOTE(review): presumably one factor of 2 covers the key and value tensors
    and the other is the 2-byte fp16 element size; confirm.
    """
    leading_factor = 2
    trailing_factor = 2
    return leading_factor * layers * kv_heads * head_dim * trailing_factor
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class KvInfo:
    """KV-cache figures produced by ``compute_kv``; unknown values are ``None``."""

    # cache dtype string, passed through unchanged
    dtype: str | None
    # num_gpu_blocks * block_size
    capacity_tokens: int | None
    # capacity scaled by the reported usage fraction, rounded
    used_tokens: int | None
    # compression ratio; see ratio_kind for how it was obtained
    ratio: float | None
    ratio_kind: str  # "achieved" | "nominal" | "none"
    # capacity divided by ratio, rounded
    fp16_equiv_tokens: int | None
    # max_model_len * fp16 bytes per token / 1e9
    fp16_full_ctx_gb: float | None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def compute_kv(
    *,
    cache_dtype: str | None,
    num_gpu_blocks: int | None,
    block_size: int | None,
    kv_usage: float,
    kv_cache_memory_bytes: int | None,
    dims: dict[str, int] | None,
    max_model_len: int | None,
) -> KvInfo:
    """Derive KV-cache capacity, usage and compression figures.

    Capacity is ``num_gpu_blocks * block_size`` when both are known. The ratio
    is "achieved" (capacity times fp16 bytes per token, divided by the cache's
    byte size) when capacity, model dims and byte size are all available.
    Otherwise it falls back to the "nominal" ratio implied by ``cache_dtype``,
    or "none" if that is unknown too.
    """
    if num_gpu_blocks and block_size:
        capacity = num_gpu_blocks * block_size
        used = round(capacity * kv_usage)
    else:
        capacity = None
        used = None

    # fp16 bytes per token needs all three model dimensions
    if dims and {"layers", "kv_heads", "head_dim"} <= dims.keys():
        bpt = fp16_bytes_per_token(dims["layers"], dims["kv_heads"], dims["head_dim"])
    else:
        bpt = None

    ratio: float | None
    if capacity and bpt and kv_cache_memory_bytes:
        ratio = (capacity * bpt) / kv_cache_memory_bytes
        kind = "achieved"
    else:
        ratio = nominal_ratio(cache_dtype)
        kind = "none" if ratio is None else "nominal"

    fp16_equiv = None
    if capacity and ratio:
        fp16_equiv = round(capacity / ratio)

    fp16_full_ctx_gb = None
    if max_model_len and bpt:
        fp16_full_ctx_gb = max_model_len * bpt / 1e9

    return KvInfo(
        dtype=cache_dtype,
        capacity_tokens=capacity,
        used_tokens=used,
        ratio=ratio,
        ratio_kind=kind,
        fp16_equiv_tokens=fp16_equiv,
        fp16_full_ctx_gb=fp16_full_ctx_gb,
    )
|
vllmstat/core/metrics.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from vllmstat.core.histogram import histogram_quantile, windowed_buckets
|
|
4
|
+
from vllmstat.core.kv import compute_kv
|
|
5
|
+
from vllmstat.core.parse import (
|
|
6
|
+
Families,
|
|
7
|
+
first_value,
|
|
8
|
+
get_buckets,
|
|
9
|
+
info_labels,
|
|
10
|
+
sum_value,
|
|
11
|
+
)
|
|
12
|
+
from vllmstat.core.rates import Rate
|
|
13
|
+
from vllmstat.core.state import Quantiles, Snapshot
|
|
14
|
+
|
|
15
|
+
# short latency key -> base name of the vLLM histogram it is read from
_LAT = dict(
    ttft="vllm:time_to_first_token_seconds",
    tpot="vllm:request_time_per_output_token_seconds",
    e2e="vllm:e2e_request_latency_seconds",
    queue="vllm:request_queue_time_seconds",
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _int(s: str | None) -> int | None:
    """Convert a label value to ``int``; ``None``, ``"None"``, ``""`` and bad input give ``None``."""
    if s is None or s == "None" or s == "":
        return None
    try:
        return int(s)
    except (TypeError, ValueError):
        return None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class MetricsEngine:
    """Turns successive parsed metric scrapes into ``Snapshot`` objects.

    The engine is stateful: it keeps one ``Rate`` tracker per counter and the
    previous scrape, so windowed values (rates, window hit ratio, windowed
    latency quantiles) are computed relative to the preceding ``derive`` call.
    """

    def __init__(
        self,
        *,
        alpha: float = 0.3,
        dims: dict[str, int] | None = None,
        max_model_len: int | None = None,
    ) -> None:
        """Create the rate trackers.

        Args:
            alpha: passed to every ``Rate`` tracker.
            dims: optional model dimensions forwarded to ``compute_kv``.
            max_model_len: optional context length forwarded to ``compute_kv``.
        """
        self.dims = dims
        self.max_model_len = max_model_len
        self._gen = Rate(alpha)
        self._prompt = Rate(alpha)
        self._req = Rate(alpha)
        self._preempt = Rate(alpha)
        self._flops = Rate(alpha)
        self._rbytes = Rate(alpha)
        self._wbytes = Rate(alpha)
        self._prev: Families | None = None

    def _quantiles(self, fam: Families, base: str) -> Quantiles:
        """Return p50/p90/p99 for histogram ``base``.

        Uses the change since the previous scrape when that window contains
        observations; otherwise falls back to the lifetime buckets.
        """
        cur = get_buckets(fam, base)
        if not cur:
            return Quantiles()
        buckets = cur
        if self._prev is not None:
            prev = get_buckets(self._prev, base)
            if prev:
                delta = windowed_buckets(prev, cur)
                # an empty window (no new observations) would yield no quantiles
                if delta and delta[-1][1] > 0:
                    buckets = delta
        return Quantiles(
            p50=histogram_quantile(buckets, 0.50),
            p90=histogram_quantile(buckets, 0.90),
            p99=histogram_quantile(buckets, 0.99),
        )

    def derive(self, fam: Families, now: float) -> Snapshot:
        """Build a ``Snapshot`` from scrape ``fam`` taken at time ``now``.

        ``now`` is passed to the rate trackers as the sample time. The scrape
        is remembered as the baseline for the next call.
        """
        labels = info_labels(fam, "vllm:cache_config_info")
        running_rows = fam.get("vllm:num_requests_running", [])
        model_names = sorted({lbl.get("model_name", "") for lbl, _ in running_rows} - {""})
        engines = {lbl.get("engine") for lbl, _ in running_rows}

        # throughput rates
        gen = self._gen.update(sum_value(fam, "vllm:generation_tokens_total") or 0.0, now)
        prompt = self._prompt.update(sum_value(fam, "vllm:prompt_tokens_total") or 0.0, now)
        req = self._req.update(sum_value(fam, "vllm:request_success_total") or 0.0, now)
        preempt = self._preempt.update(sum_value(fam, "vllm:num_preemptions_total") or 0.0, now)

        # tokens/iter mean
        it_sum = sum_value(fam, "vllm:iteration_tokens_total_sum")
        it_cnt = sum_value(fam, "vllm:iteration_tokens_total_count")
        tokens_per_iter = (it_sum / it_cnt) if (it_sum and it_cnt) else None

        # cache reuse
        q = sum_value(fam, "vllm:prefix_cache_queries_total") or 0.0
        h = sum_value(fam, "vllm:prefix_cache_hits_total") or 0.0
        hit_life = (h / q) if q > 0 else None
        hit_win = None
        if self._prev is not None:
            pq = sum_value(self._prev, "vllm:prefix_cache_queries_total") or 0.0
            ph = sum_value(self._prev, "vllm:prefix_cache_hits_total") or 0.0
            dq, dh = q - pq, h - ph
            if dq > 0:
                hit_win = max(0.0, min(1.0, dh / dq))

        # token sources: accumulate per source so rows that differ only in other
        # labels (e.g. several engines or models) are summed, not overwritten
        src: dict[str | None, float] = {}
        for lbl, v in fam.get("vllm:prompt_tokens_by_source_total", []):
            source = lbl.get("source")
            src[source] = src.get(source, 0.0) + v
        src_total = sum(src.values()) or 0.0

        def frac(k: str) -> float | None:
            """Share of prompt tokens from source ``k``, or None without data."""
            return (src.get(k, 0.0) / src_total) if src_total > 0 else None

        ext_q = sum_value(fam, "vllm:external_prefix_cache_queries_total") or 0.0
        external_active = ext_q > 0 or (src.get("external_kv_transfer", 0.0) > 0)

        # kv memory
        kv_usage = first_value(fam, "vllm:kv_cache_usage_perc") or 0.0
        kv = compute_kv(
            cache_dtype=labels.get("cache_dtype"),
            num_gpu_blocks=_int(labels.get("num_gpu_blocks")),
            block_size=_int(labels.get("block_size")),
            kv_usage=kv_usage,
            kv_cache_memory_bytes=_int(labels.get("kv_cache_memory_bytes")),
            dims=self.dims,
            max_model_len=self.max_model_len,
        )

        # spec decode
        drafts = sum_value(fam, "vllm:spec_decode_num_drafts_total")
        draft_tokens = sum_value(fam, "vllm:spec_decode_num_draft_tokens_total")
        accepted = sum_value(fam, "vllm:spec_decode_num_accepted_tokens_total")
        spec_active = bool(drafts and draft_tokens)
        spec_acceptance = (
            (accepted / draft_tokens)
            if (spec_active and accepted is not None and draft_tokens is not None)
            else None
        )
        spec_per_draft = (
            (accepted / drafts) if (spec_active and drafts and accepted is not None) else None
        )

        # efficiency (conditional)
        flops = self._flops.update(sum_value(fam, "vllm:estimated_flops_per_gpu_total") or 0.0, now)
        rbytes = self._rbytes.update(
            sum_value(fam, "vllm:estimated_read_bytes_per_gpu_total") or 0.0, now
        )
        wbytes = self._wbytes.update(
            sum_value(fam, "vllm:estimated_write_bytes_per_gpu_total") or 0.0, now
        )
        eff_active = (flops > 0) or (rbytes + wbytes > 0)

        snap = Snapshot(
            ts=now,
            connected=True,
            model_names=model_names or ([mn] if (mn := labels.get("model_name")) else []),
            engine_count=len([e for e in engines if e is not None]) or 1,
            running=sum_value(fam, "vllm:num_requests_running") or 0.0,
            waiting=sum_value(fam, "vllm:num_requests_waiting") or 0.0,
            preempt_rate=preempt,
            gen_tps=gen,
            prompt_tps=prompt,
            req_rate=req,
            tokens_per_iter=tokens_per_iter,
            prefix_hit_window=hit_win,
            prefix_hit_lifetime=hit_life,
            src_compute=frac("local_compute"),
            src_cache_hit=frac("local_cache_hit"),
            src_external=frac("external_kv_transfer"),
            cached_tokens_total=sum_value(fam, "vllm:prompt_tokens_cached_total") or 0.0,
            recomputed_tokens_total=sum_value(fam, "vllm:prompt_tokens_recomputed_total") or 0.0,
            external_kv_active=external_active,
            kv_usage=kv_usage,
            kv_capacity_tokens=kv.capacity_tokens,
            kv_used_tokens=kv.used_tokens,
            kv_dtype=kv.dtype,
            kv_ratio=kv.ratio,
            kv_ratio_kind=kv.ratio_kind,
            kv_fp16_equiv_tokens=kv.fp16_equiv_tokens,
            kv_fp16_full_ctx_gb=kv.fp16_full_ctx_gb,
            ttft=self._quantiles(fam, _LAT["ttft"]),
            tpot=self._quantiles(fam, _LAT["tpot"]),
            e2e=self._quantiles(fam, _LAT["e2e"]),
            queue=self._quantiles(fam, _LAT["queue"]),
            spec_active=spec_active,
            spec_acceptance=spec_acceptance,
            spec_accepted_per_draft=spec_per_draft,
            eff_active=eff_active,
            gflops=(flops / 1e9) if eff_active else None,
            gbps=((rbytes + wbytes) / 1e9) if eff_active else None,
        )
        # remember this scrape so the next call can compute windowed values
        self._prev = fam
        return snap
|
vllmstat/core/parse.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from prometheus_client.parser import text_string_to_metric_families
|
|
4
|
+
|
|
5
|
+
# Parsed scrape: sample name -> list of (labels, value) rows, as built by parse_metrics.
Families = dict[str, list[tuple[dict[str, str], float]]]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def parse_metrics(text: str) -> Families:
    """Parse Prometheus exposition text into ``{sample_name: [(labels, value), ...]}``.

    Samples are keyed by their own name, not by the family name, and keep
    the order in which the parser yields them.
    """
    parsed: Families = {}
    samples = (
        sample
        for family in text_string_to_metric_families(text)
        for sample in family.samples
    )
    for sample in samples:
        rows = parsed.get(sample.name)
        if rows is None:
            rows = parsed[sample.name] = []
        rows.append((dict(sample.labels), sample.value))
    return parsed
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def sum_value(families: Families, name: str) -> float | None:
    """Sum the values of every row of sample ``name``; ``None`` if it has no rows."""
    rows = families.get(name)
    if rows:
        total = 0
        for _labels, value in rows:
            total += value
        return total
    return None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def first_value(families: Families, name: str) -> float | None:
    """Return the value of the first row of sample ``name``; ``None`` if it has no rows."""
    rows = families.get(name)
    return rows[0][1] if rows else None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def info_labels(families: Families, name: str) -> dict[str, str]:
    """Return the label dict of the first row of sample ``name``; ``{}`` if it has no rows."""
    rows = families.get(name)
    if not rows:
        return {}
    first_labels, _value = rows[0]
    return first_labels
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_buckets(families: Families, base: str) -> list[tuple[float, float]]:
    """Aggregate ``<base>_bucket`` samples across labels, summing counts per ``le``.

    Returns ``(le, count)`` pairs sorted by ascending ``le``; an empty list
    if the histogram is absent.
    """
    per_bound: dict[float, float] = {}
    for labels, value in families.get(f"{base}_bucket", []):
        bound = float(labels["le"])
        per_bound[bound] = per_bound.get(bound, 0.0) + value
    pairs = list(per_bound.items())
    pairs.sort()
    return pairs
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def hist_count(families: Families, base: str) -> float | None:
    """Return the summed ``<base>_count`` samples, or ``None`` if there are none."""
    sample_name = f"{base}_count"
    return sum_value(families, sample_name)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def hist_sum(families: Families, base: str) -> float | None:
    """Return the summed ``<base>_sum`` samples, or ``None`` if there are none."""
    sample_name = f"{base}_sum"
    return sum_value(families, sample_name)
|