vllmstat 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vllmstat/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """vllmstat — an interactive terminal dashboard for vLLM serving performance."""
2
+
3
# Package version string; the CLI's --version flag prints it as "vllmstat <version>".
__version__ = "0.1.0"
vllmstat/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from vllmstat.cli import main
2
+
3
# Executed for `python -m vllmstat`: run the CLI and use main()'s int result as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
vllmstat/app.py ADDED
@@ -0,0 +1,182 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+
5
+ from textual.app import App, ComposeResult
6
+ from textual.containers import Horizontal
7
+ from textual.timer import Timer
8
+ from textual.widgets import Footer
9
+
10
+ from vllmstat import render
11
+ from vllmstat.config import Config
12
+ from vllmstat.core.history import History
13
+ from vllmstat.core.metrics import MetricsEngine
14
+ from vllmstat.core.parse import parse_metrics
15
+ from vllmstat.core.state import Snapshot
16
+ from vllmstat.model_dims import load_model_dims
17
+ from vllmstat.providers.gpu import GpuProvider
18
+ from vllmstat.providers.mock import MockProvider, mock_gpu_snapshot
19
+ from vllmstat.providers.vllm import VllmProvider
20
+ from vllmstat.widgets import Panel
21
+
22
+
23
class VllmStatApp(App):
    """Textual app that samples metrics on a timer and redraws a fixed set of panels.

    Metrics come from a ``MockProvider`` when ``cfg.mock`` is set, otherwise from a
    ``VllmProvider`` built from ``cfg.url`` / ``cfg.metrics_path`` / ``cfg.api_key``.
    Each tick parses the metrics text, derives a ``Snapshot`` through ``MetricsEngine``,
    attaches a GPU sample, records a few values in ``History`` and updates the panels.
    """

    # Stylesheet for the panels created in compose() (ids "row1" and "gpu" are set there).
    CSS = """
    Panel { border: round $primary; padding: 0 1; height: auto; }
    #row1 { height: auto; }
    #row1 Panel { width: 1fr; }
    #gpu { height: auto; }
    """
    # (key, action, label) triples. Every action except "quit" is implemented by an
    # ``action_<name>`` method further down in this class.
    BINDINGS = [
        ("q", "quit", "Quit"),
        ("p", "toggle_pause", "Pause"),
        ("g", "toggle_gpu", "GPU"),
        ("plus,equals_sign", "faster", "Faster"),
        ("minus", "slower", "Slower"),
    ]

    def __init__(self, cfg: Config) -> None:
        """Store *cfg* and build the providers, engine and bookkeeping state."""
        super().__init__()
        self.cfg = cfg
        self.paused = False  # when True, tick() returns without sampling
        self.snapshot: Snapshot | None = None  # last snapshot handed to _refresh()
        self._history = History()
        self._engine = MetricsEngine()  # replaced by _ensure_dims() in vLLM mode
        self._gpu = GpuProvider(enabled=cfg.gpu)
        # Exactly one of _mock / _vllm is set, depending on cfg.mock.
        self._mock = MockProvider() if cfg.mock else None
        self._vllm = (
            None
            if cfg.mock
            else VllmProvider(base_url=cfg.url, metrics_path=cfg.metrics_path, api_key=cfg.api_key)
        )
        # NOTE(review): assigned by _ensure_dims() but not read anywhere in this class.
        self._model_names: list[str] = []
        self._dims_loaded = cfg.mock  # mock keeps the plain engine; nothing to fetch
        self._start = time.monotonic()  # reference point for _uptime()
        self._tick_n = 0  # number of tick bodies started; passed to mock_gpu_snapshot()
        self._timer: Timer | None = None  # interval timer created in on_mount()/_reschedule()
        self._in_tick = False  # re-entrancy guard used by tick()

    def compose(self) -> ComposeResult:
        """Create the panels and yield them in display order, followed by the footer."""
        self.p_header = Panel(id="hdr")
        self.p_conc = Panel(id="conc")
        self.p_tput = Panel(id="tput")
        self.p_lat = Panel(id="lat")
        self.p_cache = Panel(id="cache")
        self.p_eff = Panel(id="eff")
        self.p_spec = Panel(id="spec")
        self.p_gpu = Panel(id="gpu")
        yield self.p_header
        # concurrency / throughput / latency share one horizontal row
        with Horizontal(id="row1"):
            yield self.p_conc
            yield self.p_tput
            yield self.p_lat
        yield self.p_cache
        yield self.p_eff
        yield self.p_spec
        yield self.p_gpu
        yield Footer()

    def on_mount(self) -> None:
        """Start the periodic tick and also schedule one tick via call_later()."""
        self._timer = self.set_interval(self.cfg.interval, self.tick)
        # presumably so the first sample does not wait for a full interval — confirm
        self.call_later(self.tick)

    async def _ensure_dims(self) -> None:
        """Once, in vLLM mode, fetch model info and rebuild the engine with KV dims."""
        if self._dims_loaded or self._vllm is None:
            return
        self._dims_loaded = True  # set before await so we only attempt once
        info = await self._vllm.fetch_model_info()
        md = load_model_dims(info.root, info.max_model_len)
        # Replaces the engine created in __init__; any state held by the old one is dropped.
        self._engine = MetricsEngine(dims=md.dims, max_model_len=md.max_model_len)
        self._model_names = info.model_names

    async def _sample_text(self) -> tuple[str, bool, str | None]:
        """Return ``(metrics_text, fetched_ok, error)`` from the active provider.

        The mock provider always reports success with no error.
        """
        if self._mock is not None:
            return self._mock.metrics_text(), True, None
        assert self._vllm is not None
        raw = await self._vllm.fetch_metrics()
        return raw.text, raw.fetched_ok, raw.error

    async def tick(self) -> None:
        """Run one sampling cycle unless paused or a previous cycle is still in progress."""
        if self.paused or self._in_tick:
            return
        self._in_tick = True
        try:
            await self._tick_body()
        finally:
            # always release the guard, even if the body raised
            self._in_tick = False

    async def _tick_body(self) -> None:
        """Sample metrics, build the snapshot, attach GPU data, then record and render it."""
        await self._ensure_dims()
        self._tick_n += 1
        text, ok, err = await self._sample_text()
        now = time.monotonic()
        if ok and text:
            fam = parse_metrics(text)
            snap = self._engine.derive(fam, now=now)
        else:
            # Fetch failed or returned no text: reuse the previous snapshot object (mutated
            # in place) or, if there is none yet, start from a disconnected placeholder.
            snap = self.snapshot or Snapshot(ts=now, connected=False, error=err)
            snap.connected = False
            snap.error = err
        if self._mock is not None and self._gpu.enabled:
            snap.gpu = mock_gpu_snapshot(self._tick_n)
        else:
            # NOTE(review): also reached when the GPU provider is disabled; what sample()
            # returns in that case is not visible here — confirm.
            snap.gpu = self._gpu.sample()
        self._push_history(snap)
        self.snapshot = snap
        self._refresh(snap)

    def _push_history(self, s: Snapshot) -> None:
        """Append the snapshot's headline values to their named history series."""
        self._history.push("running", s.running)
        self._history.push("waiting", s.waiting)
        self._history.push("gen_tps", s.gen_tps)
        self._history.push("prompt_tps", s.prompt_tps)
        # the windowed hit rate may be None; skip it then rather than recording None
        if s.prefix_hit_window is not None:
            self._history.push("prefix_hit", s.prefix_hit_window)

    def _uptime(self) -> str:
        """Return the time since construction formatted as ``<h>h<mm>m`` (seconds dropped)."""
        secs = int(time.monotonic() - self._start)
        h, rem = divmod(secs, 3600)
        m, _ = divmod(rem, 60)
        return f"{h}h{m:02d}m"

    def _refresh(self, s: Snapshot) -> None:
        """Re-render every panel from snapshot *s*."""
        self.p_header.update(
            render.header(s, url=self.cfg.url, interval=self.cfg.interval, uptime=self._uptime())
        )
        self.p_conc.update(render.concurrency(s, self._history))
        self.p_tput.update(render.throughput(s, self._history))
        self.p_lat.update(render.latency(s))
        self.p_cache.update(render.cache_kv(s, self._history))
        # The efficiency and spec-decode panels are hidden when their renderer returns
        # something falsy.
        eff = render.efficiency(s)
        self.p_eff.display = bool(eff)
        self.p_eff.update(eff)
        spec = render.specdecode(s)
        self.p_spec.display = bool(spec)
        self.p_spec.update(spec)
        self.p_gpu.update(render.gpu(s))

    def action_toggle_pause(self) -> None:
        """Flip the paused flag checked by tick()."""
        self.paused = not self.paused

    def action_toggle_gpu(self) -> None:
        """Flip the GPU provider's ``enabled`` flag."""
        self._gpu.enabled = not self._gpu.enabled

    def action_faster(self) -> None:
        """Halve the refresh interval (never below 0.1 s) and restart the timer."""
        self.cfg.interval = max(0.1, self.cfg.interval / 2)
        self._reschedule()

    def action_slower(self) -> None:
        """Double the refresh interval (never above 10.0 s) and restart the timer."""
        self.cfg.interval = min(10.0, self.cfg.interval * 2)
        self._reschedule()

    def _reschedule(self) -> None:
        """Stop the current interval timer, if any, and start one at ``cfg.interval``."""
        # restart the interval timer at the new cadence (Textual >=8: Timer.stop + recreate)
        if self._timer is not None:
            self._timer.stop()
        self._timer = self.set_interval(self.cfg.interval, self.tick)
178
+
179
+
180
def run_app(cfg: Config) -> int:
    """Build the dashboard app from *cfg*, run it to completion and return exit code 0."""
    app = VllmStatApp(cfg)
    app.run()
    return 0
vllmstat/cli.py ADDED
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ import time
7
+
8
+ from vllmstat.config import Config
9
+ from vllmstat.core.metrics import MetricsEngine
10
+ from vllmstat.core.parse import parse_metrics
11
+ from vllmstat.providers.mock import MockProvider
12
+ from vllmstat.snapshot_json import snapshot_to_dict
13
+
14
+
15
def run_once_json(argv: list[str]) -> int:
    """Take two metric samples, derive one snapshot and print it as JSON on stdout.

    In mock mode both samples come from ``MockProvider`` with synthetic timestamps
    0.0 and 1.0. Otherwise the vLLM endpoint is scraped twice, separated by
    ``min(cfg.interval, 1.0)`` seconds, and the measured time between the two
    scrapes is used as the second timestamp so rate values match the real window.

    Args:
        argv: Command-line arguments (without the program name).

    Returns:
        0 on success; 1 when the second scrape failed, in which case a JSON
        object with the error is written to stderr.
    """
    cfg = Config.from_sources(argv, dict(os.environ))
    if cfg.mock:
        eng = MetricsEngine(dims=None, max_model_len=None)
        mp = MockProvider()
        eng.derive(parse_metrics(mp.metrics_text()), now=0.0)
        snap = eng.derive(parse_metrics(mp.metrics_text()), now=1.0)
    else:
        import asyncio

        from vllmstat.model_dims import load_model_dims
        from vllmstat.providers.vllm import VllmProvider

        async def _go():
            p = VllmProvider(base_url=cfg.url, metrics_path=cfg.metrics_path, api_key=cfg.api_key)
            try:
                info = await p.fetch_model_info()
                r0 = await p.fetch_metrics()
                t0 = time.monotonic()
                # asyncio.sleep yields to the event loop instead of blocking it;
                # clamp at 0 so a negative interval cannot raise.
                await asyncio.sleep(max(0.0, min(cfg.interval, 1.0)))
                r1 = await p.fetch_metrics()
                t1 = time.monotonic()
            finally:
                # release the provider even when one of the fetches raised
                await p.aclose()
            return info, r0, r1, t1 - t0

        info, r0, r1, elapsed = asyncio.run(_go())
        if not r1.fetched_ok:
            print(json.dumps({"error": r1.error}), file=sys.stderr)
            return 1
        md = load_model_dims(info.root, info.max_model_len)
        eng = MetricsEngine(dims=md.dims, max_model_len=md.max_model_len)
        # Only use the first scrape as a baseline when it actually succeeded; an empty
        # baseline would make every counter look like it jumped from zero.
        if r0.fetched_ok and r0.text:
            eng.derive(parse_metrics(r0.text), now=0.0)
        # Use the measured gap (kept strictly positive) rather than a fixed 1.0 s.
        snap = eng.derive(parse_metrics(r1.text), now=max(elapsed, 1e-3))
    print(json.dumps(snapshot_to_dict(snap), default=str))
    return 0
47
+
48
+
49
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse the configuration and dispatch to JSON or dashboard mode.

    Args:
        argv: Arguments to parse; ``sys.argv[1:]`` is used when this is ``None``.

    Returns:
        The exit code of whichever mode ran.
    """
    if argv is None:
        args = list(sys.argv[1:])
    else:
        args = list(argv)
    cfg = Config.from_sources(args, dict(os.environ))
    if not (cfg.once and cfg.json):
        from vllmstat.app import run_app  # imported lazily so --once/--json needs no Textual

        return run_app(cfg)
    return run_once_json(args)
vllmstat/config.py ADDED
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from dataclasses import dataclass
5
+
6
+ from vllmstat import __version__
7
+
8
+
9
@dataclass
class Config:
    """Runtime settings for vllmstat, built from command-line flags and the environment."""

    url: str = "http://localhost:8000"  # base URL of the vLLM server
    metrics_path: str = "/metrics"  # path of the metrics endpoint
    interval: float = 1.0  # refresh interval in seconds; must be finite and > 0
    api_key: str | None = None  # --api-key, falling back to $VLLM_API_KEY
    gpu: bool = True  # cleared by --no-gpu
    mock: bool = False
    once: bool = False
    json: bool = False

    @staticmethod
    def _positive_float(raw: str) -> float:
        """argparse ``type=`` callable: parse *raw* as a finite float greater than zero.

        Raises:
            argparse.ArgumentTypeError: if *raw* is not a number, or is zero,
                negative, NaN or infinite.
        """
        try:
            value = float(raw)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid float value: {raw!r}") from None
        # NaN fails both comparisons, so it is rejected here as well.
        if not 0.0 < value < float("inf"):
            raise argparse.ArgumentTypeError(f"must be a finite number > 0, got {raw!r}")
        return value

    @staticmethod
    def build_parser() -> argparse.ArgumentParser:
        """Return the argument parser; defaults are taken from the dataclass fields."""
        p = argparse.ArgumentParser(prog="vllmstat", description="nvtop for vLLM")
        p.add_argument("-u", "--url", default=Config.url)
        p.add_argument("--metrics-path", default=Config.metrics_path)
        # A zero or negative interval would make the refresh timer meaningless, so it is
        # rejected at parse time.
        p.add_argument(
            "-i", "--interval", type=Config._positive_float, default=Config.interval
        )
        p.add_argument("--api-key", default=Config.api_key)
        p.add_argument("--no-gpu", dest="gpu", action="store_false", default=Config.gpu)
        p.add_argument("--mock", action="store_true", default=Config.mock)
        p.add_argument("--once", action="store_true", default=Config.once)
        p.add_argument("--json", action="store_true", default=Config.json)
        p.add_argument("--version", action="version", version=f"vllmstat {__version__}")
        return p

    @classmethod
    def from_sources(cls, argv: list[str], env: dict[str, str]) -> Config:
        """Build a ``Config`` from *argv*, using ``env["VLLM_API_KEY"]`` when no key is given.

        Args:
            argv: Command-line arguments (without the program name).
            env: Environment mapping consulted for ``VLLM_API_KEY``.
        """
        ns = cls.build_parser().parse_args(argv)
        api_key = ns.api_key or env.get("VLLM_API_KEY")
        return cls(
            url=ns.url,
            metrics_path=ns.metrics_path,
            interval=ns.interval,
            api_key=api_key,
            gpu=ns.gpu,
            mock=ns.mock,
            once=ns.once,
            json=ns.json,
        )
File without changes
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+
4
def histogram_quantile(buckets: list[tuple[float, float]], q: float) -> float | None:
    """Estimate quantile *q* from cumulative histogram buckets, Prometheus style.

    Args:
        buckets: ``(le, cumulative_count)`` pairs sorted by ascending ``le``.
        q: Requested quantile as a fraction of the total count.

    Returns:
        The linearly interpolated value, or ``None`` when there are no buckets or
        the total count is not positive. When the quantile lands in the ``+Inf``
        bucket the upper bound of the preceding bucket is returned.
    """
    if not buckets:
        return None
    total = buckets[-1][1]
    if total <= 0:
        return None

    rank = q * total
    # index of the first bucket whose cumulative count reaches the target rank
    hit = next((i for i, (_, cum) in enumerate(buckets) if cum >= rank), None)
    if hit is None:
        return buckets[-1][0]

    upper, cumulative = buckets[hit]
    # the bucket below supplies the lower edge; the first bucket starts at (0, 0)
    lower, below = buckets[hit - 1] if hit else (0.0, 0.0)

    if upper == float("inf"):
        return lower
    in_bucket = cumulative - below
    if in_bucket <= 0:
        return upper
    frac = (rank - below) / in_bucket
    return lower + frac * (upper - lower)
25
+
26
+
27
def windowed_buckets(
    prev: list[tuple[float, float]], cur: list[tuple[float, float]]
) -> list[tuple[float, float]]:
    """Return per-``le`` differences ``cur - prev`` for two cumulative bucket lists.

    A bound missing from *prev* counts as 0 there. If any difference is negative
    (the counter was reset), *cur* itself is returned unchanged.
    """
    baseline = dict(prev)
    deltas = [(le, count - baseline.get(le, 0.0)) for le, count in cur]
    if any(diff < 0 for _, diff in deltas):
        return cur
    return deltas
@@ -0,0 +1,25 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import deque
4
+
5
+
6
class Series:
    """Bounded sequence of samples; once full, the oldest value is dropped on append."""

    def __init__(self, maxlen: int = 120) -> None:
        # a deque with maxlen discards from the left once it is full
        ring: deque[float] = deque(maxlen=maxlen)
        self.values = ring

    def push(self, v: float) -> None:
        """Append *v* as the newest sample."""
        self.values.append(v)
12
+
13
+
14
class History:
    """Named collection of ``Series`` created on demand, all sharing one maximum length."""

    def __init__(self, maxlen: int = 120) -> None:
        self._series: dict[str, Series] = {}
        self._maxlen = maxlen

    def series(self, name: str) -> Series:
        """Return the series registered under *name*, creating it on first access."""
        existing = self._series.get(name)
        if existing is None:
            existing = Series(self._maxlen)
            self._series[name] = existing
        return existing

    def push(self, name: str, value: float) -> None:
        """Append *value* to the series called *name*."""
        target = self.series(name)
        target.push(value)
vllmstat/core/kv.py ADDED
@@ -0,0 +1,86 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+
6
# KV-cache dtype names that are treated as 16 bits for both keys and values.
_FP16 = {"auto", "fp16", "float16", "bf16", "bfloat16", "half"}


def parse_kv_bits(dtype: str | None) -> tuple[int, int] | None:
    """Map a KV-cache dtype name to ``(key_bits, value_bits)``.

    Recognised forms (case-insensitive, surrounding whitespace ignored):
    the names in ``_FP16`` -> ``(16, 16)``; anything containing ``fp8``, ``int8``,
    ``e4m3`` or ``e5m2`` -> ``(8, 8)``; a ``k<N>v<M>`` pattern -> ``(N, M)``.

    Returns:
        The bit widths, or ``None`` when *dtype* is empty, unrecognised, or
        declares a non-positive width (which would make ratio math divide by zero).
    """
    if not dtype:
        return None
    d = dtype.strip().lower()
    if d in _FP16:
        return (16, 16)
    if any(tag in d for tag in ("fp8", "int8", "e4m3", "e5m2")):
        return (8, 8)
    m = re.search(r"k(\d+)v(\d+)", d)
    if m:
        key_bits, value_bits = int(m.group(1)), int(m.group(2))
        # A zero-bit cache is meaningless; treat it as unknown instead of passing on
        # widths that would lead to a ZeroDivisionError in callers.
        if key_bits > 0 and value_bits > 0:
            return (key_bits, value_bits)
    return None
21
+
22
+
23
def nominal_ratio(dtype: str | None) -> float | None:
    """Return the 16-bit-to-*dtype* size ratio for keys plus values, or ``None``.

    The ratio is ``32 / (key_bits + value_bits)`` using the widths reported by
    ``parse_kv_bits``; ``None`` is returned when those widths are unknown.
    """
    widths = parse_kv_bits(dtype)
    if widths:
        key_bits, value_bits = widths
        return (16 + 16) / (key_bits + value_bits)
    return None
29
+
30
+
31
def fp16_bytes_per_token(layers: int, kv_heads: int, head_dim: int) -> int:
    """Return ``2 * layers * kv_heads * head_dim * 2``.

    The two factors of 2 presumably stand for the key/value pair and for the
    two bytes of an fp16 element — confirm against the callers' expectations.
    """
    elements = 2 * layers * kv_heads * head_dim
    return elements * 2
33
+
34
+
35
@dataclass
class KvInfo:
    """Result record produced by ``compute_kv``; every numeric field may be unknown (None)."""

    # cache dtype string exactly as passed to compute_kv
    dtype: str | None
    # num_gpu_blocks * block_size, when both are known and non-zero
    capacity_tokens: int | None
    # round(capacity_tokens * kv_usage), when capacity is known
    used_tokens: int | None
    # ratio derived either from measured memory or from the dtype's bit widths
    ratio: float | None
    ratio_kind: str  # "achieved" | "nominal" | "none"
    # round(capacity_tokens / ratio), when both are known and non-zero
    fp16_equiv_tokens: int | None
    # max_model_len * fp16 bytes-per-token / 1e9, when both are known
    fp16_full_ctx_gb: float | None
44
+
45
+
46
def compute_kv(
    *,
    cache_dtype: str | None,
    num_gpu_blocks: int | None,
    block_size: int | None,
    kv_usage: float,
    kv_cache_memory_bytes: int | None,
    dims: dict[str, int] | None,
    max_model_len: int | None,
) -> KvInfo:
    """Derive KV-cache capacity, usage and fp16-relative ratio figures.

    Args:
        cache_dtype: KV-cache dtype name, used for the nominal ratio fallback.
        num_gpu_blocks: Number of cache blocks, if known.
        block_size: Tokens per block, if known.
        kv_usage: Used fraction of the cache; multiplied with the capacity.
        kv_cache_memory_bytes: Bytes reserved for the cache, if known.
        dims: Model dimensions; needs ``layers``, ``kv_heads`` and ``head_dim``
            for the byte-per-token figure.
        max_model_len: Maximum context length, if known.

    Returns:
        A ``KvInfo`` whose fields are ``None`` wherever an input was missing.
        ``ratio_kind`` is "achieved" when the ratio comes from capacity, model
        dims and reserved bytes, "nominal" when it comes from the dtype alone,
        and "none" otherwise.
    """
    have_geometry = bool(num_gpu_blocks and block_size)
    capacity = num_gpu_blocks * block_size if have_geometry else None
    used = None if capacity is None else round(capacity * kv_usage)

    # fp16 bytes per token need all three model dimensions
    bytes_per_token = None
    if dims and {"layers", "kv_heads", "head_dim"} <= dims.keys():
        bytes_per_token = fp16_bytes_per_token(
            dims["layers"], dims["kv_heads"], dims["head_dim"]
        )

    if capacity and bytes_per_token and kv_cache_memory_bytes:
        # fp16 bytes the capacity would need, relative to the bytes actually reserved
        ratio: float | None = (capacity * bytes_per_token) / kv_cache_memory_bytes
        kind = "achieved"
    else:
        ratio = nominal_ratio(cache_dtype)
        kind = "none" if ratio is None else "nominal"

    fp16_equiv = None
    if capacity and ratio:
        fp16_equiv = round(capacity / ratio)

    fp16_full_ctx_gb = None
    if max_model_len and bytes_per_token:
        fp16_full_ctx_gb = max_model_len * bytes_per_token / 1e9

    return KvInfo(
        dtype=cache_dtype,
        capacity_tokens=capacity,
        used_tokens=used,
        ratio=ratio,
        ratio_kind=kind,
        fp16_equiv_tokens=fp16_equiv,
        fp16_full_ctx_gb=fp16_full_ctx_gb,
    )
@@ -0,0 +1,181 @@
1
+ from __future__ import annotations
2
+
3
+ from vllmstat.core.histogram import histogram_quantile, windowed_buckets
4
+ from vllmstat.core.kv import compute_kv
5
+ from vllmstat.core.parse import (
6
+ Families,
7
+ first_value,
8
+ get_buckets,
9
+ info_labels,
10
+ sum_value,
11
+ )
12
+ from vllmstat.core.rates import Rate
13
+ from vllmstat.core.state import Quantiles, Snapshot
14
+
15
# Short latency keys -> base names of the Prometheus histograms whose "<base>_bucket"
# samples MetricsEngine turns into quantiles.
_LAT = {
    "ttft": "vllm:time_to_first_token_seconds",
    "tpot": "vllm:request_time_per_output_token_seconds",
    "e2e": "vllm:e2e_request_latency_seconds",
    "queue": "vllm:request_queue_time_seconds",
}
21
+
22
+
23
def _int(s: str | None) -> int | None:
    """Convert a label value to ``int``; ``None``, "None", "" and unparsable text give ``None``."""
    try:
        if s in (None, "None", ""):
            return None
        return int(s)
    except (TypeError, ValueError):
        return None
28
+
29
+
30
class MetricsEngine:
    """Turns successive parsed metric scrapes into ``Snapshot`` objects.

    The engine keeps one ``Rate`` per counter it tracks plus the previous scrape,
    so that ``derive`` can report per-second rates and windowed (since the last
    scrape) histogram quantiles and cache hit rates.
    """

    def __init__(
        self,
        *,
        alpha: float = 0.3,
        dims: dict[str, int] | None = None,
        max_model_len: int | None = None,
    ) -> None:
        """Create an engine.

        Args:
            alpha: Passed unchanged to every ``Rate`` (presumably a smoothing
                factor — confirm against ``Rate``).
            dims: Model dimensions forwarded to ``compute_kv``.
            max_model_len: Maximum context length forwarded to ``compute_kv``.
        """
        self.dims = dims
        self.max_model_len = max_model_len
        self._gen = Rate(alpha)
        self._prompt = Rate(alpha)
        self._req = Rate(alpha)
        self._preempt = Rate(alpha)
        self._flops = Rate(alpha)
        self._rbytes = Rate(alpha)
        self._wbytes = Rate(alpha)
        self._prev: Families | None = None  # previous scrape, set at the end of derive()

    def _quantiles(self, fam: Families, base: str) -> Quantiles:
        """Return p50/p90/p99 for histogram *base*.

        Uses the bucket deltas since the previous scrape when that window saw at
        least one observation; otherwise falls back to the cumulative buckets.
        """
        cur = get_buckets(fam, base)
        if not cur:
            return Quantiles()
        buckets = cur
        if self._prev is not None:
            prev = get_buckets(self._prev, base)
            if prev:
                delta = windowed_buckets(prev, cur)
                # the last bucket holds the total count of the window
                if delta and delta[-1][1] > 0:
                    buckets = delta
        return Quantiles(
            p50=histogram_quantile(buckets, 0.50),
            p90=histogram_quantile(buckets, 0.90),
            p99=histogram_quantile(buckets, 0.99),
        )

    def derive(self, fam: Families, now: float) -> Snapshot:
        """Build a ``Snapshot`` from scrape *fam* taken at time *now*.

        Updates the internal rates and remembers *fam* as the previous scrape.

        Args:
            fam: Parsed metric families of the current scrape.
            now: Timestamp of the scrape, in the same clock as earlier calls.
        """
        labels = info_labels(fam, "vllm:cache_config_info")
        running_rows = fam.get("vllm:num_requests_running", [])
        model_names = sorted({lbl.get("model_name", "") for lbl, _ in running_rows} - {""})
        engines = {lbl.get("engine") for lbl, _ in running_rows}

        # throughput rates
        gen = self._gen.update(sum_value(fam, "vllm:generation_tokens_total") or 0.0, now)
        prompt = self._prompt.update(sum_value(fam, "vllm:prompt_tokens_total") or 0.0, now)
        req = self._req.update(sum_value(fam, "vllm:request_success_total") or 0.0, now)
        preempt = self._preempt.update(sum_value(fam, "vllm:num_preemptions_total") or 0.0, now)

        # tokens/iter mean
        it_sum = sum_value(fam, "vllm:iteration_tokens_total_sum")
        it_cnt = sum_value(fam, "vllm:iteration_tokens_total_count")
        tokens_per_iter = (it_sum / it_cnt) if (it_sum and it_cnt) else None

        # cache reuse: lifetime ratio plus the ratio over the window since the last scrape
        q = sum_value(fam, "vllm:prefix_cache_queries_total") or 0.0
        h = sum_value(fam, "vllm:prefix_cache_hits_total") or 0.0
        hit_life = (h / q) if q > 0 else None
        hit_win = None
        if self._prev is not None:
            pq = sum_value(self._prev, "vllm:prefix_cache_queries_total") or 0.0
            ph = sum_value(self._prev, "vllm:prefix_cache_hits_total") or 0.0
            dq, dh = q - pq, h - ph
            if dq > 0:
                hit_win = max(0.0, min(1.0, dh / dq))

        # token sources: sum per `source` across all label sets (engines / models), in
        # line with how every other counter is aggregated via sum_value(); a plain
        # dict comprehension would keep only the last sample of each source.
        src: dict[str | None, float] = {}
        for lbl, v in fam.get("vllm:prompt_tokens_by_source_total", []):
            source = lbl.get("source")
            src[source] = src.get(source, 0.0) + v
        src_total = sum(src.values()) or 0.0

        def frac(k: str) -> float | None:
            """Share of prompt tokens attributed to source *k*, or None without data."""
            return (src.get(k, 0.0) / src_total) if src_total > 0 else None

        ext_q = sum_value(fam, "vllm:external_prefix_cache_queries_total") or 0.0
        external_active = ext_q > 0 or (src.get("external_kv_transfer", 0.0) > 0)

        # kv memory
        kv_usage = first_value(fam, "vllm:kv_cache_usage_perc") or 0.0
        kv = compute_kv(
            cache_dtype=labels.get("cache_dtype"),
            num_gpu_blocks=_int(labels.get("num_gpu_blocks")),
            block_size=_int(labels.get("block_size")),
            kv_usage=kv_usage,
            kv_cache_memory_bytes=_int(labels.get("kv_cache_memory_bytes")),
            dims=self.dims,
            max_model_len=self.max_model_len,
        )

        # spec decode
        drafts = sum_value(fam, "vllm:spec_decode_num_drafts_total")
        draft_tokens = sum_value(fam, "vllm:spec_decode_num_draft_tokens_total")
        accepted = sum_value(fam, "vllm:spec_decode_num_accepted_tokens_total")
        spec_active = bool(drafts and draft_tokens)
        spec_acceptance = (
            (accepted / draft_tokens)
            if (spec_active and accepted is not None and draft_tokens is not None)
            else None
        )
        spec_per_draft = (
            (accepted / drafts) if (spec_active and drafts and accepted is not None) else None
        )

        # efficiency (conditional)
        flops = self._flops.update(sum_value(fam, "vllm:estimated_flops_per_gpu_total") or 0.0, now)
        rbytes = self._rbytes.update(
            sum_value(fam, "vllm:estimated_read_bytes_per_gpu_total") or 0.0, now
        )
        wbytes = self._wbytes.update(
            sum_value(fam, "vllm:estimated_write_bytes_per_gpu_total") or 0.0, now
        )
        eff_active = (flops > 0) or (rbytes + wbytes > 0)

        snap = Snapshot(
            ts=now,
            connected=True,
            # fall back to the cache-config info labels when no running gauge names a model
            model_names=model_names or ([mn] if (mn := labels.get("model_name")) else []),
            engine_count=len([e for e in engines if e is not None]) or 1,
            running=sum_value(fam, "vllm:num_requests_running") or 0.0,
            waiting=sum_value(fam, "vllm:num_requests_waiting") or 0.0,
            preempt_rate=preempt,
            gen_tps=gen,
            prompt_tps=prompt,
            req_rate=req,
            tokens_per_iter=tokens_per_iter,
            prefix_hit_window=hit_win,
            prefix_hit_lifetime=hit_life,
            src_compute=frac("local_compute"),
            src_cache_hit=frac("local_cache_hit"),
            src_external=frac("external_kv_transfer"),
            cached_tokens_total=sum_value(fam, "vllm:prompt_tokens_cached_total") or 0.0,
            recomputed_tokens_total=sum_value(fam, "vllm:prompt_tokens_recomputed_total") or 0.0,
            external_kv_active=external_active,
            kv_usage=kv_usage,
            kv_capacity_tokens=kv.capacity_tokens,
            kv_used_tokens=kv.used_tokens,
            kv_dtype=kv.dtype,
            kv_ratio=kv.ratio,
            kv_ratio_kind=kv.ratio_kind,
            kv_fp16_equiv_tokens=kv.fp16_equiv_tokens,
            kv_fp16_full_ctx_gb=kv.fp16_full_ctx_gb,
            ttft=self._quantiles(fam, _LAT["ttft"]),
            tpot=self._quantiles(fam, _LAT["tpot"]),
            e2e=self._quantiles(fam, _LAT["e2e"]),
            queue=self._quantiles(fam, _LAT["queue"]),
            spec_active=spec_active,
            spec_acceptance=spec_acceptance,
            spec_accepted_per_draft=spec_per_draft,
            eff_active=eff_active,
            gflops=(flops / 1e9) if eff_active else None,
            gbps=((rbytes + wbytes) / 1e9) if eff_active else None,
        )
        # must happen after the _quantiles() calls above, which compare against the old value
        self._prev = fam
        return snap
vllmstat/core/parse.py ADDED
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from prometheus_client.parser import text_string_to_metric_families
4
+
5
# Parsed exposition data: sample name -> list of (label dict, sample value) rows.
Families = dict[str, list[tuple[dict[str, str], float]]]
6
+
7
+
8
def parse_metrics(text: str) -> Families:
    """Parse Prometheus exposition text into {sample_name: [(labels, value), ...]}.

    Samples are grouped by their own name (not the family name), in the order
    the parser yields them.
    """
    grouped: Families = {}
    samples = (
        sample
        for family in text_string_to_metric_families(text)
        for sample in family.samples
    )
    for sample in samples:
        row = (dict(sample.labels), sample.value)
        grouped.setdefault(sample.name, []).append(row)
    return grouped
15
+
16
+
17
def sum_value(families: Families, name: str) -> float | None:
    """Return the sum of all sample values named *name*, or ``None`` if there are none."""
    rows = families.get(name)
    return sum(value for _labels, value in rows) if rows else None
22
+
23
+
24
def first_value(families: Families, name: str) -> float | None:
    """Return the value of the first sample named *name*, or ``None`` if there are none."""
    rows = families.get(name)
    if rows:
        _labels, value = rows[0]
        return value
    return None
29
+
30
+
31
def info_labels(families: Families, name: str) -> dict[str, str]:
    """Return the label dict of the first sample named *name*, or ``{}`` if there are none."""
    rows = families.get(name)
    if not rows:
        return {}
    first_labels, _value = rows[0]
    return first_labels
34
+
35
+
36
def get_buckets(families: Families, base: str) -> list[tuple[float, float]]:
    """Aggregate `<base>_bucket` samples across labels, summing counts per `le`.

    Returns ``(le, count)`` pairs sorted by ascending ``le``; an empty list when
    the histogram has no bucket samples.
    """
    totals: dict[float, float] = {}
    for labels, value in families.get(base + "_bucket", []):
        bound = float(labels["le"])
        totals.setdefault(bound, 0.0)
        totals[bound] += value
    return sorted(totals.items())
44
+
45
+
46
def hist_count(families: Families, base: str) -> float | None:
    """Return the summed `<base>_count` samples, or ``None`` when there are none."""
    count_name = base + "_count"
    return sum_value(families, count_name)
48
+
49
+
50
def hist_sum(families: Families, base: str) -> float | None:
    """Return the summed `<base>_sum` samples, or ``None`` when there are none."""
    sum_name = base + "_sum"
    return sum_value(families, sum_name)