benchmaker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
benchmaker/monitors.py ADDED
@@ -0,0 +1,228 @@
1
+ """Optional periodic monitors.
2
+
3
+ A `Monitor` runs alongside the benchmark and samples something external every
4
+ `interval_s`. Each tick returns a flat `dict[str, float]` of values; the runner
5
+ records them as a time-series and the aggregator summarizes them in the final
6
+ report.
7
+
8
+ Typical use cases:
9
+ * scrape vLLM / SGLang `/metrics` (Prometheus) for queue depth, KV-cache
10
+ utilization, throughput, etc.
11
+ * sample GPU utilization (`pynvml`)
12
+ * pull a Slurm/k8s queue depth
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import time
19
+ from abc import ABC, abstractmethod
20
+ from typing import Any, Awaitable, Callable, Optional, Union
21
+
22
+ import aiohttp
23
+
24
+
25
class Monitor(ABC):
    """Base class for periodic side-channel samplers.

    A concrete monitor overrides `tick()`; the other two hooks are optional
    no-ops. `tick()` yields a flat `{metric_name: float}` mapping, or `None`
    when there is nothing to record for that tick.
    """

    # Label for this monitor.
    name: str = "monitor"
    # Seconds between two consecutive ticks.
    interval_s: float = 1.0
    # Whether one immediate tick is fired at t=0.
    tick_at_start: bool = True

    async def setup(self) -> None:
        """Hook invoked once before the first tick (open sessions, etc.)."""
        return None

    @abstractmethod
    async def tick(self) -> Optional[dict[str, float]]:
        """Produce a single observation; invoked every `interval_s` seconds."""

    async def aclose(self) -> None:
        """Hook invoked once after the last tick (cleanup)."""
        return None
45
+
46
+
47
class FunctionMonitor(Monitor):
    """Adapter that turns a zero-argument callable into a `Monitor`.

    The callable may be synchronous or asynchronous and should produce a
    metrics dict (or `None`). `tick()` itself does not catch exceptions: an
    error raised by the callable propagates to whoever awaits `tick()`.
    """

    def __init__(
        self,
        fn: Callable[[], Union[Optional[dict[str, float]], Awaitable[Optional[dict[str, float]]]]],
        name: str = "fn",
        interval_s: float = 1.0,
        tick_at_start: bool = True,
    ):
        self.tick_at_start = tick_at_start
        self.interval_s = interval_s
        self.name = name
        self._fn = fn

    async def tick(self) -> Optional[dict[str, float]]:
        """Invoke the wrapped callable, awaiting its result when awaitable."""
        outcome = self._fn()
        if not hasattr(outcome, "__await__"):
            # Plain (synchronous) return value.
            return outcome  # type: ignore[return-value]
        return await outcome  # type: ignore[misc]
71
+
72
+
73
class PrometheusMonitor(Monitor):
    """Scrape a Prometheus `/metrics` endpoint each tick.

    Args:
        url: full URL to the metrics endpoint.
        metric_names: optional set of metric names (without labels) to keep.
            If `None` (or empty), all metrics are recorded.
        labelled_keys: if True (default), series with labels are stored as
            `name{label="value"}`. If False, label info is dropped and series
            with identical names are summed.
        headers: HTTP headers (e.g. Authorization) sent with every scrape.
        interval_s: scrape interval in seconds.
        name: label for this monitor.
        tick_at_start: whether to scrape once immediately at t=0.
        timeout_s: total timeout for one scrape request, in seconds.
    """

    def __init__(
        self,
        url: str,
        metric_names: Optional[set[str]] = None,
        labelled_keys: bool = True,
        headers: Optional[dict[str, str]] = None,
        interval_s: float = 1.0,
        name: str = "prometheus",
        tick_at_start: bool = True,
        timeout_s: float = 5.0,
    ):
        self._url = url
        # Copy so later mutation of the caller's set does not affect us.
        self._names = set(metric_names) if metric_names else None
        self._labelled = labelled_keys
        self._headers = headers or {}
        self.interval_s = interval_s
        self.name = name
        self.tick_at_start = tick_at_start
        self._timeout = aiohttp.ClientTimeout(total=timeout_s)
        self._session: Optional[aiohttp.ClientSession] = None

    async def setup(self) -> None:
        """Open the HTTP session (no-op when one is already open)."""
        # Guard against a second call leaking the first session.
        if self._session is None:
            self._session = aiohttp.ClientSession(timeout=self._timeout)

    async def tick(self) -> Optional[dict[str, float]]:
        """Scrape the endpoint once.

        Returns the parsed metrics, or `None` when the request fails, times
        out, or answers with an HTTP status >= 400.
        """
        if self._session is None:
            # Previously a bare `assert`, which is stripped under `python -O`;
            # open the session lazily instead of failing on `None.get`.
            await self.setup()
        session = self._session
        try:
            async with session.get(self._url, headers=self._headers) as r:
                if r.status >= 400:
                    return None
                text = await r.text()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return None
        return parse_prometheus(text, names=self._names, labelled_keys=self._labelled)

    async def aclose(self) -> None:
        """Close the HTTP session if one is open."""
        if self._session is not None:
            await self._session.close()
            self._session = None
126
+
127
+
128
def _label_block_end(line: str, start: int) -> int:
    """Return the index of the `}` closing the label block opened at `start`.

    Braces inside double-quoted label values (including after an escaped
    quote) are ignored. Returns -1 when the block is never closed.
    """
    in_quotes = False
    escaped = False
    for idx, ch in enumerate(line[start:], start):
        if escaped:
            # Character following a backslash inside a quoted value.
            escaped = False
        elif in_quotes and ch == "\\":
            escaped = True
        elif ch == '"':
            in_quotes = not in_quotes
        elif ch == "}" and not in_quotes:
            return idx
    return -1


def parse_prometheus(
    text: str,
    names: Optional[set[str]] = None,
    labelled_keys: bool = True,
) -> dict[str, float]:
    """Minimal Prometheus text-format parser.

    Skips `# HELP` / `# TYPE` lines, comments, and malformed lines. Handles
    name + optional `{labels}` + value (anything after the value, such as a
    timestamp, is ignored).

    Args:
        text: the raw exposition text.
        names: if given, only metrics whose bare name (without labels) is in
            this set are kept.
        labelled_keys: if True, keys are `name{labels}` exactly as they appear
            in the input; if False, keys are bare names and series sharing a
            name are summed.

    Returns:
        Mapping from series key to its float value.
    """
    out: dict[str, float] = {}
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        brace = line.find("{")
        if brace != -1:
            # Label values may contain spaces and braces inside quotes, so
            # locate the closing brace with a quote-aware scan rather than
            # taking the first `}` in the line.
            close = _label_block_end(line, brace)
            if close == -1:
                continue
            name_part = line[: close + 1]
            rest = line[close + 1:]
        else:
            # Name and value may be separated by spaces and/or tabs.
            pieces = line.split(None, 1)
            if len(pieces) < 2:
                continue
            name_part, rest = pieces

        fields = rest.split()
        if not fields:
            continue
        try:
            value = float(fields[0])
        except ValueError:
            continue

        bare = name_part.split("{", 1)[0]
        if names is not None and bare not in names:
            continue

        if labelled_keys:
            out[name_part] = value
        else:
            # Labels dropped: identically named series are accumulated.
            out[bare] = out.get(bare, 0.0) + value if bare in out else value
    return out
179
+
180
+
181
async def run_monitor_loop(
    monitor: Monitor,
    samples: list[tuple[float, dict[str, float]]],
    start_mono: float,
    stop_event: asyncio.Event,
) -> None:
    """Run one monitor until `stop_event` fires.

    Each successful tick is delegated to `_safe_tick`, which appends to
    `samples`. A failing `setup()` aborts the loop after writing a line to
    stderr; a failing `aclose()` is likewise reported on stderr and swallowed.
    """
    import sys

    try:
        await monitor.setup()
    except Exception as e:
        sys.stderr.write(f"[monitor:{monitor.name}] setup failed: {e}\n")
        return

    try:
        if monitor.tick_at_start:
            await _safe_tick(monitor, samples, start_mono)

        while not stop_event.is_set():
            # Sleep for one interval, waking early if stop is signalled.
            try:
                await asyncio.wait_for(stop_event.wait(), timeout=monitor.interval_s)
            except asyncio.TimeoutError:
                stop_requested = False
            else:
                stop_requested = True

            # Regular tick on timeout; one final tick when stop was signalled.
            await _safe_tick(monitor, samples, start_mono)
            if stop_requested:
                break
    finally:
        try:
            await monitor.aclose()
        except Exception as e:
            sys.stderr.write(f"[monitor:{monitor.name}] aclose failed: {e}\n")
217
+
218
+
219
+ async def _safe_tick(monitor: Monitor, samples: list, start_mono: float) -> None:
220
+ import sys
221
+ try:
222
+ values = await monitor.tick()
223
+ except Exception as e:
224
+ sys.stderr.write(f"[monitor:{monitor.name}] tick error: {e}\n")
225
+ return
226
+ if not values:
227
+ return
228
+ samples.append((time.monotonic() - start_mono, dict(values)))
benchmaker/runner.py ADDED
@@ -0,0 +1,275 @@
1
+ """BenchRunner: ties scheduler -> workload (dataset) -> workload-type ->
2
+ aiohttp session -> metrics."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import asyncio
7
+ import logging
8
+ import time
9
+ from dataclasses import dataclass, field
10
+ from typing import Any, Optional
11
+
12
+ import aiohttp
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ from benchmaker.load import LoadModel
17
+ from benchmaker.metrics import MetricsAggregator
18
+ from benchmaker.types import (
19
+ PostResponseHook,
20
+ PreRequestHook,
21
+ Request,
22
+ Response,
23
+ Sample,
24
+ TicketContext,
25
+ maybe_await,
26
+ )
27
+ from benchmaker.monitors import Monitor, run_monitor_loop
28
+ from benchmaker.trace import TraceRecorder
29
+ from benchmaker.workloads.base import WorkloadType
30
+ from benchmaker.workloads.datasets import Workload, StaticWorkload
31
+
32
+
33
@dataclass
class BenchConfig:
    """Everything a `BenchRunner` needs to execute one benchmark run."""

    # How to talk to the service.
    workload_type: WorkloadType
    # When to fire.
    load: LoadModel
    # What to send.
    workload: Workload = field(default_factory=StaticWorkload)
    # Hooks applied around each request.
    pre_hooks: list[PreRequestHook] = field(default_factory=list)
    post_hooks: list[PostResponseHook] = field(default_factory=list)
    # Optional periodic samplers running alongside the benchmark.
    monitors: list[Monitor] = field(default_factory=list)
    # Optional trace recorder. When set, every fired request is appended to a
    # JSONL file (with a relative timestamp) so that a later run can replay
    # the bench deterministically via `benchmaker.trace.TracePacedLoad` +
    # `ReplayWorkloadType`.
    recorder: Optional[TraceRecorder] = None
    # Limit handed to the aiohttp TCP connector.
    connection_limit: int = 1000
    # Total per-request timeout of the shared client session, in seconds.
    timeout_s: float = 60.0
    # Upper bound on concurrently running request tasks.
    max_in_flight: int = 10000
    # Seconds between progress log lines; values <= 0 disable them.
    progress_every_s: float = 1.0
    # Stop the run when the workload has no more items (otherwise skip the
    # ticket and keep going).
    stop_on_exhausted: bool = True
50
+
51
+
52
@dataclass()
class BenchResult:
    """Outcome of a benchmark run: the raw samples plus their summary."""

    # Per-request samples collected during the run.
    samples: list[Sample]
    # Aggregated summary of the run.
    summary: dict
56
+
57
+
58
class BenchRunner:
    """Execute one benchmark run described by a `BenchConfig`.

    The runner pulls tickets from the load model, takes one workload item per
    ticket, fires it through the configured workload type over a shared
    aiohttp session, and feeds the resulting samples into a
    `MetricsAggregator`. Configured monitors run alongside the requests.
    """

    def __init__(self, config: BenchConfig):
        self.cfg = config
        self.metrics = MetricsAggregator()

    async def run(self) -> BenchResult:
        """Run the benchmark to completion and return samples + summary.

        The workload, workload type and recorder are closed even when the
        run fails; a failure in one cleanup step no longer skips the
        following ones.
        """
        connector = aiohttp.TCPConnector(
            limit=self.cfg.connection_limit,
            ttl_dns_cache=300,
            force_close=False,
        )
        timeout = aiohttp.ClientTimeout(total=self.cfg.timeout_s)
        recorder = self.cfg.recorder
        if recorder is not None:
            recorder.open(start_mono=self.metrics.start_time)
        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            try:
                await self._drive(session)
            finally:
                # Chain the cleanup steps so each one runs regardless of
                # whether an earlier one raised.
                try:
                    await self.cfg.workload.aclose()
                finally:
                    try:
                        await self.cfg.workload_type.aclose()
                    finally:
                        if recorder is not None:
                            recorder.close()
        self.metrics.finalize()
        return BenchResult(samples=self.metrics.samples, summary=self.metrics.summary())

    async def _drive(self, session: aiohttp.ClientSession) -> None:
        """Fire one request task per ticket, then drain tasks and monitors.

        At most `max_in_flight` request tasks run at once. If the ticket loop
        is interrupted by an exception or cancellation, outstanding request
        tasks are cancelled; in every case the request tasks and the monitor
        loops are awaited before this coroutine exits, so nothing keeps
        running against a session that is about to be closed.
        """
        sem = asyncio.Semaphore(self.cfg.max_in_flight)
        tasks: set[asyncio.Task] = set()
        progress_task = asyncio.create_task(self._progress_loop())

        # Spawn monitor loops.
        monitor_stop = asyncio.Event()
        monitor_tasks: list[asyncio.Task] = []
        bench_start = self.metrics.start_time
        for mon in self.cfg.monitors:
            buf = self.metrics.monitor_buffer(mon.name)
            monitor_tasks.append(asyncio.create_task(
                run_monitor_loop(mon, buf, bench_start, monitor_stop)
            ))

        aborted = True
        try:
            async for _ in self.cfg.load.tickets():
                try:
                    item = await self.cfg.workload.next_item()
                except StopAsyncIteration:
                    if self.cfg.stop_on_exhausted:
                        break
                    continue

                await sem.acquire()
                task = asyncio.create_task(self._fire(session, item, sem))
                tasks.add(task)
                task.add_done_callback(tasks.discard)
            # Reached on normal exhaustion and on `break`, not on an error.
            aborted = False
        finally:
            progress_task.cancel()
            try:
                await progress_task
            except (asyncio.CancelledError, Exception):
                pass

            if aborted:
                # The run is failing: do not wait for request timeouts.
                for pending in list(tasks):
                    pending.cancel()
            if tasks:
                await asyncio.gather(*tasks, return_exceptions=True)

            # Signal monitors to do one last tick and exit, then wait for them.
            monitor_stop.set()
            if monitor_tasks:
                await asyncio.gather(*monitor_tasks, return_exceptions=True)

    async def _fire(self, session: aiohttp.ClientSession, item: Any,
                    sem: asyncio.Semaphore) -> None:
        """Run one ticket through the workload type and record its sample.

        Any exception raised while running the ticket is recorded as a
        failure sample instead of propagating. The semaphore slot is always
        released and the load model is always notified of completion.
        """
        start_mono = time.monotonic()
        try:
            async def fire(req: Request) -> Response:
                # Pre-hooks may replace the request before it is sent.
                for hook in self.cfg.pre_hooks:
                    req = await maybe_await(hook(req))
                fire_start = time.monotonic()
                if self.cfg.recorder is not None:
                    await self.cfg.recorder.record(req, fire_start)
                return await self._execute(session, req, fire_start)

            ctx = TicketContext(
                item=item,
                start_mono=start_mono,
                fire=fire,
                pre_hooks=tuple(self.cfg.pre_hooks),
                post_hooks=tuple(self.cfg.post_hooks),
                workload_name=self.cfg.workload_type.name,
            )
            sample = await self.cfg.workload_type.run_ticket(ctx)
            self.metrics.add(sample)
        except Exception as e:
            self.metrics.add(_failure_sample(
                f"{type(e).__name__}: {e}",
                self.cfg.workload_type.name,
            ))
        finally:
            sem.release()
            self.cfg.load.on_complete()

    async def _execute(self, session: aiohttp.ClientSession, req: Request,
                       start_mono: float) -> Response:
        """Send `req` and convert the outcome into a `Response`.

        For streaming workload types the body is read chunk by chunk and the
        arrival time of each chunk (relative to `start_mono`) is recorded.
        Timeouts and aiohttp client errors are returned as a non-ok
        `Response` with `status=0` rather than raised.
        """
        kwargs: dict = {"headers": req.headers, "params": req.params}
        if req.json is not None:
            kwargs["json"] = req.json
        elif req.body is not None:
            kwargs["data"] = req.body
        if req.timeout_s is not None:
            kwargs["timeout"] = aiohttp.ClientTimeout(total=req.timeout_s)

        try:
            async with session.request(req.method, req.url, **kwargs) as resp:
                stream_fields: dict = {}
                if self.cfg.workload_type.streaming:
                    chunks: list[bytes] = []
                    chunk_times: list[float] = []
                    async for chunk in resp.content.iter_any():
                        chunks.append(chunk)
                        chunk_times.append(time.monotonic() - start_mono)
                    body = b"".join(chunks)
                    stream_fields = {
                        "stream_chunks": chunks,
                        "stream_chunk_times": chunk_times,
                    }
                else:
                    body = await resp.read()
                elapsed = time.monotonic() - start_mono
                return Response(
                    status=resp.status,
                    headers=dict(resp.headers),
                    body=body,
                    elapsed_s=elapsed,
                    ok=200 <= resp.status < 400,
                    **stream_fields,
                )
        except asyncio.TimeoutError:
            return Response(
                status=0, headers={}, body=b"",
                elapsed_s=time.monotonic() - start_mono,
                ok=False, error="timeout",
            )
        except aiohttp.ClientError as e:
            return Response(
                status=0, headers={}, body=b"",
                elapsed_s=time.monotonic() - start_mono,
                ok=False, error=f"{type(e).__name__}: {e}",
            )

    async def _progress_loop(self) -> None:
        """Log throughput and newly seen error kinds every `progress_every_s`.

        Returns immediately when `progress_every_s` is <= 0; otherwise runs
        until cancelled.
        """
        if self.cfg.progress_every_s <= 0:
            return
        last_n = 0
        last_t = time.monotonic()
        seen_errors: set[str] = set()
        try:
            while True:
                await asyncio.sleep(self.cfg.progress_every_s)
                now = time.monotonic()
                n = len(self.metrics.samples)
                dn = n - last_n
                dt = now - last_t
                inst = dn / dt if dt > 0 else 0.0
                window = self.metrics.samples[last_n:n]
                ok = sum(1 for s in window if s.ok)
                # Wrong: request was delivered (HTTP success) but a post-hook /
                # workload graded the output as a failure (e.g. eval gate).
                wrong = sum(1 for s in window if not s.ok and s.request_ok)
                fail = dn - ok - wrong
                logger.info(
                    "+%5d req (%7.1f rps, %d ok, %d wrong, %d fail) | total=%d",
                    dn, inst, ok, wrong, fail, n,
                )
                # Surface the first occurrence of each distinct error string —
                # one short line per kind, so failed runs are diagnosable
                # without grepping samples.jsonl.
                for s in window:
                    if s.error and s.error not in seen_errors:
                        seen_errors.add(s.error)
                        msg = s.error if len(s.error) <= 200 else s.error[:200] + "..."
                        bucket = "fail" if not s.request_ok else "wrong"
                        logger.warning("  first %s: %s", bucket, msg)
                last_n = n
                last_t = now
        except asyncio.CancelledError:
            return

    def write_bundle(self, out_dir: str, **kwargs) -> str:
        """Write a per-run directory bundle. See `benchmaker.bundle.write_bundle`."""
        from benchmaker.bundle import write_bundle
        return write_bundle(
            out_dir,
            self.metrics,
            workload_type_name=self.cfg.workload_type.name,
            workload_name=self.cfg.workload.name,
            **kwargs,
        )
260
+
261
+
262
def _failure_sample(error: str, workload: str) -> Sample:
    """Build the `Sample` recorded for a ticket that raised.

    The sample is stamped with the current monotonic time, carries zero
    latency and status, and is marked as failed at both the request and the
    result level.
    """
    fields = dict(
        start_ts=time.monotonic(),
        latency_s=0.0,
        status=0,
        ok=False,
        request_ok=False,
        error=error,
        workload=workload,
    )
    return Sample(**fields)
272
+
273
+
274
async def run_bench(config: BenchConfig) -> BenchResult:
    """Convenience wrapper: build a `BenchRunner` for `config` and run it."""
    runner = BenchRunner(config)
    result = await runner.run()
    return result