PyPI - benchmaker - Versions diffs - 0.1.0__py3-none-any.whl - Mend

benchmaker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

benchmaker/__init__.py +152 -0
benchmaker/bundle.py +193 -0
benchmaker/cli.py +382 -0
benchmaker/collect.py +178 -0
benchmaker/config.py +448 -0
benchmaker/env.py +87 -0
benchmaker/load.py +326 -0
benchmaker/metrics.py +234 -0
benchmaker/monitors.py +228 -0
benchmaker/runner.py +275 -0
benchmaker/trace.py +217 -0
benchmaker/types.py +98 -0
benchmaker/workloads/__init__.py +53 -0
benchmaker/workloads/agent.py +308 -0
benchmaker/workloads/base.py +79 -0
benchmaker/workloads/datasets.py +156 -0
benchmaker/workloads/eval.py +504 -0
benchmaker/workloads/hf.py +382 -0
benchmaker/workloads/http.py +77 -0
benchmaker/workloads/llm.py +258 -0
benchmaker/workloads/sandbox.py +470 -0
benchmaker-0.1.0.dist-info/METADATA +214 -0
benchmaker-0.1.0.dist-info/RECORD +26 -0
benchmaker-0.1.0.dist-info/WHEEL +5 -0
benchmaker-0.1.0.dist-info/entry_points.txt +2 -0
benchmaker-0.1.0.dist-info/top_level.txt +1 -0

benchmaker/monitors.py ADDED Viewed

@@ -0,0 +1,228 @@
+"""Optional periodic monitors.
+A `Monitor` runs alongside the benchmark and samples something external every
+`interval_s`. Each tick returns a flat `dict[str, float]` of values; the runner
+records them as a time-series and the aggregator summarizes them in the final
+report.
+Typical use cases:
+    * scrape vLLM / SGLang `/metrics` (Prometheus) for queue depth, KV-cache
+      utilization, throughput, etc.
+    * sample GPU utilization (`pynvml`)
+    * pull a Slurm/k8s queue depth
+"""
+from __future__ import annotations
+import asyncio
+import time
+from abc import ABC, abstractmethod
+from typing import Any, Awaitable, Callable, Optional, Union
+import aiohttp
+class Monitor(ABC):
+    """Base class for periodic side-channel samplers.
+    A concrete monitor overrides `tick()`. `setup()` and `aclose()` are
+    optional lifecycle hooks whose default implementations do nothing.
+    `tick()` yields a flat `{metric_name: float}` mapping, or `None` when
+    there is nothing to record for that tick.
+    """
+    # Label identifying this monitor.
+    name: str = "monitor"
+    # Seconds between consecutive ticks.
+    interval_s: float = 1.0
+    # Whether to fire one immediate tick at t=0.
+    tick_at_start: bool = True
+    async def setup(self) -> None:
+        """Hook invoked once ahead of the first tick (e.g. to open sessions)."""
+        return None
+    @abstractmethod
+    async def tick(self) -> Optional[dict[str, float]]:
+        """Produce one observation; invoked every `interval_s` seconds."""
+    async def aclose(self) -> None:
+        """Hook invoked once after the final tick; release resources here."""
+        return None
+class FunctionMonitor(Monitor):
+    """Adapt a zero-argument sync or async callable into a `Monitor`.
+    Each tick calls `fn()` and, when the result is awaitable, awaits it. The
+    callable should return a flat `{metric_name: float}` dict, or `None` to
+    skip the tick. This class does no error handling of its own: an exception
+    raised by the callable propagates out of `tick()`. When the monitor is
+    driven by `run_monitor_loop`, that loop writes the error to stderr and
+    keeps the benchmark running.
+    Args:
+        fn: callable taking no arguments; may be sync or async.
+        name: label identifying this monitor.
+        interval_s: seconds between ticks.
+        tick_at_start: whether to fire one immediate tick at t=0.
+    """
+    def __init__(
+        self,
+        fn: Callable[[], Union[Optional[dict[str, float]], Awaitable[Optional[dict[str, float]]]]],
+        name: str = "fn",
+        interval_s: float = 1.0,
+        tick_at_start: bool = True,
+    ):
+        self._fn = fn
+        self.name = name
+        self.interval_s = interval_s
+        self.tick_at_start = tick_at_start
+    async def tick(self) -> Optional[dict[str, float]]:
+        """Call the wrapped function once and return its (awaited) result."""
+        import inspect
+        result = self._fn()
+        # `inspect.isawaitable` covers coroutines, Futures/Tasks and
+        # generator-based coroutines, not only objects exposing `__await__`.
+        if inspect.isawaitable(result):
+            result = await result  # type: ignore[assignment]
+        return result  # type: ignore[return-value]
+class PrometheusMonitor(Monitor):
+    """Scrape a Prometheus `/metrics` endpoint each tick.
+    The HTTP session is created in `setup()` and closed in `aclose()`. A tick
+    that hits an HTTP status >= 400, a client error or a timeout yields
+    `None`, i.e. no observation for that tick.
+    Args:
+        url: full URL to the metrics endpoint.
+        metric_names: optional set of metric names (without labels) to keep.
+            If `None` or empty, all metrics are recorded.
+        labelled_keys: if True (default), series with labels are stored as
+            `name{label="value"}`. If False, label info is dropped and series
+            with identical names are summed.
+        headers: HTTP headers (e.g. Authorization).
+        interval_s: scrape interval.
+        name: label identifying this monitor.
+        tick_at_start: whether to fire one immediate tick at t=0.
+        timeout_s: total timeout, in seconds, applied to each scrape.
+    """
+    def __init__(
+        self,
+        url: str,
+        metric_names: Optional[set[str]] = None,
+        labelled_keys: bool = True,
+        headers: Optional[dict[str, str]] = None,
+        interval_s: float = 1.0,
+        name: str = "prometheus",
+        tick_at_start: bool = True,
+        timeout_s: float = 5.0,
+    ):
+        self._url = url
+        # Copy so later mutation of the caller's set cannot change the filter.
+        self._names = set(metric_names) if metric_names else None
+        self._labelled = labelled_keys
+        self._headers = headers or {}
+        self.interval_s = interval_s
+        self.name = name
+        self.tick_at_start = tick_at_start
+        self._timeout = aiohttp.ClientTimeout(total=timeout_s)
+        self._session: Optional[aiohttp.ClientSession] = None
+    async def setup(self) -> None:
+        """Open the HTTP session used by every subsequent tick."""
+        self._session = aiohttp.ClientSession(timeout=self._timeout)
+    async def tick(self) -> Optional[dict[str, float]]:
+        """Scrape the endpoint once and return the parsed metrics.
+        Raises:
+            RuntimeError: if called before `setup()` (or after `aclose()`).
+        """
+        # Explicit check instead of `assert`, which is stripped under `-O`.
+        if self._session is None:
+            raise RuntimeError("PrometheusMonitor.tick() called before setup()")
+        try:
+            async with self._session.get(self._url, headers=self._headers) as r:
+                if r.status >= 400:
+                    return None
+                text = await r.text()
+        except (aiohttp.ClientError, asyncio.TimeoutError):
+            return None
+        return parse_prometheus(text, names=self._names, labelled_keys=self._labelled)
+    async def aclose(self) -> None:
+        """Close the HTTP session if one is open; safe to call repeatedly."""
+        if self._session is not None:
+            await self._session.close()
+            self._session = None
+def parse_prometheus(
+    text: str,
+    names: Optional[set[str]] = None,
+    labelled_keys: bool = True,
+) -> dict[str, float]:
+    """Minimal Prometheus text-format parser.
+    Skips `# HELP` / `# TYPE` lines, comments, and malformed lines. Handles
+    name + optional `{labels}` + value (timestamp ignored). Label values may
+    contain spaces, `{`, `}` and backslash-escaped quotes.
+    Args:
+        text: body of a metrics response in Prometheus text format.
+        names: bare metric names (without labels) to keep; `None` keeps all.
+        labelled_keys: if True, keys are `name{labels}` and a repeated key
+            keeps the last value seen. If False, labels are dropped and series
+            sharing a name are summed.
+    Returns:
+        Mapping of series key to its float value.
+    """
+    out: dict[str, float] = {}
+    for raw_line in text.splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#"):
+            continue
+        length = len(line)
+        # The metric name runs up to the first `{` or whitespace character.
+        name_end = 0
+        while name_end < length and line[name_end] != "{" and not line[name_end].isspace():
+            name_end += 1
+        bare = line[:name_end]
+        if not bare:
+            continue
+        pos = name_end
+        while pos < length and line[pos].isspace():
+            pos += 1
+        labels = ""
+        if pos < length and line[pos] == "{":
+            # Find the brace that really closes the label block; a `}` inside
+            # a quoted label value must not end it.
+            close = _label_block_end(line, pos)
+            if close == -1:
+                continue
+            labels = line[pos: close + 1]
+            pos = close + 1
+        fields = line[pos:].split()
+        if not fields:
+            continue
+        try:
+            value = float(fields[0])
+        except ValueError:
+            continue
+        if names is not None and bare not in names:
+            continue
+        key = bare + labels if labelled_keys else bare
+        if not labelled_keys and key in out:
+            out[key] += value
+        else:
+            out[key] = value
+    return out
+def _label_block_end(line: str, open_idx: int) -> int:
+    """Return the index of the `}` closing the label block opened at `open_idx`, or -1."""
+    in_quotes = False
+    escaped = False
+    for idx in range(open_idx + 1, len(line)):
+        ch = line[idx]
+        if escaped:
+            escaped = False
+        elif in_quotes and ch == "\\":
+            escaped = True
+        elif ch == '"':
+            in_quotes = not in_quotes
+        elif ch == "}" and not in_quotes:
+            return idx
+    return -1
+async def run_monitor_loop(
+    monitor: Monitor,
+    samples: list[tuple[float, dict[str, float]]],
+    start_mono: float,
+    stop_event: asyncio.Event,
+) -> None:
+    """Run `monitor` on its own schedule until `stop_event` fires.
+    Observations are appended to `samples` as `(elapsed_s, values)` tuples.
+    A failing `setup()` is reported on stderr and ends the loop before any
+    tick. Tick errors are reported on stderr by `_safe_tick` and do not stop
+    the loop. `aclose()` always runs once `setup()` has succeeded.
+    """
+    import sys
+    try:
+        await monitor.setup()
+    except Exception as exc:
+        sys.stderr.write(f"[monitor:{monitor.name}] setup failed: {exc}\n")
+        return
+    try:
+        if monitor.tick_at_start:
+            await _safe_tick(monitor, samples, start_mono)
+        while not stop_event.is_set():
+            # Sleep for one interval, waking early if a stop is requested.
+            try:
+                await asyncio.wait_for(stop_event.wait(), timeout=monitor.interval_s)
+            except asyncio.TimeoutError:
+                stop_requested = False
+            else:
+                stop_requested = True
+            # Tick either way; on a stop request this is the final sample.
+            await _safe_tick(monitor, samples, start_mono)
+            if stop_requested:
+                break
+    finally:
+        try:
+            await monitor.aclose()
+        except Exception as exc:
+            sys.stderr.write(f"[monitor:{monitor.name}] aclose failed: {exc}\n")
+async def _safe_tick(monitor: Monitor, samples: list, start_mono: float) -> None:
+    """Run one tick of `monitor`, recording its observation if there is one.
+    An exception from `tick()` is written to stderr and otherwise ignored.
+    A falsy result (`None` or an empty dict) records nothing. Anything else is
+    copied into `samples` together with the seconds elapsed since `start_mono`.
+    """
+    import sys
+    try:
+        observed = await monitor.tick()
+    except Exception as exc:
+        sys.stderr.write(f"[monitor:{monitor.name}] tick error: {exc}\n")
+    else:
+        if observed:
+            elapsed_s = time.monotonic() - start_mono
+            samples.append((elapsed_s, dict(observed)))

benchmaker/runner.py ADDED Viewed

@@ -0,0 +1,275 @@
+"""BenchRunner: ties scheduler -> workload (dataset) -> workload-type ->
+aiohttp session -> metrics."""
+from __future__ import annotations
+import asyncio
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import Any, Optional
+import aiohttp
+logger = logging.getLogger(__name__)
+from benchmaker.load import LoadModel
+from benchmaker.metrics import MetricsAggregator
+from benchmaker.types import (
+    PostResponseHook,
+    PreRequestHook,
+    Request,
+    Response,
+    Sample,
+    TicketContext,
+    maybe_await,
+)
+from benchmaker.monitors import Monitor, run_monitor_loop
+from benchmaker.trace import TraceRecorder
+from benchmaker.workloads.base import WorkloadType
+from benchmaker.workloads.datasets import Workload, StaticWorkload
+@dataclass
+# Settings read by `BenchRunner` for a single benchmark run. Only
+# `workload_type` and `load` are required; every other field has a default.
+class BenchConfig:
+    workload_type: WorkloadType                # how to talk to the service
+    load: LoadModel                            # when to fire
+    workload: Workload = field(default_factory=StaticWorkload)  # what to send
+    # Applied to each request, in order, before it is sent.
+    pre_hooks: list[PreRequestHook] = field(default_factory=list)
+    # Passed through to the workload type via the ticket context.
+    post_hooks: list[PostResponseHook] = field(default_factory=list)
+    monitors: list[Monitor] = field(default_factory=list)  # optional periodic samplers
+    # Optional trace recorder. When set, each fired request is appended to a
+    # JSONL file (with relative timestamp) so a later run can replay the bench
+    # deterministically via `benchmaker.trace.TracePacedLoad` + `ReplayWorkloadType`.
+    recorder: Optional[TraceRecorder] = None
+    # `limit` passed to the aiohttp TCP connector.
+    connection_limit: int = 1000
+    # Total per-request timeout (seconds) for the shared client session; a
+    # request's own `timeout_s`, when set, overrides it.
+    timeout_s: float = 60.0
+    # Upper bound on concurrently running tickets (semaphore size).
+    max_in_flight: int = 10000
+    # Seconds between progress log lines; a value <= 0 disables them.
+    progress_every_s: float = 1.0
+    # When the workload runs out of items: True stops the run, False skips
+    # the ticket and keeps consuming the load model.
+    stop_on_exhausted: bool = True
+@dataclass
+# Outcome of one benchmark run, as returned by `BenchRunner.run()`.
+class BenchResult:
+    # Every sample collected by the run's metrics aggregator.
+    samples: list[Sample]
+    # Aggregated report produced by the metrics aggregator's `summary()`.
+    summary: dict
+class BenchRunner:
+    """Execute one benchmark run described by a `BenchConfig`.
+    The load model decides when to fire, the workload supplies the item for
+    each ticket, and the workload type turns a ticket into a `Sample`, which
+    is collected in `self.metrics`.
+    """
+    def __init__(self, config: BenchConfig):
+        self.cfg = config
+        self.metrics = MetricsAggregator()
+    async def run(self) -> BenchResult:
+        """Run the benchmark to completion and return samples plus summary.
+        The workload, the workload type and the recorder (if any) are closed
+        even when the run fails or an earlier cleanup step raises.
+        """
+        connector = aiohttp.TCPConnector(
+            limit=self.cfg.connection_limit,
+            ttl_dns_cache=300,
+            force_close=False,
+        )
+        timeout = aiohttp.ClientTimeout(total=self.cfg.timeout_s)
+        if self.cfg.recorder is not None:
+            self.cfg.recorder.open(start_mono=self.metrics.start_time)
+        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+            try:
+                await self._drive(session)
+            finally:
+                # Nested so that one failing cleanup step cannot skip the rest.
+                try:
+                    await self.cfg.workload.aclose()
+                finally:
+                    try:
+                        await self.cfg.workload_type.aclose()
+                    finally:
+                        if self.cfg.recorder is not None:
+                            self.cfg.recorder.close()
+        self.metrics.finalize()
+        return BenchResult(samples=self.metrics.samples, summary=self.metrics.summary())
+    async def _drive(self, session: aiohttp.ClientSession) -> None:
+        """Consume the load model's tickets, firing one task per ticket.
+        On exit (normal or not) the progress loop is cancelled, in-flight
+        tickets are awaited, and monitors are told to stop and awaited.
+        """
+        sem = asyncio.Semaphore(self.cfg.max_in_flight)
+        tasks: set[asyncio.Task] = set()
+        progress_task = asyncio.create_task(self._progress_loop())
+        monitor_stop = asyncio.Event()
+        monitor_tasks: list[asyncio.Task] = []
+        bench_start = self.metrics.start_time
+        try:
+            # Spawn monitor loops inside the `try` so a failure here still
+            # cancels the progress task and stops already-started monitors.
+            for mon in self.cfg.monitors:
+                buf = self.metrics.monitor_buffer(mon.name)
+                monitor_tasks.append(asyncio.create_task(
+                    run_monitor_loop(mon, buf, bench_start, monitor_stop)
+                ))
+            async for _ in self.cfg.load.tickets():
+                try:
+                    item = await self.cfg.workload.next_item()
+                except StopAsyncIteration:
+                    if self.cfg.stop_on_exhausted:
+                        break
+                    else:
+                        continue
+                # Blocks while `max_in_flight` tickets are already running.
+                await sem.acquire()
+                task = asyncio.create_task(self._fire(session, item, sem))
+                tasks.add(task)
+                task.add_done_callback(tasks.discard)
+        finally:
+            progress_task.cancel()
+            try:
+                await progress_task
+            except (asyncio.CancelledError, Exception):
+                pass
+            try:
+                if tasks:
+                    await asyncio.gather(*tasks, return_exceptions=True)
+            finally:
+                # Signal monitors to do one last tick and exit, then wait for
+                # them, even if draining the tickets was interrupted.
+                monitor_stop.set()
+                if monitor_tasks:
+                    await asyncio.gather(*monitor_tasks, return_exceptions=True)
+    async def _fire(self, session: aiohttp.ClientSession, item: Any,
+                    sem: asyncio.Semaphore) -> None:
+        """Run one ticket through the workload type and record its sample.
+        Any exception becomes a failure sample instead of propagating. The
+        semaphore slot is always released and the load model always notified.
+        """
+        start_mono = time.monotonic()
+        try:
+            async def fire(req: Request) -> Response:
+                # Pre-hooks may replace the request before it is sent.
+                for hook in self.cfg.pre_hooks:
+                    req = await maybe_await(hook(req))
+                fire_start = time.monotonic()
+                if self.cfg.recorder is not None:
+                    await self.cfg.recorder.record(req, fire_start)
+                return await self._execute(session, req, fire_start)
+            ctx = TicketContext(
+                item=item,
+                start_mono=start_mono,
+                fire=fire,
+                pre_hooks=tuple(self.cfg.pre_hooks),
+                post_hooks=tuple(self.cfg.post_hooks),
+                workload_name=self.cfg.workload_type.name,
+            )
+            sample = await self.cfg.workload_type.run_ticket(ctx)
+            self.metrics.add(sample)
+        except Exception as e:
+            self.metrics.add(_failure_sample(
+                f"{type(e).__name__}: {e}",
+                self.cfg.workload_type.name,
+            ))
+        finally:
+            sem.release()
+            self.cfg.load.on_complete()
+    async def _execute(self, session: aiohttp.ClientSession, req: Request,
+                       start_mono: float) -> Response:
+        """Send `req` and return a `Response`; never raises on timeout or client error.
+        For streaming workload types each received chunk and its arrival time
+        (seconds since `start_mono`) is kept alongside the joined body.
+        Timeouts and `aiohttp.ClientError` yield a `Response` with `status=0`,
+        `ok=False` and `error` set.
+        """
+        kwargs: dict = {"headers": req.headers, "params": req.params}
+        # A JSON payload takes precedence over a raw body.
+        if req.json is not None:
+            kwargs["json"] = req.json
+        elif req.body is not None:
+            kwargs["data"] = req.body
+        if req.timeout_s is not None:
+            kwargs["timeout"] = aiohttp.ClientTimeout(total=req.timeout_s)
+        try:
+            async with session.request(req.method, req.url, **kwargs) as resp:
+                if self.cfg.workload_type.streaming:
+                    chunks: list[bytes] = []
+                    chunk_times: list[float] = []
+                    async for chunk in resp.content.iter_any():
+                        chunks.append(chunk)
+                        chunk_times.append(time.monotonic() - start_mono)
+                    body = b"".join(chunks)
+                    elapsed = time.monotonic() - start_mono
+                    return Response(
+                        status=resp.status,
+                        headers=dict(resp.headers),
+                        body=body,
+                        elapsed_s=elapsed,
+                        ok=200 <= resp.status < 400,
+                        stream_chunks=chunks,
+                        stream_chunk_times=chunk_times,
+                    )
+                else:
+                    body = await resp.read()
+                    elapsed = time.monotonic() - start_mono
+                    return Response(
+                        status=resp.status,
+                        headers=dict(resp.headers),
+                        body=body,
+                        elapsed_s=elapsed,
+                        ok=200 <= resp.status < 400,
+                    )
+        except asyncio.TimeoutError:
+            return Response(
+                status=0, headers={}, body=b"",
+                elapsed_s=time.monotonic() - start_mono,
+                ok=False, error="timeout",
+            )
+        except aiohttp.ClientError as e:
+            return Response(
+                status=0, headers={}, body=b"",
+                elapsed_s=time.monotonic() - start_mono,
+                ok=False, error=f"{type(e).__name__}: {e}",
+            )
+    async def _progress_loop(self) -> None:
+        """Log throughput and new error kinds every `progress_every_s` seconds.
+        Returns immediately when the interval is <= 0; otherwise runs until
+        cancelled.
+        """
+        if self.cfg.progress_every_s <= 0:
+            return
+        last_n = 0
+        last_t = time.monotonic()
+        seen_errors: set[str] = set()
+        try:
+            while True:
+                await asyncio.sleep(self.cfg.progress_every_s)
+                now = time.monotonic()
+                n = len(self.metrics.samples)
+                dn = n - last_n
+                dt = now - last_t
+                inst = dn / dt if dt > 0 else 0.0
+                # Samples added since the previous report.
+                window = self.metrics.samples[last_n:]
+                ok = sum(1 for s in window if s.ok)
+                # Wrong: request was delivered (HTTP success) but a post-hook /
+                # workload graded the output as a failure (e.g. eval gate).
+                wrong = sum(1 for s in window if not s.ok and s.request_ok)
+                fail = dn - ok - wrong
+                logger.info(
+                    "+%5d req  (%7.1f rps, %d ok, %d wrong, %d fail) | total=%d",
+                    dn, inst, ok, wrong, fail, n,
+                )
+                # Surface the first occurrence of each distinct error string —
+                # one short line per kind, so failed runs are diagnosable
+                # without grepping samples.jsonl.
+                for s in window:
+                    if s.error and s.error not in seen_errors:
+                        seen_errors.add(s.error)
+                        msg = s.error if len(s.error) <= 200 else s.error[:200] + "..."
+                        bucket = "fail" if not s.request_ok else "wrong"
+                        logger.warning("  first %s: %s", bucket, msg)
+                last_n = n
+                last_t = now
+        except asyncio.CancelledError:
+            return
+    def write_bundle(self, out_dir: str, **kwargs) -> str:
+        """Write a per-run directory bundle. See `benchmaker.bundle.write_bundle`."""
+        from benchmaker.bundle import write_bundle
+        return write_bundle(
+            out_dir,
+            self.metrics,
+            workload_type_name=self.cfg.workload_type.name,
+            workload_name=self.cfg.workload.name,
+            **kwargs,
+        )
+def _failure_sample(error: str, workload: str) -> Sample:
+    """Build the `Sample` recorded for a ticket that raised.
+    The sample is stamped with the current monotonic time, has zero latency
+    and status 0, and is marked as failed both overall and at request level.
+    """
+    stamped_at = time.monotonic()
+    return Sample(
+        start_ts=stamped_at,
+        latency_s=0.0,
+        status=0,
+        ok=False,
+        request_ok=False,
+        error=error,
+        workload=workload,
+    )
+async def run_bench(config: BenchConfig) -> BenchResult:
+    """Convenience wrapper: build a `BenchRunner` for `config` and run it."""
+    runner = BenchRunner(config)
+    return await runner.run()