PyPI - benchmaker - Versions diffs - 0.1.0__py3-none-any.whl - Mend

benchmaker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

benchmaker/__init__.py +152 -0
benchmaker/bundle.py +193 -0
benchmaker/cli.py +382 -0
benchmaker/collect.py +178 -0
benchmaker/config.py +448 -0
benchmaker/env.py +87 -0
benchmaker/load.py +326 -0
benchmaker/metrics.py +234 -0
benchmaker/monitors.py +228 -0
benchmaker/runner.py +275 -0
benchmaker/trace.py +217 -0
benchmaker/types.py +98 -0
benchmaker/workloads/__init__.py +53 -0
benchmaker/workloads/agent.py +308 -0
benchmaker/workloads/base.py +79 -0
benchmaker/workloads/datasets.py +156 -0
benchmaker/workloads/eval.py +504 -0
benchmaker/workloads/hf.py +382 -0
benchmaker/workloads/http.py +77 -0
benchmaker/workloads/llm.py +258 -0
benchmaker/workloads/sandbox.py +470 -0
benchmaker-0.1.0.dist-info/METADATA +214 -0
benchmaker-0.1.0.dist-info/RECORD +26 -0
benchmaker-0.1.0.dist-info/WHEEL +5 -0
benchmaker-0.1.0.dist-info/entry_points.txt +2 -0
benchmaker-0.1.0.dist-info/top_level.txt +1 -0

benchmaker/load.py ADDED Viewed

@@ -0,0 +1,326 @@
+"""Load models.
+A LoadModel yields admission "tickets" — each ticket means "fire one request
+now". Open-loop models yield based on a target arrival schedule; closed-loop
+models yield based on completions (the runner returns a ticket to the model
+when a request finishes).
+"""
+from __future__ import annotations
+import asyncio
+import random
+import re
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import AsyncIterator, Optional, Union
+class LoadModel(ABC):
+    """Base class for objects that decide *when* requests are admitted.
+    Subclasses must implement `tickets()`. The other two members ship with
+    defaults that suit models whose timing does not depend on completions.
+    """
+    @property
+    def is_open_loop(self) -> bool:
+        # Default answer; subclasses override this when it does not apply.
+        return True
+    def on_complete(self) -> None:
+        """Completion hook called by the runner; does nothing unless overridden."""
+        return None
+    @abstractmethod
+    async def tickets(self) -> AsyncIterator[None]:
+        """Yield None once for every request that should be admitted."""
+        ...
+# -------- Open-loop --------
+class ConstantRPS(LoadModel):
+    """Fire requests at a constant target rate (rps), regardless of in-flight count.
+    Tickets follow an absolute schedule (start, start + 1/rps, ...), so a slow
+    consumer is caught up with back-to-back tickets rather than lowering the rate.
+    Args:
+        rps: Target admissions per second; must be > 0.
+        duration_s: Stop once this many seconds have elapsed (None = no time limit).
+        max_requests: Stop after this many tickets (None = no count limit).
+    Raises:
+        ValueError: If `rps` is not positive.
+    """
+    def __init__(self, rps: float, duration_s: Optional[float] = None,
+                 max_requests: Optional[int] = None):
+        if rps <= 0:
+            raise ValueError("rps must be > 0")
+        self.rps = rps
+        self.duration_s = duration_s
+        self.max_requests = max_requests
+    async def tickets(self):
+        """Yield None at each scheduled slot until a limit is reached.
+        No ticket is issued at or after the `duration_s` deadline. When the next
+        slot lies beyond the deadline the generator idles until the deadline and
+        then stops, so the model still occupies its full configured duration.
+        """
+        interval = 1.0 / self.rps
+        start = time.monotonic()
+        deadline = None if self.duration_s is None else start + self.duration_s
+        next_fire = start
+        n = 0
+        while True:
+            now = time.monotonic()
+            if deadline is not None and now >= deadline:
+                return
+            if self.max_requests is not None and n >= self.max_requests:
+                return
+            if deadline is not None and next_fire >= deadline:
+                # Sleeping until next_fire would admit a request after the
+                # deadline; wait out the remaining time and stop instead.
+                await asyncio.sleep(deadline - now)
+                return
+            sleep_for = next_fire - now
+            if sleep_for > 0:
+                await asyncio.sleep(sleep_for)
+            yield None
+            n += 1
+            next_fire += interval
+class PoissonRPS(LoadModel):
+    """Open-loop Poisson arrivals with mean rate `rps`.
+    Inter-arrival gaps are exponentially distributed with mean 1/rps and are
+    accumulated into an absolute schedule, so time spent by the consumer
+    between tickets does not stretch the gaps and lower the achieved rate.
+    Args:
+        rps: Mean admissions per second; must be > 0.
+        duration_s: Stop once this many seconds have elapsed (None = no time limit).
+        max_requests: Stop after this many tickets (None = no count limit).
+        seed: Seed for the private random generator (None = unseeded).
+    Raises:
+        ValueError: If `rps` is not positive.
+    """
+    def __init__(self, rps: float, duration_s: Optional[float] = None,
+                 max_requests: Optional[int] = None, seed: Optional[int] = None):
+        if rps <= 0:
+            raise ValueError("rps must be > 0")
+        self.rps = rps
+        self.duration_s = duration_s
+        self.max_requests = max_requests
+        self._rng = random.Random(seed)
+    async def tickets(self):
+        """Yield None at each sampled arrival time until a limit is reached.
+        The first arrival is one random gap after the start. No ticket is issued
+        at or after the `duration_s` deadline; if the next arrival falls beyond
+        it, the generator idles until the deadline and stops.
+        """
+        start = time.monotonic()
+        deadline = None if self.duration_s is None else start + self.duration_s
+        # Exponential inter-arrival time with mean 1/rps.
+        next_fire = start + self._rng.expovariate(self.rps)
+        n = 0
+        while True:
+            now = time.monotonic()
+            if deadline is not None and now >= deadline:
+                return
+            if self.max_requests is not None and n >= self.max_requests:
+                return
+            if deadline is not None and next_fire >= deadline:
+                # Next arrival is past the deadline: wait it out, admit nothing.
+                await asyncio.sleep(deadline - now)
+                return
+            if next_fire > now:
+                await asyncio.sleep(next_fire - now)
+            yield None
+            n += 1
+            # Advance from the scheduled time, not from "now", to avoid drift.
+            next_fire += self._rng.expovariate(self.rps)
+# -------- Closed-loop --------
+class ClosedLoop(LoadModel):
+    """N concurrent workers; each fires the next request as soon as the previous
+    completes. Total in-flight is bounded by `concurrency`.
+    This is implemented as: yield up to `concurrency` tickets initially, then
+    one more for every completion the runner reports through `on_complete()`.
+    Args:
+        concurrency: Number of tickets handed out up front; must be > 0.
+        duration_s: Stop once this many seconds have elapsed (None = no time limit).
+        max_requests: Stop after this many tickets (None = no count limit).
+    Raises:
+        ValueError: If `concurrency` is not positive.
+    """
+    def __init__(self, concurrency: int, duration_s: Optional[float] = None,
+                 max_requests: Optional[int] = None):
+        if concurrency <= 0:
+            raise ValueError("concurrency must be > 0")
+        self.concurrency = concurrency
+        self.duration_s = duration_s
+        self.max_requests = max_requests
+        # NOTE(review): not used anywhere in this class; kept only in case other
+        # modules read it. Confirm and remove if nothing does.
+        self._sem = asyncio.Semaphore(concurrency)
+        # Created in tickets(); None until then, which makes on_complete() a no-op.
+        self._completions: Optional[asyncio.Queue] = None
+    @property
+    def is_open_loop(self) -> bool:
+        return False
+    def on_complete(self) -> None:
+        """Record one finished request, releasing one more ticket."""
+        if self._completions is not None:
+            self._completions.put_nowait(None)
+    async def tickets(self):
+        """Yield None whenever a slot is free, until a limit is reached.
+        While waiting for a completion the wait is bounded by the `duration_s`
+        deadline, so the generator neither blocks past the deadline nor issues a
+        ticket after it.
+        """
+        self._completions = asyncio.Queue()
+        # Seed: fire N concurrent immediately.
+        for _ in range(self.concurrency):
+            self._completions.put_nowait(None)
+        start = time.monotonic()
+        deadline = None if self.duration_s is None else start + self.duration_s
+        n = 0
+        while True:
+            now = time.monotonic()
+            if deadline is not None and now >= deadline:
+                return
+            if self.max_requests is not None and n >= self.max_requests:
+                return
+            try:
+                # Fast path: a slot is already free, no need to suspend.
+                self._completions.get_nowait()
+            except asyncio.QueueEmpty:
+                if deadline is None:
+                    await self._completions.get()
+                else:
+                    try:
+                        await asyncio.wait_for(self._completions.get(),
+                                               timeout=deadline - now)
+                    except asyncio.TimeoutError:
+                        # Deadline reached while every slot was still busy.
+                        return
+            yield None
+            n += 1
+# -------- Composite: Sweep / Ramp --------
+@dataclass
+class _Stage:
+    """A load model paired with a human-readable label.
+    NOTE(review): not referenced by any code visible in this listing; confirm
+    whether it is still needed.
+    """
+    model: LoadModel  # the load model for this stage
+    label: str  # display name for this stage
+class Sweep(LoadModel):
+    """Run a sequence of sub-load-models in order (each for its own duration).
+    Useful for: sweep across RPS values to find saturation, e.g.
+        Sweep([ConstantRPS(10, 30), ConstantRPS(50, 30), ConstantRPS(100, 30)])
+    Args:
+        stages: Load models to run back to back; must not be empty.
+        labels: One label per stage. When omitted or empty, labels default to
+            "stage_0", "stage_1", ...
+    Raises:
+        ValueError: If `stages` is empty, or `labels` is given with a different
+            length than `stages`.
+    """
+    def __init__(self, stages: list[LoadModel], labels: Optional[list[str]] = None):
+        if not stages:
+            raise ValueError("Sweep requires at least one stage")
+        if labels and len(labels) != len(stages):
+            # tickets() pairs labels with stages via zip(), which would silently
+            # drop the unmatched stages; fail loudly instead.
+            raise ValueError(
+                f"Sweep got {len(labels)} labels for {len(stages)} stages"
+            )
+        self.stages = stages
+        self.labels = labels or [f"stage_{i}" for i in range(len(stages))]
+        # Label of the stage currently producing tickets; None when idle.
+        self.current_label: Optional[str] = None
+    @property
+    def is_open_loop(self) -> bool:
+        # Open-loop only when every stage is; a sweep that contains any
+        # closed-loop stage reports False.
+        return all(s.is_open_loop for s in self.stages)
+    def on_complete(self) -> None:
+        """Forward the completion to every stage, not only the active one."""
+        for s in self.stages:
+            s.on_complete()
+    async def tickets(self):
+        """Yield the tickets of each stage in turn, updating `current_label`."""
+        for label, stage in zip(self.labels, self.stages):
+            self.current_label = label
+            async for t in stage.tickets():
+                yield t
+        self.current_label = None
+class Ramp(LoadModel):
+    """Linearly ramp RPS from `start_rps` to `end_rps` over `duration_s`.
+    Tickets follow an absolute schedule: after each ticket the next slot is
+    placed one gap later, where the gap is 1/rate (or an exponential sample with
+    that mean when `poisson` is set) and the rate is the ramp's value at the
+    scheduled time of the ticket just issued.
+    Args:
+        start_rps: Rate at the beginning of the ramp; must be > 0.
+        end_rps: Rate at the end of the ramp; must be > 0.
+        duration_s: Length of the ramp in seconds; must be > 0.
+        poisson: Use exponentially distributed gaps instead of fixed ones.
+        seed: Seed for the private random generator (None = unseeded).
+    Raises:
+        ValueError: If any of the three numeric arguments is not positive.
+    """
+    def __init__(self, start_rps: float, end_rps: float, duration_s: float,
+                 poisson: bool = False, seed: Optional[int] = None):
+        if start_rps <= 0 or end_rps <= 0 or duration_s <= 0:
+            raise ValueError("start_rps, end_rps, duration_s must all be > 0")
+        self.start_rps = start_rps
+        self.end_rps = end_rps
+        self.duration_s = duration_s
+        self.poisson = poisson
+        self._rng = random.Random(seed)
+    def _rps_at(self, t: float) -> float:
+        """Return the target rate `t` seconds into the ramp, clamped to its ends."""
+        frac = min(max(t / self.duration_s, 0.0), 1.0)
+        return self.start_rps + (self.end_rps - self.start_rps) * frac
+    async def tickets(self):
+        """Yield None at each scheduled slot until `duration_s` has elapsed.
+        Fixed-gap mode issues its first ticket immediately; Poisson mode waits
+        one random gap first. No ticket is issued at or after the deadline; if
+        the next slot falls beyond it, the generator idles until the deadline.
+        """
+        start = time.monotonic()
+        deadline = start + self.duration_s
+        next_fire = start
+        if self.poisson:
+            next_fire += self._rng.expovariate(self._rps_at(0.0))
+        while True:
+            now = time.monotonic()
+            if now >= deadline:
+                return
+            if next_fire >= deadline:
+                # Next slot is past the end of the ramp: wait it out, admit nothing.
+                await asyncio.sleep(deadline - now)
+                return
+            if next_fire > now:
+                await asyncio.sleep(next_fire - now)
+            yield None
+            # Advance from the scheduled time rather than from "now", so consumer
+            # delays do not accumulate into the schedule.
+            rps = self._rps_at(next_fire - start)
+            if self.poisson:
+                next_fire += self._rng.expovariate(rps)
+            else:
+                next_fire += 1.0 / rps
+# -------- User-friendly spec parser --------
+# Number, optional whitespace, optional unit. Units match case-insensitively.
+_DURATION_RE = re.compile(r"^([0-9]*\.?[0-9]+)\s*(ms|s|m|h)?$", re.IGNORECASE)
+# Seconds per unit, keyed by the lower-cased unit suffix.
+_DURATION_UNIT_SECONDS = {"ms": 0.001, "s": 1.0, "m": 60.0, "h": 3600.0}
+def parse_duration(s: Union[str, int, float]) -> float:
+    """Parse '30s', '500ms', '2m', '1h', or a bare number (seconds).
+    Units are case-insensitive and may be separated from the number by
+    whitespace ('30 S' is accepted). A missing unit means seconds.
+    Args:
+        s: Duration as a string, or a number already expressed in seconds.
+    Returns:
+        The duration in seconds.
+    Raises:
+        ValueError: If `s` is neither a number nor a parseable duration string.
+    """
+    if isinstance(s, (int, float)):
+        return float(s)
+    if not isinstance(s, str):
+        # e.g. None from a missing config key: report it like any other bad value.
+        raise ValueError(f"Cannot parse duration {s!r}")
+    m = _DURATION_RE.match(s.strip())
+    if not m:
+        raise ValueError(f"Cannot parse duration {s!r}")
+    val = float(m.group(1))
+    unit = (m.group(2) or "s").lower()
+    return val * _DURATION_UNIT_SECONDS[unit]
+def parse_rate_spec(
+    spec: Union[str, int, float, dict],
+    duration_s: Optional[float] = None,
+    max_requests: Optional[int] = None,
+) -> LoadModel:
+    """Friendly load-model factory.
+    Accepted forms:
+        100                              -> ConstantRPS(100)
+        "100"                            -> ConstantRPS(100)
+        "100rps"                         -> ConstantRPS(100)
+        "poisson:100"                    -> PoissonRPS(100)
+        "closed:32"  or "concurrency:32" -> ClosedLoop(32)
+        "ramp:10..500:30s"               -> Ramp(10, 500, 30)
+        "ramp-poisson:10..500:30s"       -> Ramp(..., poisson=True)
+        "sweep:10,50,100,500@30s"        -> Sweep of ConstantRPS, each 30s
+        {"type": "constant", "rps": 100, "duration": "60s"}   (dict form)
+    Args:
+        spec: The load description in one of the forms above. Strings are
+            stripped and lower-cased before parsing.
+        duration_s: Time limit for constant/poisson/closed models. For a sweep
+            without '@duration' it is split evenly across the stages. Ignored
+            for the dict form and for ramps, which carry their own duration.
+        max_requests: Ticket limit for constant/poisson/closed models. Not
+            applied to ramps, sweeps, or the dict form.
+    Raises:
+        ValueError: If the spec is malformed or names an unknown model kind.
+    """
+    if isinstance(spec, dict):
+        return _parse_rate_dict(spec)
+    if isinstance(spec, (int, float)):
+        return ConstantRPS(float(spec), duration_s=duration_s, max_requests=max_requests)
+    s = spec.strip().lower()
+    kind, colon, rest = s.partition(":")
+    if colon:
+        if kind == "poisson":
+            return PoissonRPS(float(rest), duration_s=duration_s,
+                              max_requests=max_requests)
+        if kind in ("closed", "concurrency"):
+            return ClosedLoop(int(rest), duration_s=duration_s,
+                              max_requests=max_requests)
+        if kind in ("ramp", "ramp-poisson"):
+            # rest like "10..500:30s"
+            rng, dur_sep, dur = rest.rpartition(":")
+            a, dots, b = rng.partition("..")
+            if not dur_sep or not dots:
+                raise ValueError(
+                    f"ramp spec must look like 'ramp:START..END:DURATION', got {spec!r}"
+                )
+            return Ramp(float(a), float(b), parse_duration(dur),
+                        poisson=(kind == "ramp-poisson"))
+        if kind == "sweep":
+            # e.g. "10,50,100,500@30s"
+            vals, at, dur = rest.partition("@")
+            rate_strs = vals.split(",")
+            if at:
+                per_stage = parse_duration(dur)
+            elif duration_s is None:
+                raise ValueError("sweep: needs '@duration' or a top-level duration")
+            else:
+                # No per-stage duration given: share the total evenly.
+                per_stage = duration_s / len(rate_strs)
+            rates = [float(v) for v in rate_strs]
+            stages = [ConstantRPS(r, duration_s=per_stage) for r in rates]
+            labels = [f"{r:g}rps" for r in rates]
+            return Sweep(stages, labels)
+        # A prefix we do not know: say so instead of failing inside float().
+        raise ValueError(f"Unknown load spec {spec!r}")
+    # Plain number with optional 'rps' suffix.
+    if s.endswith("rps"):
+        s = s[:-3]
+    return ConstantRPS(float(s), duration_s=duration_s, max_requests=max_requests)
+def _parse_rate_dict(d: dict) -> LoadModel:
+    """Build a LoadModel from the dict form of a load spec.
+    Keys read:
+        type: "constant" (default), "poisson", "closed" / "closed-loop" /
+            "concurrency", "ramp", or "sweep".
+        duration: Seconds as a number, or a string such as "60s". Required for
+            "ramp"; not applied to "sweep" (each stage carries its own).
+        max_requests: Ticket limit for constant/poisson/closed models.
+        rps, concurrency, start_rps, end_rps, poisson, seed, stages: read as
+            required by the selected type. Sweep stages may be any spec that
+            `parse_rate_spec` accepts; dict stages may carry a "label".
+    Raises:
+        ValueError: For an unknown type, an unparseable duration, or a ramp
+            without a duration.
+        KeyError: If a key required by the selected type is missing.
+    """
+    t = d.get("type", "constant").lower()
+    duration = d.get("duration")
+    if isinstance(duration, str):
+        duration = parse_duration(duration)
+    max_requests = d.get("max_requests")
+    if t == "constant":
+        return ConstantRPS(float(d["rps"]), duration_s=duration, max_requests=max_requests)
+    if t == "poisson":
+        return PoissonRPS(float(d["rps"]), duration_s=duration, max_requests=max_requests,
+                          seed=d.get("seed"))
+    if t in ("closed", "closed-loop", "concurrency"):
+        return ClosedLoop(int(d["concurrency"]), duration_s=duration, max_requests=max_requests)
+    if t == "ramp":
+        if duration is None:
+            # A ramp has no meaning without a length; fail with a clear message.
+            raise ValueError("ramp requires a 'duration'")
+        # `duration` is already in seconds at this point (parsed above if a string).
+        return Ramp(float(d["start_rps"]), float(d["end_rps"]), float(duration),
+                    poisson=d.get("poisson", False), seed=d.get("seed"))
+    if t == "sweep":
+        stages_spec = d["stages"]
+        stages = [parse_rate_spec(s) for s in stages_spec]
+        # Dict stages may name themselves; everything else gets "stage_<index>".
+        labels = [s.get("label") if isinstance(s, dict) else None for s in stages_spec]
+        labels = [lab or f"stage_{i}" for i, lab in enumerate(labels)]
+        return Sweep(stages, labels)
+    raise ValueError(f"Unknown load model type: {t}")

benchmaker/metrics.py ADDED Viewed

@@ -0,0 +1,234 @@
+"""Aggregation + reporting of Samples."""
+from __future__ import annotations
+import json
+import statistics
+import time
+from collections import Counter, defaultdict
+from dataclasses import dataclass, field
+from typing import Optional, TextIO
+from benchmaker.types import Sample
+def _pct(xs: list[float], p: float) -> float:
+    """Return the p-th percentile of `xs` using linear interpolation.
+    An empty input yields 0.0. The input list is not modified.
+    """
+    if not xs:
+        return 0.0
+    ordered = sorted(xs)
+    last = len(ordered) - 1
+    # Fractional rank of the requested percentile within the sorted values.
+    rank = last * (p / 100.0)
+    lo = int(rank)
+    hi = min(lo + 1, last)
+    if lo == hi:
+        return ordered[lo]
+    lower = ordered[lo]
+    upper = ordered[hi]
+    return lower + (upper - lower) * (rank - lo)
+@dataclass
+class MetricsAggregator:
+    """Accumulates samples and produces a summary.
+    Besides per-request samples it holds named monitor time-series, each a list
+    of (elapsed_s, {metric_name: value}) ticks.
+    """
+    samples: list[Sample] = field(default_factory=list)
+    start_time: float = field(default_factory=time.monotonic)
+    end_time: Optional[float] = None
+    # Wall-clock (time.time) markers, only used when writing the run bundle.
+    start_wall: float = field(default_factory=time.time)
+    end_wall: Optional[float] = None
+    # name -> list of (elapsed_s, {metric_name: value})
+    monitor_samples: dict[str, list[tuple[float, dict[str, float]]]] = field(default_factory=dict)
+    def add(self, sample: Sample) -> None:
+        """Record one finished request."""
+        self.samples.append(sample)
+    def monitor_buffer(self, name: str) -> list[tuple[float, dict[str, float]]]:
+        """Return the tick list for monitor `name`, creating it on first use."""
+        return self.monitor_samples.setdefault(name, [])
+    def finalize(self) -> None:
+        """Stamp the end of the run on both the monotonic and the wall clock."""
+        self.end_time = time.monotonic()
+        self.end_wall = time.time()
+    def summary(self) -> dict:
+        """Return aggregate statistics for everything recorded so far.
+        Uses `end_time` when the run was finalized, otherwise the current
+        monotonic time. Latency and workload metrics cover successful samples
+        only; "latency_s", "workload_metrics" and "monitors" are present only
+        when there is data for them.
+        """
+        # `is not None` rather than truthiness: 0.0 is a legitimate timestamp.
+        end = self.end_time if self.end_time is not None else time.monotonic()
+        # Floor the elapsed time so the rate divisions below cannot hit zero.
+        wall_s = max(end - self.start_time, 1e-9)
+        total = len(self.samples)
+        ok = [s for s in self.samples if s.ok]
+        fail = [s for s in self.samples if not s.ok]
+        # Split fail into transport failures vs. delivered-but-graded-wrong.
+        wrong = [s for s in fail if s.request_ok]
+        request_failed = [s for s in fail if not s.request_ok]
+        latencies = [s.latency_s for s in ok]
+        status_counts = Counter(s.status for s in self.samples)
+        error_counts = Counter(s.error for s in fail if s.error)
+        out: dict = {
+            "wall_time_s": wall_s,
+            "total_requests": total,
+            "success": len(ok),
+            "failed": len(fail),
+            "request_failed": len(request_failed),
+            "wrong_output": len(wrong),
+            "error_rate": (len(fail) / total) if total else 0.0,
+            "request_failure_rate": (
+                (len(request_failed) / total) if total else 0.0
+            ),
+            "throughput_rps": total / wall_s,
+            "goodput_rps": len(ok) / wall_s,
+            "bytes_sent": sum(s.bytes_sent for s in self.samples),
+            "bytes_recv": sum(s.bytes_recv for s in self.samples),
+            "status_codes": dict(status_counts),
+            "errors": dict(error_counts),
+        }
+        if latencies:
+            out["latency_s"] = {
+                "mean": statistics.mean(latencies),
+                "min": min(latencies),
+                "max": max(latencies),
+                "p50": _pct(latencies, 50),
+                "p90": _pct(latencies, 90),
+                "p95": _pct(latencies, 95),
+                "p99": _pct(latencies, 99),
+                "p999": _pct(latencies, 99.9),
+            }
+        # Aggregate workload-specific `extra` metrics generically: mean + percentiles.
+        # Non-numeric extras are skipped.
+        extras: dict[str, list[float]] = defaultdict(list)
+        for s in ok:
+            for k, v in s.extra.items():
+                if isinstance(v, (int, float)):
+                    extras[k].append(float(v))
+        if extras:
+            ext_summary = {}
+            for k, vals in extras.items():
+                ext_summary[k] = {
+                    "mean": statistics.mean(vals),
+                    "p50": _pct(vals, 50),
+                    "p90": _pct(vals, 90),
+                    "p99": _pct(vals, 99),
+                    "min": min(vals),
+                    "max": max(vals),
+                }
+            out["workload_metrics"] = ext_summary
+        # Monitor time-series: summarize each metric per monitor.
+        if self.monitor_samples:
+            monitors_summary: dict[str, dict] = {}
+            for mon_name, ticks in self.monitor_samples.items():
+                if not ticks:
+                    continue
+                by_metric: dict[str, list[float]] = defaultdict(list)
+                for _t, values in ticks:
+                    for k, v in values.items():
+                        if isinstance(v, (int, float)):
+                            by_metric[k].append(float(v))
+                per_metric = {}
+                for k, vals in by_metric.items():
+                    per_metric[k] = {
+                        "mean": statistics.mean(vals),
+                        "min": min(vals),
+                        "max": max(vals),
+                        "p50": _pct(vals, 50),
+                        "p90": _pct(vals, 90),
+                        "p99": _pct(vals, 99),
+                        "first": vals[0],
+                        "last": vals[-1],
+                    }
+                monitors_summary[mon_name] = {
+                    "tick_count": len(ticks),
+                    "metrics": per_metric,
+                }
+            if monitors_summary:
+                out["monitors"] = monitors_summary
+        return out
+    def render(self, out: TextIO) -> None:
+        """Write a human-readable report of `summary()` to `out` and flush it."""
+        s = self.summary()
+        lines: list[str] = []
+        lines.append("=" * 60)
+        lines.append(f"[benchmaker] results  ({s['total_requests']} requests, "
+                     f"{s['wall_time_s']:.2f}s wall)")
+        lines.append("=" * 60)
+        lines.append(f"  throughput     : {s['throughput_rps']:>10.2f} req/s")
+        lines.append(f"  goodput        : {s['goodput_rps']:>10.2f} req/s")
+        lines.append(f"  success        : {s['success']}")
+        lines.append(f"  failed         : {s['failed']}  ({s['error_rate']*100:.2f}%)")
+        lines.append(
+            f"    request failed : {s['request_failed']}  "
+            f"({s['request_failure_rate']*100:.2f}%)"
+        )
+        lines.append(f"    wrong output   : {s['wrong_output']}")
+        if s.get("latency_s"):
+            lat = s["latency_s"]
+            lines.append("")
+            lines.append("  latency (s)")
+            for k in ("mean", "p50", "p90", "p95", "p99", "p999", "max"):
+                lines.append(f"    {k:<6}: {lat[k]:.4f}")
+        if s["status_codes"]:
+            lines.append("")
+            lines.append("  status codes")
+            for code, n in sorted(s["status_codes"].items()):
+                lines.append(f"    {code:<4} : {n}")
+        if s["errors"]:
+            lines.append("")
+            lines.append("  errors")
+            # Most frequent first, capped at ten entries.
+            for err, n in sorted(s["errors"].items(), key=lambda kv: -kv[1])[:10]:
+                lines.append(f"    {n:<4} x {err}")
+        if s.get("workload_metrics"):
+            lines.append("")
+            lines.append("  workload metrics")
+            for k, v in s["workload_metrics"].items():
+                lines.append(f"    {k}")
+                for kk in ("mean", "p50", "p90", "p99", "max"):
+                    lines.append(f"      {kk:<6}: {v[kk]:.4f}")
+        if s.get("monitors"):
+            for mon_name, mon in s["monitors"].items():
+                lines.append("")
+                lines.append(f"  monitor: {mon_name}  ({mon['tick_count']} ticks)")
+                for k, v in mon["metrics"].items():
+                    lines.append(f"    {k}")
+                    for kk in ("mean", "min", "max", "p50", "p99", "last"):
+                        lines.append(f"      {kk:<6}: {v[kk]:.4f}")
+        lines.append("=" * 60)
+        out.write("\n".join(lines) + "\n")
+        out.flush()
+    def dump_samples_jsonl(self, path: str) -> None:
+        """Write per-request records for offline analysis.
+        One JSON object per line. Values in `meta` and `extra` that cannot be
+        JSON-encoded are replaced by their repr() so a single odd value does not
+        abort the dump part-way through the file.
+        """
+        with open(path, "w", encoding="utf-8") as f:
+            for s in self.samples:
+                f.write(json.dumps({
+                    "start_ts": s.start_ts,
+                    "latency_s": s.latency_s,
+                    "status": s.status,
+                    "ok": s.ok,
+                    "request_ok": s.request_ok,
+                    "bytes_sent": s.bytes_sent,
+                    "bytes_recv": s.bytes_recv,
+                    "error": s.error,
+                    "workload": s.workload,
+                    "meta": _safe_meta(s.meta),
+                    "extra": _safe_meta(s.extra),
+                }) + "\n")
+    def dump_monitor_jsonl(self, path: str) -> None:
+        """Write monitor time-series ticks as JSONL for plotting/analysis."""
+        with open(path, "w", encoding="utf-8") as f:
+            for mon_name, ticks in self.monitor_samples.items():
+                for t, values in ticks:
+                    f.write(json.dumps({
+                        "monitor": mon_name,
+                        "elapsed_s": t,
+                        "values": values,
+                    }) + "\n")
+def _safe_meta(meta: dict) -> dict:
+    """Return a copy of `meta` in which every value can be JSON-encoded.
+    Values that `json.dumps` rejects are replaced by their repr(); all other
+    values are kept as they are.
+    """
+    def _encodable(value) -> bool:
+        # Probe by actually encoding; the encoded text itself is discarded.
+        try:
+            json.dumps(value)
+        except (TypeError, ValueError):
+            return False
+        return True
+    return {
+        key: (value if _encodable(value) else repr(value))
+        for key, value in meta.items()
+    }