PyPI - benchmaker - Versions diffs - 0.1.3__tar.gz → 0.1.4__tar.gz - Mend

benchmaker 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91) hide show

{benchmaker-0.1.3 → benchmaker-0.1.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: benchmaker
-Version: 0.1.3
+Version: 0.1.4
 Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
 Author: Xiaozhe Yao
 License: MIT
@@ -187,6 +187,7 @@ Full docs live in [`docs/`](docs/):
 - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
 - [CLI & YAML reference](docs/cli-and-yaml.md)
 - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
+- [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
 - [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
 - [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay

{benchmaker-0.1.3 → benchmaker-0.1.4}/README.md RENAMED Viewed

@@ -157,6 +157,7 @@ Full docs live in [`docs/`](docs/):
 - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
 - [CLI & YAML reference](docs/cli-and-yaml.md)
 - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
+- [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
 - [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
 - [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay

{benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/__init__.py RENAMED Viewed

@@ -19,6 +19,7 @@ from benchmaker.workloads.http import HttpWorkloadType
 from benchmaker.workloads.llm import OpenAIChatWorkloadType
 from benchmaker.workloads.sandbox import SandboxWorkloadType
 from benchmaker.workloads.hf import HFDatasetWorkload
+from benchmaker.workloads.rag import DeepRAGWorkload
 from benchmaker.workloads.sglang import SGLangGenerateWorkloadType
 from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
 from benchmaker.workloads.agent import (
@@ -59,7 +60,7 @@ from benchmaker.core.monitors import (
     PrometheusMonitor,
     parse_prometheus,
 )
-from benchmaker.core.runner import BenchRunner, BenchConfig, BenchResult
+from benchmaker.core.runner import BenchLane, BenchRunner, BenchConfig, BenchResult
 from benchmaker.core.trace import (
     ReplayWorkloadType,
     TracePacedLoad,
@@ -89,6 +90,7 @@ __all__ = [
     "OpenAIChatWorkloadType",
     "SandboxWorkloadType",
     "HFDatasetWorkload",
+    "DeepRAGWorkload",
     "SGLangGenerateWorkloadType",
     "TrajectoryReplayWorkload",
     # agent workload (pluggable user-defined agents)
@@ -136,6 +138,7 @@ __all__ = [
     # runner
     "BenchRunner",
     "BenchConfig",
+    "BenchLane",
     "BenchResult",
     # trace: record & replay
     "TraceRecorder",
@@ -153,4 +156,4 @@ __all__ = [
     "write_bundle",
 ]
-__version__ = "0.1.3"
+__version__ = "0.1.4"

{benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/config.py RENAMED Viewed

@@ -22,7 +22,7 @@ from typing import Any, Callable, Optional
 from benchmaker.env import interpolate, load_dotenv
 from benchmaker.core.load import parse_duration, parse_rate_spec
 from benchmaker.core.monitors import FunctionMonitor, Monitor, PrometheusMonitor
-from benchmaker.core.runner import BenchConfig
+from benchmaker.core.runner import BenchConfig, BenchLane
 from benchmaker.workloads.base import WorkloadType
 from benchmaker.workloads.datasets import (
     CallableWorkload,
@@ -31,6 +31,7 @@ from benchmaker.workloads.datasets import (
     Workload,
 )
 from benchmaker.workloads.hf import HFDatasetWorkload
+from benchmaker.workloads.rag import DeepRAGWorkload
 from benchmaker.workloads.http import HttpWorkloadType
 from benchmaker.workloads.llm import OpenAIChatWorkloadType
 from benchmaker.workloads.sandbox import SandboxWorkloadType
@@ -154,6 +155,8 @@ def build_workload(spec: Any) -> Workload:
         return CallableWorkload(fn=fn, **kwargs)
     if t in ("hf", "huggingface"):
         return HFDatasetWorkload(**kwargs)
+    if t in ("deeprag", "deep-rag", "rag"):
+        return DeepRAGWorkload(**kwargs)
     if t == "trajectory":
         from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
         return TrajectoryReplayWorkload(**kwargs)
@@ -365,8 +368,12 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
         cfg = interpolate(cfg)
     replay_spec = cfg.get("replay")
+    mix_spec = cfg.get("mix")
+    if replay_spec is not None and mix_spec is not None:
+        raise ValueError("'replay' and 'mix' are mutually exclusive")
     if replay_spec is not None:
         workload_type, workload, load_model = _build_replay(replay_spec)
+        lanes: list[BenchLane] = []
     else:
         wt_spec = cfg.get("workload_type")
         if not wt_spec:
@@ -382,16 +389,27 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
                 raise ValueError("config must define 'workload_type' or 'replay'")
         workload_type = build_workload_type(wt_spec)
-        workload = build_workload(cfg.get("workload"))
-        load_spec = cfg.get("load")
-        if load_spec is None:
-            raise ValueError("config must define 'load'")
         duration = cfg.get("duration") or cfg.get("duration_s")
         if duration is not None and isinstance(duration, str):
             duration = parse_duration(duration)
-        load_model = parse_rate_spec(load_spec, duration_s=duration,
-                                     max_requests=cfg.get("max_requests"))
+        if mix_spec is not None:
+            if cfg.get("load") is not None:
+                raise ValueError("a mixed config cannot also define top-level 'load'")
+            workload = StaticWorkload()
+            load_model = None
+            lanes = _build_lanes(
+                mix_spec,
+                duration_s=duration,
+                max_requests=cfg.get("max_requests"),
+            )
+        else:
+            workload = build_workload(cfg.get("workload"))
+            load_spec = cfg.get("load")
+            if load_spec is None:
+                raise ValueError("config must define 'load' or 'mix.lanes'")
+            load_model = parse_rate_spec(load_spec, duration_s=duration,
+                                         max_requests=cfg.get("max_requests"))
+            lanes = []
     pre_hooks = [resolve_callable(h) for h in (cfg.get("pre_hooks") or [])]
     post_hooks = [resolve_callable(h) for h in (cfg.get("post_hooks") or [])]
@@ -410,9 +428,11 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
     # A workload that schedules on per-request completion (e.g. interleaved
     # trajectory replay) declares the post-hook it needs; install it so a YAML
     # config can't silently stall waiting for a signal it never wired up.
-    wl_hook = workload.completion_hook()
-    if wl_hook is not None and wl_hook not in post_hooks:
-        post_hooks = list(post_hooks) + [wl_hook]
+    workloads = [lane.workload for lane in lanes] if lanes else [workload]
+    for lane_workload in workloads:
+        wl_hook = lane_workload.completion_hook()
+        if wl_hook is not None and wl_hook not in post_hooks:
+            post_hooks = list(post_hooks) + [wl_hook]
     recorder = _build_recorder(cfg.get("record"))
@@ -420,6 +440,7 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
         workload_type=workload_type,
         workload=workload,
         load=load_model,
+        lanes=lanes,
         pre_hooks=pre_hooks,
         post_hooks=post_hooks,
         monitors=monitors,
@@ -428,9 +449,48 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
         timeout_s=float(cfg.get("timeout_s", 60.0)),
         max_in_flight=int(cfg.get("max_in_flight", 10000)),
         progress_every_s=float(cfg.get("progress_every_s", 1.0)),
+        stop_on_exhausted=bool(cfg.get("stop_on_exhausted", True)),
     )
+def _build_lanes(spec: Any, *, duration_s: Optional[float],
+                 max_requests: Optional[int]) -> list[BenchLane]:
+    """Build independent workload/load pairs from a ``mix:`` YAML block."""
+    if not isinstance(spec, dict):
+        raise TypeError("'mix' must be a mapping with a 'lanes' list")
+    lane_specs = spec.get("lanes")
+    if not isinstance(lane_specs, list) or not lane_specs:
+        raise ValueError("'mix.lanes' must be a non-empty list")
+    lanes: list[BenchLane] = []
+    for index, lane_spec in enumerate(lane_specs):
+        if not isinstance(lane_spec, dict):
+            raise TypeError(f"mix.lanes[{index}] must be a mapping")
+        name = lane_spec.get("name")
+        if not isinstance(name, str) or not name.strip():
+            raise ValueError(f"mix.lanes[{index}].name must be a non-empty string")
+        if "workload" not in lane_spec:
+            raise ValueError(f"mix.lanes[{index}] must define a workload")
+        rate = lane_spec.get("rate", lane_spec.get("load"))
+        if rate is None:
+            raise ValueError(f"mix.lanes[{index}] must define rate (or load)")
+        lane_duration = lane_spec.get("duration", duration_s)
+        if isinstance(lane_duration, str):
+            lane_duration = parse_duration(lane_duration)
+        lane_max_requests = lane_spec.get("max_requests", max_requests)
+        lanes.append(BenchLane(
+            name=name,
+            workload=build_workload(lane_spec["workload"]),
+            load=parse_rate_spec(
+                rate,
+                duration_s=lane_duration,
+                max_requests=lane_max_requests,
+            ),
+        ))
+    return lanes
 def _build_recorder(spec: Any) -> Optional[TraceRecorder]:
     if spec is None:
         return None
@@ -458,4 +518,3 @@ def _build_replay(spec: Any) -> tuple[WorkloadType, Workload, Any]:
         TracePacedLoad(trace, speed=speed),
     )

{benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/metrics.py RENAMED Viewed

@@ -52,64 +52,22 @@ class MetricsAggregator:
     def summary(self) -> dict:
         end = self.end_time or time.monotonic()
         wall_s = max(end - self.start_time, 1e-9)
-        ok = [s for s in self.samples if s.ok]
-        fail = [s for s in self.samples if not s.ok]
-        # Split fail into transport failures vs. delivered-but-graded-wrong.
-        wrong = [s for s in fail if s.request_ok]
-        request_failed = [s for s in fail if not s.request_ok]
-        latencies = [s.latency_s for s in ok]
-        status_counts = Counter(s.status for s in self.samples)
-        error_counts = Counter(s.error for s in fail if s.error)
-        out: dict = {
-            "wall_time_s": wall_s,
-            "total_requests": len(self.samples),
-            "success": len(ok),
-            "failed": len(fail),
-            "request_failed": len(request_failed),
-            "wrong_output": len(wrong),
-            "error_rate": (len(fail) / len(self.samples)) if self.samples else 0.0,
-            "request_failure_rate": (
-                (len(request_failed) / len(self.samples)) if self.samples else 0.0
-            ),
-            "throughput_rps": len(self.samples) / wall_s,
-            "goodput_rps": len(ok) / wall_s,
-            "bytes_sent": sum(s.bytes_sent for s in self.samples),
-            "bytes_recv": sum(s.bytes_recv for s in self.samples),
-            "status_codes": dict(status_counts),
-            "errors": dict(error_counts),
-        }
-        if latencies:
-            out["latency_s"] = {
-                "mean": statistics.mean(latencies),
-                "min": min(latencies),
-                "max": max(latencies),
-                "p50": _pct(latencies, 50),
-                "p90": _pct(latencies, 90),
-                "p95": _pct(latencies, 95),
-                "p99": _pct(latencies, 99),
-                "p999": _pct(latencies, 99.9),
-            }
+        out = _summary_for_samples(self.samples, wall_s)
-        # Aggregate workload-specific `extra` metrics generically: mean + percentiles.
-        extras: dict[str, list[float]] = defaultdict(list)
-        for s in ok:
-            for k, v in s.extra.items():
-                if isinstance(v, (int, float)):
-                    extras[k].append(float(v))
-        if extras:
-            ext_summary = {}
-            for k, vals in extras.items():
-                ext_summary[k] = {
-                    "mean": statistics.mean(vals),
-                    "p50": _pct(vals, 50),
-                    "p90": _pct(vals, 90),
-                    "p99": _pct(vals, 99),
-                    "min": min(vals),
-                    "max": max(vals),
-                }
-            out["workload_metrics"] = ext_summary
+        # A mixed benchmark needs each lane's SLO signal independently.  Use
+        # the same wall-clock interval as the aggregate so lane throughput is
+        # directly comparable to the total, while latency and workload metrics
+        # remain scoped to that lane's samples.
+        lanes: dict[str, list[Sample]] = defaultdict(list)
+        for sample in self.samples:
+            lane = sample.meta.get("lane")
+            if isinstance(lane, str) and lane:
+                lanes[lane].append(sample)
+        if lanes:
+            out["lanes"] = {
+                name: _summary_for_samples(samples, wall_s)
+                for name, samples in sorted(lanes.items())
+            }
         # Monitor time-series: summarize each metric per monitor.
         if self.monitor_samples:
@@ -181,6 +139,22 @@ class MetricsAggregator:
                 lines.append(f"    {k}")
                 for kk in ("mean", "p50", "p90", "p99", "max"):
                     lines.append(f"      {kk:<6}: {v[kk]:.4f}")
+        if s.get("lanes"):
+            lines.append("")
+            lines.append("  lanes")
+            for name, lane in s["lanes"].items():
+                lines.append(
+                    f"    {name}: {lane['total_requests']} requests, "
+                    f"{lane['throughput_rps']:.2f} req/s, "
+                    f"{lane['success']} success"
+                )
+                for metric in ("ttft_s", "itl_ms_mean", "tokens_per_s"):
+                    values = lane.get("workload_metrics", {}).get(metric)
+                    if values:
+                        lines.append(
+                            f"      {metric}: p50={values['p50']:.4f}, "
+                            f"p99={values['p99']:.4f}"
+                        )
         if s.get("monitors"):
             for mon_name, mon in s["monitors"].items():
                 lines.append("")
@@ -223,6 +197,70 @@ class MetricsAggregator:
                     }) + "\n")
+def _summary_for_samples(samples: list[Sample], wall_s: float) -> dict:
+    """Summarize a sample subset over a shared benchmark wall-clock interval."""
+    ok = [s for s in samples if s.ok]
+    fail = [s for s in samples if not s.ok]
+    # Split fail into transport failures vs. delivered-but-graded-wrong.
+    wrong = [s for s in fail if s.request_ok]
+    request_failed = [s for s in fail if not s.request_ok]
+    latencies = [s.latency_s for s in ok]
+    status_counts = Counter(s.status for s in samples)
+    error_counts = Counter(s.error for s in fail if s.error)
+    out: dict = {
+        "wall_time_s": wall_s,
+        "total_requests": len(samples),
+        "success": len(ok),
+        "failed": len(fail),
+        "request_failed": len(request_failed),
+        "wrong_output": len(wrong),
+        "error_rate": (len(fail) / len(samples)) if samples else 0.0,
+        "request_failure_rate": (
+            (len(request_failed) / len(samples)) if samples else 0.0
+        ),
+        "throughput_rps": len(samples) / wall_s,
+        "goodput_rps": len(ok) / wall_s,
+        "bytes_sent": sum(s.bytes_sent for s in samples),
+        "bytes_recv": sum(s.bytes_recv for s in samples),
+        "status_codes": dict(status_counts),
+        "errors": dict(error_counts),
+    }
+    if latencies:
+        out["latency_s"] = {
+            "mean": statistics.mean(latencies),
+            "min": min(latencies),
+            "max": max(latencies),
+            "p50": _pct(latencies, 50),
+            "p90": _pct(latencies, 90),
+            "p95": _pct(latencies, 95),
+            "p99": _pct(latencies, 99),
+            "p999": _pct(latencies, 99.9),
+        }
+    # Aggregate workload-specific `extra` metrics generically: mean + percentiles.
+    extras: dict[str, list[float]] = defaultdict(list)
+    for s in ok:
+        for k, v in s.extra.items():
+            if isinstance(v, (int, float)):
+                extras[k].append(float(v))
+    if extras:
+        ext_summary = {}
+        for k, vals in extras.items():
+            ext_summary[k] = {
+                "mean": statistics.mean(vals),
+                "p50": _pct(vals, 50),
+                "p90": _pct(vals, 90),
+                "p99": _pct(vals, 99),
+                "min": min(vals),
+                "max": max(vals),
+            }
+        out["workload_metrics"] = ext_summary
+    return out
 def _safe_meta(meta: dict) -> dict:
     out = {}
     for k, v in meta.items():

{benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/runner.py RENAMED Viewed

@@ -30,11 +30,31 @@ from benchmaker.workloads.base import WorkloadType
 from benchmaker.workloads.datasets import Workload, StaticWorkload
+@dataclass
+class BenchLane:
+    """One independently scheduled input lane in a mixed benchmark.
+    All lanes share the enclosing :class:`BenchConfig`'s workload type and
+    endpoint, but each owns its own data source and load model.  This keeps
+    OpenAI/HTTP protocol configuration centralized while preserving independent
+    arrival processes for phase-swing experiments.
+    """
+    name: str
+    workload: Workload
+    load: LoadModel
+    def __post_init__(self) -> None:
+        if not self.name or not self.name.strip():
+            raise ValueError("lane name must be a non-empty string")
 @dataclass
 class BenchConfig:
     workload_type: WorkloadType                # how to talk to the service
-    load: LoadModel                            # when to fire
+    load: Optional[LoadModel] = None           # when to fire (single workload)
     workload: Workload = field(default_factory=StaticWorkload)  # what to send
+    lanes: list[BenchLane] = field(default_factory=list)
     pre_hooks: list[PreRequestHook] = field(default_factory=list)
     post_hooks: list[PostResponseHook] = field(default_factory=list)
     monitors: list[Monitor] = field(default_factory=list)  # optional periodic samplers
@@ -48,6 +68,16 @@ class BenchConfig:
     progress_every_s: float = 1.0
     stop_on_exhausted: bool = True
+    def __post_init__(self) -> None:
+        if self.lanes:
+            if self.load is not None:
+                raise ValueError("configure either load/workload or lanes, not both")
+            names = [lane.name for lane in self.lanes]
+            if len(names) != len(set(names)):
+                raise ValueError("mixed benchmark lane names must be unique")
+        elif self.load is None:
+            raise ValueError("BenchConfig requires a load model or at least one lane")
 @dataclass
 class BenchResult:
@@ -73,7 +103,7 @@ class BenchRunner:
             try:
                 await self._drive(session)
             finally:
-                await self.cfg.workload.aclose()
+                await self._aclose_workloads()
                 await self.cfg.workload_type.aclose()
                 if self.cfg.recorder is not None:
                     self.cfg.recorder.close()
@@ -96,19 +126,10 @@ class BenchRunner:
             ))
         try:
-            async for _ in self.cfg.load.tickets():
-                try:
-                    item = await self.cfg.workload.next_item()
-                except StopAsyncIteration:
-                    if self.cfg.stop_on_exhausted:
-                        break
-                    else:
-                        continue
-                await sem.acquire()
-                task = asyncio.create_task(self._fire(session, item, sem))
-                tasks.add(task)
-                task.add_done_callback(tasks.discard)
+            if self.cfg.lanes:
+                await self._drive_lanes(session, sem, tasks)
+            else:
+                await self._drive_single(session, sem, tasks)
         finally:
             progress_task.cancel()
             try:
@@ -124,11 +145,71 @@ class BenchRunner:
             if monitor_tasks:
                 await asyncio.gather(*monitor_tasks, return_exceptions=True)
+    async def _drive_single(self, session: aiohttp.ClientSession,
+                            sem: asyncio.Semaphore,
+                            tasks: set[asyncio.Task]) -> None:
+        assert self.cfg.load is not None
+        async for _ in self.cfg.load.tickets():
+            try:
+                item = await self.cfg.workload.next_item()
+            except StopAsyncIteration:
+                if self.cfg.stop_on_exhausted:
+                    break
+                continue
+            await sem.acquire()
+            task = asyncio.create_task(
+                self._fire(session, item, sem, self.cfg.load)
+            )
+            tasks.add(task)
+            task.add_done_callback(tasks.discard)
+    async def _drive_lanes(self, session: aiohttp.ClientSession,
+                           sem: asyncio.Semaphore,
+                           tasks: set[asyncio.Task]) -> None:
+        """Run each lane's admission iterator concurrently.
+        A finite workload ends only its own lane.  Other lanes keep producing
+        tickets, which is required when one dataset is short or intentionally
+        bursty and another drives a long complementary phase.
+        """
+        async def produce(lane: BenchLane) -> None:
+            async for _ in lane.load.tickets():
+                try:
+                    item = await lane.workload.next_item()
+                except StopAsyncIteration:
+                    if self.cfg.stop_on_exhausted:
+                        break
+                    continue
+                await sem.acquire()
+                task = asyncio.create_task(
+                    self._fire(session, item, sem, lane.load, lane.name)
+                )
+                tasks.add(task)
+                task.add_done_callback(tasks.discard)
+        producers = [asyncio.create_task(produce(lane)) for lane in self.cfg.lanes]
+        try:
+            await asyncio.gather(*producers)
+        finally:
+            for producer in producers:
+                if not producer.done():
+                    producer.cancel()
+            await asyncio.gather(*producers, return_exceptions=True)
     async def _fire(self, session: aiohttp.ClientSession, item: Any,
-                    sem: asyncio.Semaphore) -> None:
+                    sem: asyncio.Semaphore, load: LoadModel,
+                    lane_name: Optional[str] = None) -> None:
         start_mono = time.monotonic()
         try:
             async def fire(req: Request) -> Response:
+                if lane_name is not None:
+                    # The config-defined lane is authoritative: callers may
+                    # still attach arbitrary metadata, but cannot accidentally
+                    # collapse a mixed run into an incorrect lane.
+                    req.meta["lane"] = lane_name
                 for hook in self.cfg.pre_hooks:
                     req = await maybe_await(hook(req))
                 fire_start = time.monotonic()
@@ -145,15 +226,30 @@ class BenchRunner:
                 workload_name=self.cfg.workload_type.name,
             )
             sample = await self.cfg.workload_type.run_ticket(ctx)
+            if lane_name is not None:
+                sample.meta["lane"] = lane_name
             self.metrics.add(sample)
         except Exception as e:
             self.metrics.add(_failure_sample(
                 f"{type(e).__name__}: {e}",
                 self.cfg.workload_type.name,
+                lane_name,
             ))
         finally:
             sem.release()
-            self.cfg.load.on_complete()
+            load.on_complete()
+    async def _aclose_workloads(self) -> None:
+        workloads = (
+            [lane.workload for lane in self.cfg.lanes]
+            if self.cfg.lanes
+            else [self.cfg.workload]
+        )
+        closed: set[int] = set()
+        for workload in workloads:
+            if id(workload) not in closed:
+                closed.add(id(workload))
+                await workload.aclose()
     async def _execute(self, session: aiohttp.ClientSession, req: Request,
                        start_mono: float) -> Response:
@@ -254,12 +350,16 @@ class BenchRunner:
             out_dir,
             self.metrics,
             workload_type_name=self.cfg.workload_type.name,
-            workload_name=self.cfg.workload.name,
+            workload_name=(
+                "mix:" + ",".join(lane.name for lane in self.cfg.lanes)
+                if self.cfg.lanes else self.cfg.workload.name
+            ),
             **kwargs,
         )
-def _failure_sample(error: str, workload: str) -> Sample:
+def _failure_sample(error: str, workload: str,
+                    lane_name: Optional[str] = None) -> Sample:
     return Sample(
         start_ts=time.monotonic(),
         latency_s=0.0,
@@ -268,6 +368,7 @@ def _failure_sample(error: str, workload: str) -> Sample:
         request_ok=False,
         error=error,
         workload=workload,
+        meta={"lane": lane_name} if lane_name is not None else {},
     )

{benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/swebench_replay.py RENAMED Viewed

@@ -157,6 +157,26 @@ class SWEBenchReplayRecipe(Recipe):
                               "trajectory store recorded with tool_results."),
             click.option("--utilization-interval-sec", "utilization_interval_sec",
                          type=float, default=5.0, show_default=True),
+            click.option("--qos-enabled/--no-qos-enabled", "qos_enabled",
+                         default=False, show_default=True,
+                         help="Demote verifier-phase containers to best_effort "
+                              "cpu.weight (and apply the QoS verifier-timeout "
+                              "multiplier)."),
+            click.option("--on-demand-cpu-weight", "on_demand_cpu_weight",
+                         type=int, default=10000, show_default=True,
+                         help="cpu.weight for on-demand (agent-phase) containers "
+                              "when --qos-enabled. Ignored when --no-qos-enabled."),
+            click.option("--best-effort-cpu-weight", "best_effort_cpu_weight",
+                         type=int, default=10, show_default=True,
+                         help="cpu.weight for best-effort (verifier-phase) "
+                              "containers when --qos-enabled. Ignored when "
+                              "--no-qos-enabled."),
+            click.option("--qos-verifier-timeout-multiplier",
+                         "qos_verifier_timeout_multiplier",
+                         type=float, default=2.0, show_default=True,
+                         help="Verifier timeout multiplier applied only when "
+                              "--qos-enabled (QoS demotes verifier CPU, so the "
+                              "verifier needs more wall-clock time)."),
         ]
     def run(self, shared: SharedOpts, *, job, trajectories, concurrency,
@@ -164,7 +184,9 @@ class SWEBenchReplayRecipe(Recipe):
             dataset, exec_timeout_sec, n_tasks, task, exclude_task, n_attempts,
             timeout_multiplier, backend_type, request_timeout_sec,
             agent_ready_timeout_sec, jobs_dir, timeline,
-            utilization_interval_sec, validate_observations) -> Optional[int]:
+            utilization_interval_sec, validate_observations,
+            qos_enabled, on_demand_cpu_weight, best_effort_cpu_weight,
+            qos_verifier_timeout_multiplier) -> Optional[int]:
         from benchmaker.swebench import harbor_eval as he
         from benchmaker.swebench import trajectory as T
@@ -250,6 +272,10 @@ class SWEBenchReplayRecipe(Recipe):
             request_timeout_sec=request_timeout_sec,
             agent_ready_timeout_sec=agent_ready_timeout_sec,
             jobs_dir=jobs_dir,
+            qos_enabled=qos_enabled,
+            on_demand_cpu_weight=on_demand_cpu_weight,
+            best_effort_cpu_weight=best_effort_cpu_weight,
+            qos_verifier_timeout_multiplier=qos_verifier_timeout_multiplier,
         )
         results: list[tuple] = []

{benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/harbor_eval.py RENAMED Viewed

@@ -219,12 +219,28 @@ def _build_job_config(args: argparse.Namespace) -> JobConfig:
     if jobs_dir:
         job_kwargs["jobs_dir"] = Path(jobs_dir)
+    # QoS: when enabled, splat the cpu.weight knobs into the environment kwargs
+    # (consumed by FlashSandboxEnvironment.__init__) and couple the verifier
+    # timeout — QoS demotes verifier-phase CPU, so the verifier needs more
+    # wall-clock time. Left untouched (None) when QoS is off.
+    verifier_timeout_multiplier = None
+    # QoS is wired only via the swebench-replay recipe CLI; harbor_eval's own
+    # _parse_args does not expose these flags, so this guard no-ops there.
+    if getattr(args, "qos_enabled", False):
+        environment.kwargs.update(
+            qos_enabled=True,
+            on_demand_cpu_weight=args.on_demand_cpu_weight,
+            best_effort_cpu_weight=args.best_effort_cpu_weight,
+        )
+        verifier_timeout_multiplier = args.qos_verifier_timeout_multiplier
     return JobConfig(
         job_name=args.job_name or "",
         n_attempts=args.n_attempts,
         n_concurrent_trials=args.concurrency,
         quiet=False,
         timeout_multiplier=args.timeout_multiplier,
+        verifier_timeout_multiplier=verifier_timeout_multiplier,
         environment=environment,
         agents=[agent],
         datasets=[dataset],

benchmaker 0.1.3__tar.gz → 0.1.4__tar.gz

benchmaker 0.1.3__tar.gz → 0.1.4__tar.gz