benchmaker 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. {benchmaker-0.1.3 → benchmaker-0.1.4}/PKG-INFO +2 -1
  2. {benchmaker-0.1.3 → benchmaker-0.1.4}/README.md +1 -0
  3. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/__init__.py +5 -2
  4. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/config.py +71 -12
  5. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/metrics.py +95 -57
  6. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/runner.py +120 -19
  7. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/swebench_replay.py +27 -1
  8. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/harbor_eval.py +16 -0
  9. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/__init__.py +2 -0
  10. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/llm.py +7 -0
  11. benchmaker-0.1.4/benchmaker/workloads/rag.py +188 -0
  12. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/PKG-INFO +2 -1
  13. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/SOURCES.txt +5 -0
  14. {benchmaker-0.1.3 → benchmaker-0.1.4}/pyproject.toml +1 -1
  15. benchmaker-0.1.4/tests/test_collect_sweep_data.py +24 -0
  16. benchmaker-0.1.4/tests/test_mix.py +86 -0
  17. benchmaker-0.1.4/tests/test_qos_job_config.py +56 -0
  18. benchmaker-0.1.4/tests/test_rag.py +120 -0
  19. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/cli.py +0 -0
  20. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/__init__.py +0 -0
  21. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/load.py +0 -0
  22. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/monitors.py +0 -0
  23. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/trace.py +0 -0
  24. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/types.py +0 -0
  25. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/env.py +0 -0
  26. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/io/__init__.py +0 -0
  27. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/io/bundle.py +0 -0
  28. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/io/collect.py +0 -0
  29. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/__init__.py +0 -0
  30. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/_cli_shared.py +0 -0
  31. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/_factory.py +0 -0
  32. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/base.py +0 -0
  33. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/http.py +0 -0
  34. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/llm.py +0 -0
  35. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/sandbox.py +0 -0
  36. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/sglang.py +0 -0
  37. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/swebench.py +0 -0
  38. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/trajectory_replay.py +0 -0
  39. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/__init__.py +0 -0
  40. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/_flash_hardening.py +0 -0
  41. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/agent.py +0 -0
  42. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/grading.py +0 -0
  43. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/harbor_agent.py +0 -0
  44. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/observability.py +0 -0
  45. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_agent.py +0 -0
  46. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/max_turns.js +0 -0
  47. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/register_provider.js +0 -0
  48. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/remote_exec.js +0 -0
  49. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/remote_exec_all.js +0 -0
  50. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/replay_server.py +0 -0
  51. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/timeout_load.py +0 -0
  52. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/trajectory.py +0 -0
  53. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/agent.py +0 -0
  54. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/base.py +0 -0
  55. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/datasets.py +0 -0
  56. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/eval.py +0 -0
  57. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/hf.py +0 -0
  58. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/http.py +0 -0
  59. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/sandbox.py +0 -0
  60. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/sglang.py +0 -0
  61. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/trajectory.py +0 -0
  62. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/dependency_links.txt +0 -0
  63. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/entry_points.txt +0 -0
  64. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/requires.txt +0 -0
  65. {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/top_level.txt +0 -0
  66. {benchmaker-0.1.3 → benchmaker-0.1.4}/setup.cfg +0 -0
  67. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_agent.py +0 -0
  68. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_agent_warmup.py +0 -0
  69. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_backfill_trajectory_status.py +0 -0
  70. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_bundle.py +0 -0
  71. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_coding_agent.py +0 -0
  72. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_collect_trajectories.py +0 -0
  73. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_dedupe_trajectories.py +0 -0
  74. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_eval.py +0 -0
  75. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_flash_hardening.py +0 -0
  76. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_hf.py +0 -0
  77. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_observability.py +0 -0
  78. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_passthrough_meta.py +0 -0
  79. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_pi_agent.py +0 -0
  80. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_pi_agent_timeout_injection.py +0 -0
  81. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_recipes_cli.py +0 -0
  82. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_replay_server.py +0 -0
  83. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_sandbox_duration.py +0 -0
  84. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_sglang.py +0 -0
  85. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_smoke.py +0 -0
  86. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_swebench_replay_recipe.py +0 -0
  87. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_timeout_load.py +0 -0
  88. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_trace.py +0 -0
  89. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_trajectory.py +0 -0
  90. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_trajectory_interleave.py +0 -0
  91. {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_trajectory_replay.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmaker
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
5
5
  Author: Xiaozhe Yao
6
6
  License: MIT
@@ -187,6 +187,7 @@ Full docs live in [`docs/`](docs/):
187
187
  - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
188
188
  - [CLI & YAML reference](docs/cli-and-yaml.md)
189
189
  - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
190
+ - [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
190
191
  - [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
191
192
  - [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
192
193
 
@@ -157,6 +157,7 @@ Full docs live in [`docs/`](docs/):
157
157
  - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
158
158
  - [CLI & YAML reference](docs/cli-and-yaml.md)
159
159
  - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
160
+ - [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
160
161
  - [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
161
162
  - [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
162
163
 
@@ -19,6 +19,7 @@ from benchmaker.workloads.http import HttpWorkloadType
19
19
  from benchmaker.workloads.llm import OpenAIChatWorkloadType
20
20
  from benchmaker.workloads.sandbox import SandboxWorkloadType
21
21
  from benchmaker.workloads.hf import HFDatasetWorkload
22
+ from benchmaker.workloads.rag import DeepRAGWorkload
22
23
  from benchmaker.workloads.sglang import SGLangGenerateWorkloadType
23
24
  from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
24
25
  from benchmaker.workloads.agent import (
@@ -59,7 +60,7 @@ from benchmaker.core.monitors import (
59
60
  PrometheusMonitor,
60
61
  parse_prometheus,
61
62
  )
62
- from benchmaker.core.runner import BenchRunner, BenchConfig, BenchResult
63
+ from benchmaker.core.runner import BenchLane, BenchRunner, BenchConfig, BenchResult
63
64
  from benchmaker.core.trace import (
64
65
  ReplayWorkloadType,
65
66
  TracePacedLoad,
@@ -89,6 +90,7 @@ __all__ = [
89
90
  "OpenAIChatWorkloadType",
90
91
  "SandboxWorkloadType",
91
92
  "HFDatasetWorkload",
93
+ "DeepRAGWorkload",
92
94
  "SGLangGenerateWorkloadType",
93
95
  "TrajectoryReplayWorkload",
94
96
  # agent workload (pluggable user-defined agents)
@@ -136,6 +138,7 @@ __all__ = [
136
138
  # runner
137
139
  "BenchRunner",
138
140
  "BenchConfig",
141
+ "BenchLane",
139
142
  "BenchResult",
140
143
  # trace: record & replay
141
144
  "TraceRecorder",
@@ -153,4 +156,4 @@ __all__ = [
153
156
  "write_bundle",
154
157
  ]
155
158
 
156
- __version__ = "0.1.3"
159
+ __version__ = "0.1.4"
@@ -22,7 +22,7 @@ from typing import Any, Callable, Optional
22
22
  from benchmaker.env import interpolate, load_dotenv
23
23
  from benchmaker.core.load import parse_duration, parse_rate_spec
24
24
  from benchmaker.core.monitors import FunctionMonitor, Monitor, PrometheusMonitor
25
- from benchmaker.core.runner import BenchConfig
25
+ from benchmaker.core.runner import BenchConfig, BenchLane
26
26
  from benchmaker.workloads.base import WorkloadType
27
27
  from benchmaker.workloads.datasets import (
28
28
  CallableWorkload,
@@ -31,6 +31,7 @@ from benchmaker.workloads.datasets import (
31
31
  Workload,
32
32
  )
33
33
  from benchmaker.workloads.hf import HFDatasetWorkload
34
+ from benchmaker.workloads.rag import DeepRAGWorkload
34
35
  from benchmaker.workloads.http import HttpWorkloadType
35
36
  from benchmaker.workloads.llm import OpenAIChatWorkloadType
36
37
  from benchmaker.workloads.sandbox import SandboxWorkloadType
@@ -154,6 +155,8 @@ def build_workload(spec: Any) -> Workload:
154
155
  return CallableWorkload(fn=fn, **kwargs)
155
156
  if t in ("hf", "huggingface"):
156
157
  return HFDatasetWorkload(**kwargs)
158
+ if t in ("deeprag", "deep-rag", "rag"):
159
+ return DeepRAGWorkload(**kwargs)
157
160
  if t == "trajectory":
158
161
  from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
159
162
  return TrajectoryReplayWorkload(**kwargs)
@@ -365,8 +368,12 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
365
368
  cfg = interpolate(cfg)
366
369
 
367
370
  replay_spec = cfg.get("replay")
371
+ mix_spec = cfg.get("mix")
372
+ if replay_spec is not None and mix_spec is not None:
373
+ raise ValueError("'replay' and 'mix' are mutually exclusive")
368
374
  if replay_spec is not None:
369
375
  workload_type, workload, load_model = _build_replay(replay_spec)
376
+ lanes: list[BenchLane] = []
370
377
  else:
371
378
  wt_spec = cfg.get("workload_type")
372
379
  if not wt_spec:
@@ -382,16 +389,27 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
382
389
  raise ValueError("config must define 'workload_type' or 'replay'")
383
390
 
384
391
  workload_type = build_workload_type(wt_spec)
385
- workload = build_workload(cfg.get("workload"))
386
-
387
- load_spec = cfg.get("load")
388
- if load_spec is None:
389
- raise ValueError("config must define 'load'")
390
392
  duration = cfg.get("duration") or cfg.get("duration_s")
391
393
  if duration is not None and isinstance(duration, str):
392
394
  duration = parse_duration(duration)
393
- load_model = parse_rate_spec(load_spec, duration_s=duration,
394
- max_requests=cfg.get("max_requests"))
395
+ if mix_spec is not None:
396
+ if cfg.get("load") is not None:
397
+ raise ValueError("a mixed config cannot also define top-level 'load'")
398
+ workload = StaticWorkload()
399
+ load_model = None
400
+ lanes = _build_lanes(
401
+ mix_spec,
402
+ duration_s=duration,
403
+ max_requests=cfg.get("max_requests"),
404
+ )
405
+ else:
406
+ workload = build_workload(cfg.get("workload"))
407
+ load_spec = cfg.get("load")
408
+ if load_spec is None:
409
+ raise ValueError("config must define 'load' or 'mix.lanes'")
410
+ load_model = parse_rate_spec(load_spec, duration_s=duration,
411
+ max_requests=cfg.get("max_requests"))
412
+ lanes = []
395
413
 
396
414
  pre_hooks = [resolve_callable(h) for h in (cfg.get("pre_hooks") or [])]
397
415
  post_hooks = [resolve_callable(h) for h in (cfg.get("post_hooks") or [])]
@@ -410,9 +428,11 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
410
428
  # A workload that schedules on per-request completion (e.g. interleaved
411
429
  # trajectory replay) declares the post-hook it needs; install it so a YAML
412
430
  # config can't silently stall waiting for a signal it never wired up.
413
- wl_hook = workload.completion_hook()
414
- if wl_hook is not None and wl_hook not in post_hooks:
415
- post_hooks = list(post_hooks) + [wl_hook]
431
+ workloads = [lane.workload for lane in lanes] if lanes else [workload]
432
+ for lane_workload in workloads:
433
+ wl_hook = lane_workload.completion_hook()
434
+ if wl_hook is not None and wl_hook not in post_hooks:
435
+ post_hooks = list(post_hooks) + [wl_hook]
416
436
 
417
437
  recorder = _build_recorder(cfg.get("record"))
418
438
 
@@ -420,6 +440,7 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
420
440
  workload_type=workload_type,
421
441
  workload=workload,
422
442
  load=load_model,
443
+ lanes=lanes,
423
444
  pre_hooks=pre_hooks,
424
445
  post_hooks=post_hooks,
425
446
  monitors=monitors,
@@ -428,9 +449,48 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
428
449
  timeout_s=float(cfg.get("timeout_s", 60.0)),
429
450
  max_in_flight=int(cfg.get("max_in_flight", 10000)),
430
451
  progress_every_s=float(cfg.get("progress_every_s", 1.0)),
452
+ stop_on_exhausted=bool(cfg.get("stop_on_exhausted", True)),
431
453
  )
432
454
 
433
455
 
456
+ def _build_lanes(spec: Any, *, duration_s: Optional[float],
457
+ max_requests: Optional[int]) -> list[BenchLane]:
458
+ """Build independent workload/load pairs from a ``mix:`` YAML block."""
459
+ if not isinstance(spec, dict):
460
+ raise TypeError("'mix' must be a mapping with a 'lanes' list")
461
+ lane_specs = spec.get("lanes")
462
+ if not isinstance(lane_specs, list) or not lane_specs:
463
+ raise ValueError("'mix.lanes' must be a non-empty list")
464
+
465
+ lanes: list[BenchLane] = []
466
+ for index, lane_spec in enumerate(lane_specs):
467
+ if not isinstance(lane_spec, dict):
468
+ raise TypeError(f"mix.lanes[{index}] must be a mapping")
469
+ name = lane_spec.get("name")
470
+ if not isinstance(name, str) or not name.strip():
471
+ raise ValueError(f"mix.lanes[{index}].name must be a non-empty string")
472
+ if "workload" not in lane_spec:
473
+ raise ValueError(f"mix.lanes[{index}] must define a workload")
474
+ rate = lane_spec.get("rate", lane_spec.get("load"))
475
+ if rate is None:
476
+ raise ValueError(f"mix.lanes[{index}] must define rate (or load)")
477
+
478
+ lane_duration = lane_spec.get("duration", duration_s)
479
+ if isinstance(lane_duration, str):
480
+ lane_duration = parse_duration(lane_duration)
481
+ lane_max_requests = lane_spec.get("max_requests", max_requests)
482
+ lanes.append(BenchLane(
483
+ name=name,
484
+ workload=build_workload(lane_spec["workload"]),
485
+ load=parse_rate_spec(
486
+ rate,
487
+ duration_s=lane_duration,
488
+ max_requests=lane_max_requests,
489
+ ),
490
+ ))
491
+ return lanes
492
+
493
+
434
494
  def _build_recorder(spec: Any) -> Optional[TraceRecorder]:
435
495
  if spec is None:
436
496
  return None
@@ -458,4 +518,3 @@ def _build_replay(spec: Any) -> tuple[WorkloadType, Workload, Any]:
458
518
  TracePacedLoad(trace, speed=speed),
459
519
  )
460
520
 
461
-
@@ -52,64 +52,22 @@ class MetricsAggregator:
52
52
  def summary(self) -> dict:
53
53
  end = self.end_time or time.monotonic()
54
54
  wall_s = max(end - self.start_time, 1e-9)
55
- ok = [s for s in self.samples if s.ok]
56
- fail = [s for s in self.samples if not s.ok]
57
- # Split fail into transport failures vs. delivered-but-graded-wrong.
58
- wrong = [s for s in fail if s.request_ok]
59
- request_failed = [s for s in fail if not s.request_ok]
60
- latencies = [s.latency_s for s in ok]
61
-
62
- status_counts = Counter(s.status for s in self.samples)
63
- error_counts = Counter(s.error for s in fail if s.error)
64
-
65
- out: dict = {
66
- "wall_time_s": wall_s,
67
- "total_requests": len(self.samples),
68
- "success": len(ok),
69
- "failed": len(fail),
70
- "request_failed": len(request_failed),
71
- "wrong_output": len(wrong),
72
- "error_rate": (len(fail) / len(self.samples)) if self.samples else 0.0,
73
- "request_failure_rate": (
74
- (len(request_failed) / len(self.samples)) if self.samples else 0.0
75
- ),
76
- "throughput_rps": len(self.samples) / wall_s,
77
- "goodput_rps": len(ok) / wall_s,
78
- "bytes_sent": sum(s.bytes_sent for s in self.samples),
79
- "bytes_recv": sum(s.bytes_recv for s in self.samples),
80
- "status_codes": dict(status_counts),
81
- "errors": dict(error_counts),
82
- }
83
- if latencies:
84
- out["latency_s"] = {
85
- "mean": statistics.mean(latencies),
86
- "min": min(latencies),
87
- "max": max(latencies),
88
- "p50": _pct(latencies, 50),
89
- "p90": _pct(latencies, 90),
90
- "p95": _pct(latencies, 95),
91
- "p99": _pct(latencies, 99),
92
- "p999": _pct(latencies, 99.9),
93
- }
55
+ out = _summary_for_samples(self.samples, wall_s)
94
56
 
95
- # Aggregate workload-specific `extra` metrics generically: mean + percentiles.
96
- extras: dict[str, list[float]] = defaultdict(list)
97
- for s in ok:
98
- for k, v in s.extra.items():
99
- if isinstance(v, (int, float)):
100
- extras[k].append(float(v))
101
- if extras:
102
- ext_summary = {}
103
- for k, vals in extras.items():
104
- ext_summary[k] = {
105
- "mean": statistics.mean(vals),
106
- "p50": _pct(vals, 50),
107
- "p90": _pct(vals, 90),
108
- "p99": _pct(vals, 99),
109
- "min": min(vals),
110
- "max": max(vals),
111
- }
112
- out["workload_metrics"] = ext_summary
57
+ # A mixed benchmark needs each lane's SLO signal independently. Use
58
+ # the same wall-clock interval as the aggregate so lane throughput is
59
+ # directly comparable to the total, while latency and workload metrics
60
+ # remain scoped to that lane's samples.
61
+ lanes: dict[str, list[Sample]] = defaultdict(list)
62
+ for sample in self.samples:
63
+ lane = sample.meta.get("lane")
64
+ if isinstance(lane, str) and lane:
65
+ lanes[lane].append(sample)
66
+ if lanes:
67
+ out["lanes"] = {
68
+ name: _summary_for_samples(samples, wall_s)
69
+ for name, samples in sorted(lanes.items())
70
+ }
113
71
 
114
72
  # Monitor time-series: summarize each metric per monitor.
115
73
  if self.monitor_samples:
@@ -181,6 +139,22 @@ class MetricsAggregator:
181
139
  lines.append(f" {k}")
182
140
  for kk in ("mean", "p50", "p90", "p99", "max"):
183
141
  lines.append(f" {kk:<6}: {v[kk]:.4f}")
142
+ if s.get("lanes"):
143
+ lines.append("")
144
+ lines.append(" lanes")
145
+ for name, lane in s["lanes"].items():
146
+ lines.append(
147
+ f" {name}: {lane['total_requests']} requests, "
148
+ f"{lane['throughput_rps']:.2f} req/s, "
149
+ f"{lane['success']} success"
150
+ )
151
+ for metric in ("ttft_s", "itl_ms_mean", "tokens_per_s"):
152
+ values = lane.get("workload_metrics", {}).get(metric)
153
+ if values:
154
+ lines.append(
155
+ f" {metric}: p50={values['p50']:.4f}, "
156
+ f"p99={values['p99']:.4f}"
157
+ )
184
158
  if s.get("monitors"):
185
159
  for mon_name, mon in s["monitors"].items():
186
160
  lines.append("")
@@ -223,6 +197,70 @@ class MetricsAggregator:
223
197
  }) + "\n")
224
198
 
225
199
 
200
+ def _summary_for_samples(samples: list[Sample], wall_s: float) -> dict:
201
+ """Summarize a sample subset over a shared benchmark wall-clock interval."""
202
+ ok = [s for s in samples if s.ok]
203
+ fail = [s for s in samples if not s.ok]
204
+ # Split fail into transport failures vs. delivered-but-graded-wrong.
205
+ wrong = [s for s in fail if s.request_ok]
206
+ request_failed = [s for s in fail if not s.request_ok]
207
+ latencies = [s.latency_s for s in ok]
208
+
209
+ status_counts = Counter(s.status for s in samples)
210
+ error_counts = Counter(s.error for s in fail if s.error)
211
+
212
+ out: dict = {
213
+ "wall_time_s": wall_s,
214
+ "total_requests": len(samples),
215
+ "success": len(ok),
216
+ "failed": len(fail),
217
+ "request_failed": len(request_failed),
218
+ "wrong_output": len(wrong),
219
+ "error_rate": (len(fail) / len(samples)) if samples else 0.0,
220
+ "request_failure_rate": (
221
+ (len(request_failed) / len(samples)) if samples else 0.0
222
+ ),
223
+ "throughput_rps": len(samples) / wall_s,
224
+ "goodput_rps": len(ok) / wall_s,
225
+ "bytes_sent": sum(s.bytes_sent for s in samples),
226
+ "bytes_recv": sum(s.bytes_recv for s in samples),
227
+ "status_codes": dict(status_counts),
228
+ "errors": dict(error_counts),
229
+ }
230
+ if latencies:
231
+ out["latency_s"] = {
232
+ "mean": statistics.mean(latencies),
233
+ "min": min(latencies),
234
+ "max": max(latencies),
235
+ "p50": _pct(latencies, 50),
236
+ "p90": _pct(latencies, 90),
237
+ "p95": _pct(latencies, 95),
238
+ "p99": _pct(latencies, 99),
239
+ "p999": _pct(latencies, 99.9),
240
+ }
241
+
242
+ # Aggregate workload-specific `extra` metrics generically: mean + percentiles.
243
+ extras: dict[str, list[float]] = defaultdict(list)
244
+ for s in ok:
245
+ for k, v in s.extra.items():
246
+ if isinstance(v, (int, float)):
247
+ extras[k].append(float(v))
248
+ if extras:
249
+ ext_summary = {}
250
+ for k, vals in extras.items():
251
+ ext_summary[k] = {
252
+ "mean": statistics.mean(vals),
253
+ "p50": _pct(vals, 50),
254
+ "p90": _pct(vals, 90),
255
+ "p99": _pct(vals, 99),
256
+ "min": min(vals),
257
+ "max": max(vals),
258
+ }
259
+ out["workload_metrics"] = ext_summary
260
+
261
+ return out
262
+
263
+
226
264
  def _safe_meta(meta: dict) -> dict:
227
265
  out = {}
228
266
  for k, v in meta.items():
@@ -30,11 +30,31 @@ from benchmaker.workloads.base import WorkloadType
30
30
  from benchmaker.workloads.datasets import Workload, StaticWorkload
31
31
 
32
32
 
33
+ @dataclass
34
+ class BenchLane:
35
+ """One independently scheduled input lane in a mixed benchmark.
36
+
37
+ All lanes share the enclosing :class:`BenchConfig`'s workload type and
38
+ endpoint, but each owns its own data source and load model. This keeps
39
+ OpenAI/HTTP protocol configuration centralized while preserving independent
40
+ arrival processes for phase-swing experiments.
41
+ """
42
+
43
+ name: str
44
+ workload: Workload
45
+ load: LoadModel
46
+
47
+ def __post_init__(self) -> None:
48
+ if not self.name or not self.name.strip():
49
+ raise ValueError("lane name must be a non-empty string")
50
+
51
+
33
52
  @dataclass
34
53
  class BenchConfig:
35
54
  workload_type: WorkloadType # how to talk to the service
36
- load: LoadModel # when to fire
55
+ load: Optional[LoadModel] = None # when to fire (single workload)
37
56
  workload: Workload = field(default_factory=StaticWorkload) # what to send
57
+ lanes: list[BenchLane] = field(default_factory=list)
38
58
  pre_hooks: list[PreRequestHook] = field(default_factory=list)
39
59
  post_hooks: list[PostResponseHook] = field(default_factory=list)
40
60
  monitors: list[Monitor] = field(default_factory=list) # optional periodic samplers
@@ -48,6 +68,16 @@ class BenchConfig:
48
68
  progress_every_s: float = 1.0
49
69
  stop_on_exhausted: bool = True
50
70
 
71
+ def __post_init__(self) -> None:
72
+ if self.lanes:
73
+ if self.load is not None:
74
+ raise ValueError("configure either load/workload or lanes, not both")
75
+ names = [lane.name for lane in self.lanes]
76
+ if len(names) != len(set(names)):
77
+ raise ValueError("mixed benchmark lane names must be unique")
78
+ elif self.load is None:
79
+ raise ValueError("BenchConfig requires a load model or at least one lane")
80
+
51
81
 
52
82
  @dataclass
53
83
  class BenchResult:
@@ -73,7 +103,7 @@ class BenchRunner:
73
103
  try:
74
104
  await self._drive(session)
75
105
  finally:
76
- await self.cfg.workload.aclose()
106
+ await self._aclose_workloads()
77
107
  await self.cfg.workload_type.aclose()
78
108
  if self.cfg.recorder is not None:
79
109
  self.cfg.recorder.close()
@@ -96,19 +126,10 @@ class BenchRunner:
96
126
  ))
97
127
 
98
128
  try:
99
- async for _ in self.cfg.load.tickets():
100
- try:
101
- item = await self.cfg.workload.next_item()
102
- except StopAsyncIteration:
103
- if self.cfg.stop_on_exhausted:
104
- break
105
- else:
106
- continue
107
-
108
- await sem.acquire()
109
- task = asyncio.create_task(self._fire(session, item, sem))
110
- tasks.add(task)
111
- task.add_done_callback(tasks.discard)
129
+ if self.cfg.lanes:
130
+ await self._drive_lanes(session, sem, tasks)
131
+ else:
132
+ await self._drive_single(session, sem, tasks)
112
133
  finally:
113
134
  progress_task.cancel()
114
135
  try:
@@ -124,11 +145,71 @@ class BenchRunner:
124
145
  if monitor_tasks:
125
146
  await asyncio.gather(*monitor_tasks, return_exceptions=True)
126
147
 
148
+ async def _drive_single(self, session: aiohttp.ClientSession,
149
+ sem: asyncio.Semaphore,
150
+ tasks: set[asyncio.Task]) -> None:
151
+ assert self.cfg.load is not None
152
+ async for _ in self.cfg.load.tickets():
153
+ try:
154
+ item = await self.cfg.workload.next_item()
155
+ except StopAsyncIteration:
156
+ if self.cfg.stop_on_exhausted:
157
+ break
158
+ continue
159
+
160
+ await sem.acquire()
161
+ task = asyncio.create_task(
162
+ self._fire(session, item, sem, self.cfg.load)
163
+ )
164
+ tasks.add(task)
165
+ task.add_done_callback(tasks.discard)
166
+
167
+ async def _drive_lanes(self, session: aiohttp.ClientSession,
168
+ sem: asyncio.Semaphore,
169
+ tasks: set[asyncio.Task]) -> None:
170
+ """Run each lane's admission iterator concurrently.
171
+
172
+ A finite workload ends only its own lane. Other lanes keep producing
173
+ tickets, which is required when one dataset is short or intentionally
174
+ bursty and another drives a long complementary phase.
175
+ """
176
+
177
+ async def produce(lane: BenchLane) -> None:
178
+ async for _ in lane.load.tickets():
179
+ try:
180
+ item = await lane.workload.next_item()
181
+ except StopAsyncIteration:
182
+ if self.cfg.stop_on_exhausted:
183
+ break
184
+ continue
185
+
186
+ await sem.acquire()
187
+ task = asyncio.create_task(
188
+ self._fire(session, item, sem, lane.load, lane.name)
189
+ )
190
+ tasks.add(task)
191
+ task.add_done_callback(tasks.discard)
192
+
193
+ producers = [asyncio.create_task(produce(lane)) for lane in self.cfg.lanes]
194
+ try:
195
+ await asyncio.gather(*producers)
196
+ finally:
197
+ for producer in producers:
198
+ if not producer.done():
199
+ producer.cancel()
200
+ await asyncio.gather(*producers, return_exceptions=True)
201
+
127
202
  async def _fire(self, session: aiohttp.ClientSession, item: Any,
128
- sem: asyncio.Semaphore) -> None:
203
+ sem: asyncio.Semaphore, load: LoadModel,
204
+ lane_name: Optional[str] = None) -> None:
129
205
  start_mono = time.monotonic()
130
206
  try:
131
207
  async def fire(req: Request) -> Response:
208
+ if lane_name is not None:
209
+ # The config-defined lane is authoritative: callers may
210
+ # still attach arbitrary metadata, but cannot accidentally
211
+ # collapse a mixed run into an incorrect lane.
212
+ req.meta["lane"] = lane_name
132
213
  for hook in self.cfg.pre_hooks:
133
214
  req = await maybe_await(hook(req))
134
215
  fire_start = time.monotonic()
@@ -145,15 +226,30 @@ class BenchRunner:
145
226
  workload_name=self.cfg.workload_type.name,
146
227
  )
147
228
  sample = await self.cfg.workload_type.run_ticket(ctx)
229
+ if lane_name is not None:
230
+ sample.meta["lane"] = lane_name
148
231
  self.metrics.add(sample)
149
232
  except Exception as e:
150
233
  self.metrics.add(_failure_sample(
151
234
  f"{type(e).__name__}: {e}",
152
235
  self.cfg.workload_type.name,
236
+ lane_name,
153
237
  ))
154
238
  finally:
155
239
  sem.release()
156
- self.cfg.load.on_complete()
240
+ load.on_complete()
241
+
242
+ async def _aclose_workloads(self) -> None:
243
+ workloads = (
244
+ [lane.workload for lane in self.cfg.lanes]
245
+ if self.cfg.lanes
246
+ else [self.cfg.workload]
247
+ )
248
+ closed: set[int] = set()
249
+ for workload in workloads:
250
+ if id(workload) not in closed:
251
+ closed.add(id(workload))
252
+ await workload.aclose()
157
253
 
158
254
  async def _execute(self, session: aiohttp.ClientSession, req: Request,
159
255
  start_mono: float) -> Response:
@@ -254,12 +350,16 @@ class BenchRunner:
254
350
  out_dir,
255
351
  self.metrics,
256
352
  workload_type_name=self.cfg.workload_type.name,
257
- workload_name=self.cfg.workload.name,
353
+ workload_name=(
354
+ "mix:" + ",".join(lane.name for lane in self.cfg.lanes)
355
+ if self.cfg.lanes else self.cfg.workload.name
356
+ ),
258
357
  **kwargs,
259
358
  )
260
359
 
261
360
 
262
- def _failure_sample(error: str, workload: str) -> Sample:
361
+ def _failure_sample(error: str, workload: str,
362
+ lane_name: Optional[str] = None) -> Sample:
263
363
  return Sample(
264
364
  start_ts=time.monotonic(),
265
365
  latency_s=0.0,
@@ -268,6 +368,7 @@ def _failure_sample(error: str, workload: str) -> Sample:
268
368
  request_ok=False,
269
369
  error=error,
270
370
  workload=workload,
371
+ meta={"lane": lane_name} if lane_name is not None else {},
271
372
  )
272
373
 
273
374
 
@@ -157,6 +157,26 @@ class SWEBenchReplayRecipe(Recipe):
157
157
  "trajectory store recorded with tool_results."),
158
158
  click.option("--utilization-interval-sec", "utilization_interval_sec",
159
159
  type=float, default=5.0, show_default=True),
160
+ click.option("--qos-enabled/--no-qos-enabled", "qos_enabled",
161
+ default=False, show_default=True,
162
+ help="Demote verifier-phase containers to best_effort "
163
+ "cpu.weight (and apply the QoS verifier-timeout "
164
+ "multiplier)."),
165
+ click.option("--on-demand-cpu-weight", "on_demand_cpu_weight",
166
+ type=int, default=10000, show_default=True,
167
+ help="cpu.weight for on-demand (agent-phase) containers "
168
+ "when --qos-enabled. Ignored when --no-qos-enabled."),
169
+ click.option("--best-effort-cpu-weight", "best_effort_cpu_weight",
170
+ type=int, default=10, show_default=True,
171
+ help="cpu.weight for best-effort (verifier-phase) "
172
+ "containers when --qos-enabled. Ignored when "
173
+ "--no-qos-enabled."),
174
+ click.option("--qos-verifier-timeout-multiplier",
175
+ "qos_verifier_timeout_multiplier",
176
+ type=float, default=2.0, show_default=True,
177
+ help="Verifier timeout multiplier applied only when "
178
+ "--qos-enabled (QoS demotes verifier CPU, so the "
179
+ "verifier needs more wall-clock time)."),
160
180
  ]
161
181
 
162
182
  def run(self, shared: SharedOpts, *, job, trajectories, concurrency,
@@ -164,7 +184,9 @@ class SWEBenchReplayRecipe(Recipe):
164
184
  dataset, exec_timeout_sec, n_tasks, task, exclude_task, n_attempts,
165
185
  timeout_multiplier, backend_type, request_timeout_sec,
166
186
  agent_ready_timeout_sec, jobs_dir, timeline,
167
- utilization_interval_sec, validate_observations) -> Optional[int]:
187
+ utilization_interval_sec, validate_observations,
188
+ qos_enabled, on_demand_cpu_weight, best_effort_cpu_weight,
189
+ qos_verifier_timeout_multiplier) -> Optional[int]:
168
190
  from benchmaker.swebench import harbor_eval as he
169
191
  from benchmaker.swebench import trajectory as T
170
192
 
@@ -250,6 +272,10 @@ class SWEBenchReplayRecipe(Recipe):
250
272
  request_timeout_sec=request_timeout_sec,
251
273
  agent_ready_timeout_sec=agent_ready_timeout_sec,
252
274
  jobs_dir=jobs_dir,
275
+ qos_enabled=qos_enabled,
276
+ on_demand_cpu_weight=on_demand_cpu_weight,
277
+ best_effort_cpu_weight=best_effort_cpu_weight,
278
+ qos_verifier_timeout_multiplier=qos_verifier_timeout_multiplier,
253
279
  )
254
280
 
255
281
  results: list[tuple] = []
@@ -219,12 +219,28 @@ def _build_job_config(args: argparse.Namespace) -> JobConfig:
219
219
  if jobs_dir:
220
220
  job_kwargs["jobs_dir"] = Path(jobs_dir)
221
221
 
222
+ # QoS: when enabled, splat the cpu.weight knobs into the environment kwargs
223
+ # (consumed by FlashSandboxEnvironment.__init__) and couple the verifier
224
+ # timeout — QoS demotes verifier-phase CPU, so the verifier needs more
225
+ # wall-clock time. Left untouched (None) when QoS is off.
226
+ verifier_timeout_multiplier = None
227
+ # QoS is wired only via the swebench-replay recipe CLI; harbor_eval's own
228
+ # _parse_args does not expose these flags, so this guard no-ops there.
229
+ if getattr(args, "qos_enabled", False):
230
+ environment.kwargs.update(
231
+ qos_enabled=True,
232
+ on_demand_cpu_weight=args.on_demand_cpu_weight,
233
+ best_effort_cpu_weight=args.best_effort_cpu_weight,
234
+ )
235
+ verifier_timeout_multiplier = args.qos_verifier_timeout_multiplier
236
+
222
237
  return JobConfig(
223
238
  job_name=args.job_name or "",
224
239
  n_attempts=args.n_attempts,
225
240
  n_concurrent_trials=args.concurrency,
226
241
  quiet=False,
227
242
  timeout_multiplier=args.timeout_multiplier,
243
+ verifier_timeout_multiplier=verifier_timeout_multiplier,
228
244
  environment=environment,
229
245
  agents=[agent],
230
246
  datasets=[dataset],