benchmaker 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. {benchmaker-0.1.2 → benchmaker-0.1.4}/PKG-INFO +41 -11
  2. {benchmaker-0.1.2 → benchmaker-0.1.4}/README.md +38 -10
  3. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/__init__.py +5 -2
  4. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/config.py +75 -9
  5. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/metrics.py +95 -57
  6. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/runner.py +120 -19
  7. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/swebench_replay.py +92 -22
  8. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/trajectory_replay.py +45 -6
  9. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/harbor_eval.py +22 -1
  10. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/pi_agent.py +174 -35
  11. benchmaker-0.1.4/benchmaker/swebench/pi_ext/register_provider.js +65 -0
  12. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/remote_exec.js +6 -2
  13. benchmaker-0.1.4/benchmaker/swebench/pi_ext/remote_exec_all.js +231 -0
  14. benchmaker-0.1.4/benchmaker/swebench/replay_server.py +448 -0
  15. benchmaker-0.1.4/benchmaker/swebench/timeout_load.py +107 -0
  16. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/trajectory.py +148 -3
  17. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/__init__.py +2 -0
  18. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/datasets.py +11 -0
  19. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/llm.py +7 -0
  20. benchmaker-0.1.4/benchmaker/workloads/rag.py +188 -0
  21. benchmaker-0.1.4/benchmaker/workloads/trajectory.py +437 -0
  22. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/PKG-INFO +41 -11
  23. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/SOURCES.txt +14 -0
  24. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/requires.txt +3 -0
  25. {benchmaker-0.1.2 → benchmaker-0.1.4}/pyproject.toml +2 -1
  26. benchmaker-0.1.4/tests/test_backfill_trajectory_status.py +93 -0
  27. benchmaker-0.1.4/tests/test_collect_sweep_data.py +24 -0
  28. benchmaker-0.1.4/tests/test_collect_trajectories.py +441 -0
  29. benchmaker-0.1.4/tests/test_dedupe_trajectories.py +167 -0
  30. benchmaker-0.1.4/tests/test_mix.py +86 -0
  31. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_pi_agent.py +98 -0
  32. benchmaker-0.1.4/tests/test_pi_agent_timeout_injection.py +47 -0
  33. benchmaker-0.1.4/tests/test_qos_job_config.py +56 -0
  34. benchmaker-0.1.4/tests/test_rag.py +120 -0
  35. benchmaker-0.1.4/tests/test_replay_server.py +454 -0
  36. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_swebench_replay_recipe.py +23 -3
  37. benchmaker-0.1.4/tests/test_timeout_load.py +177 -0
  38. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_trajectory.py +184 -0
  39. benchmaker-0.1.4/tests/test_trajectory_interleave.py +284 -0
  40. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_trajectory_replay.py +43 -0
  41. benchmaker-0.1.2/benchmaker/swebench/replay_server.py +0 -206
  42. benchmaker-0.1.2/benchmaker/workloads/trajectory.py +0 -209
  43. benchmaker-0.1.2/tests/test_replay_server.py +0 -133
  44. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/cli.py +0 -0
  45. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/__init__.py +0 -0
  46. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/load.py +0 -0
  47. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/monitors.py +0 -0
  48. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/trace.py +0 -0
  49. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/types.py +0 -0
  50. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/env.py +0 -0
  51. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/io/__init__.py +0 -0
  52. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/io/bundle.py +0 -0
  53. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/io/collect.py +0 -0
  54. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/__init__.py +0 -0
  55. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/_cli_shared.py +0 -0
  56. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/_factory.py +0 -0
  57. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/base.py +0 -0
  58. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/http.py +0 -0
  59. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/llm.py +0 -0
  60. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/sandbox.py +0 -0
  61. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/sglang.py +0 -0
  62. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/swebench.py +0 -0
  63. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/__init__.py +0 -0
  64. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/_flash_hardening.py +0 -0
  65. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/agent.py +0 -0
  66. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/grading.py +0 -0
  67. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/harbor_agent.py +0 -0
  68. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/observability.py +0 -0
  69. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/max_turns.js +0 -0
  70. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/agent.py +0 -0
  71. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/base.py +0 -0
  72. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/eval.py +0 -0
  73. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/hf.py +0 -0
  74. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/http.py +0 -0
  75. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/sandbox.py +0 -0
  76. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/sglang.py +0 -0
  77. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/dependency_links.txt +0 -0
  78. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/entry_points.txt +0 -0
  79. {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/top_level.txt +0 -0
  80. {benchmaker-0.1.2 → benchmaker-0.1.4}/setup.cfg +0 -0
  81. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_agent.py +0 -0
  82. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_agent_warmup.py +0 -0
  83. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_bundle.py +0 -0
  84. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_coding_agent.py +0 -0
  85. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_eval.py +0 -0
  86. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_flash_hardening.py +0 -0
  87. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_hf.py +0 -0
  88. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_observability.py +0 -0
  89. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_passthrough_meta.py +0 -0
  90. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_recipes_cli.py +0 -0
  91. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_sandbox_duration.py +0 -0
  92. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_sglang.py +0 -0
  93. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_smoke.py +0 -0
  94. {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_trace.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmaker
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
5
5
  Author: Xiaozhe Yao
6
6
  License: MIT
@@ -18,6 +18,8 @@ Requires-Dist: rich>=13; extra == "rich"
18
18
  Provides-Extra: hf
19
19
  Requires-Dist: datasets>=2.18; extra == "hf"
20
20
  Requires-Dist: transformers>=4.40; extra == "hf"
21
+ Provides-Extra: tokenizer
22
+ Requires-Dist: transformers>=4.40; extra == "tokenizer"
21
23
  Provides-Extra: dev
22
24
  Requires-Dist: pytest>=7; extra == "dev"
23
25
  Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
@@ -73,8 +75,8 @@ asyncio.run(main())
73
75
  ```
74
76
 
75
77
  Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
76
- `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`, `sglang`,
77
- `trajectory-replay`):
78
+ `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
79
+ `swebench-replay`, `sglang`, `trajectory-replay`):
78
80
 
79
81
  ```bash
80
82
  benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
@@ -185,16 +187,16 @@ Full docs live in [`docs/`](docs/):
185
187
  - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
186
188
  - [CLI & YAML reference](docs/cli-and-yaml.md)
187
189
  - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
188
- - `benchmaker sglang` native SGLang `/generate` benchmark (see [`docs/sglang.md`](docs/sglang.md)).
189
- - `benchmaker trajectory-replay`multi-turn prefix-cache parity replay of
190
- trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
190
+ - [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
191
+ - [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
192
+ - [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
191
193
 
192
194
  ## Deterministic replay (`swebench-replay`)
193
195
 
194
196
  Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
195
197
  real pi + sandbox + verifier pipeline still runs, only the model is served back
196
198
  from recorded outputs, so re-runs are deterministic and free of model
197
- cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
199
+ cost/variance. Vary `--concurrency` (or `--concurrency-sweep`) to study the rest of the
198
200
  pipeline without the model's stochasticity as a confound. Still needs
199
201
  `FLASH_SANDBOX_URL` (the sandbox + verifier are real).
200
202
 
@@ -207,7 +209,7 @@ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
207
209
  # 2) replay (host mode, localhost) across a concurrency sweep
208
210
  FLASH_SANDBOX_URL=http://localhost:8080 \
209
211
  benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
210
- --mode pi-host --sweep 1,5,25
212
+ --mode pi-host --concurrency-sweep 1,5,25
211
213
 
212
214
  # container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
213
215
  FLASH_SANDBOX_URL=http://localhost:8080 \
@@ -221,6 +223,18 @@ run lacked an instance id) plus the count of assistant messages already in the
221
223
  request — so it is correct at any concurrency. A `MISSES` column in the summary
222
224
  flags any divergence (a request beyond the recorded turns).
223
225
 
226
+ The standalone replay server can also **mock realistic streaming** for
227
+ latency-sensitive benchmarks. Pass a real tokenizer and a per-token delay; the
228
+ first token is emitted immediately (prefill free, TTFT≈0) and each subsequent
229
+ token is spaced by `--inter-token-time` ms. Output stays byte-exact and the
230
+ reported `usage` is the recorded value.
231
+
232
+ ```bash
233
+ pip install 'benchmaker[tokenizer]' # adds transformers for the tokenizer
234
+ python -m benchmaker.swebench.replay_server replay-trajectories.jsonl \
235
+ --tokenizer zai-org/GLM-4.7-Flash --inter-token-time 50
236
+ ```
237
+
224
238
  ## Examples
225
239
 
226
240
  Under [`examples/`](examples/):
@@ -228,9 +242,12 @@ Under [`examples/`](examples/):
228
242
  - `simple_get.py` — minimal library usage
229
243
  - `custom_hooks.py` — request signing + response parsing
230
244
  - `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
245
+ - `llm_from_env.py` — LLM benchmark using `from_env()`
231
246
  - `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
247
+ - `agent_trove.py` — user-defined agent benchmark
232
248
  - `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
233
249
  - `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
250
+ - `bench_sandbox.py` / `bench_sandbox.sh` — sandbox benchmarks
234
251
  - `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
235
252
  - `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
236
253
  - `config.yaml` — generic HTTP YAML config
@@ -254,9 +271,22 @@ benchmaker/ # library code
254
271
  config.py env.py # YAML config loading + .env interpolation
255
272
  core/ # engine: types, load models, runner, metrics, monitors, trace
256
273
  io/ # run output: per-run bundle + cross-run collection
257
- workloads/ # workload-types (http, llm, sandbox, agent, hf, eval)
258
- recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay) + registry
259
- swebench/ # SWE-bench coding agent + grading + harbor adapters
274
+ workloads/
275
+ http.py # HTTP workload-type
276
+ llm.py # OpenAI-compatible chat workload-type
277
+ sandbox.py # Flash Sandbox workload-type
278
+ sglang.py # SGLang native /generate workload-type
279
+ agent.py # user-defined Agent workload-type
280
+ trajectory.py # multi-turn trajectory replay workload
281
+ eval.py # correctness/accuracy evaluation
282
+ hf.py # HuggingFace dataset source
283
+ datasets.py # generic workload/dataset base classes
284
+ base.py # WorkloadType base class
285
+ recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay, sglang, trajectory-replay) + registry
286
+ swebench/
287
+ trajectory.py # convert pi logs to replay trajectories
288
+ replay_server.py # mock-LLM replay server for swebench-replay
289
+ agent.py # SWE-bench coding agent + grading + harbor adapters
260
290
  examples/ # runnable examples (incl. swebench/ coding-agent config)
261
291
  tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
262
292
  tests/ # pytest smoke tests
@@ -45,8 +45,8 @@ asyncio.run(main())
45
45
  ```
46
46
 
47
47
  Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
48
- `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`, `sglang`,
49
- `trajectory-replay`):
48
+ `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
49
+ `swebench-replay`, `sglang`, `trajectory-replay`):
50
50
 
51
51
  ```bash
52
52
  benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
@@ -157,16 +157,16 @@ Full docs live in [`docs/`](docs/):
157
157
  - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
158
158
  - [CLI & YAML reference](docs/cli-and-yaml.md)
159
159
  - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
160
- - `benchmaker sglang` native SGLang `/generate` benchmark (see [`docs/sglang.md`](docs/sglang.md)).
161
- - `benchmaker trajectory-replay`multi-turn prefix-cache parity replay of
162
- trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
160
+ - [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
161
+ - [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
162
+ - [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
163
163
 
164
164
  ## Deterministic replay (`swebench-replay`)
165
165
 
166
166
  Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
167
167
  real pi + sandbox + verifier pipeline still runs, only the model is served back
168
168
  from recorded outputs, so re-runs are deterministic and free of model
169
- cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
169
+ cost/variance. Vary `--concurrency` (or `--concurrency-sweep`) to study the rest of the
170
170
  pipeline without the model's stochasticity as a confound. Still needs
171
171
  `FLASH_SANDBOX_URL` (the sandbox + verifier are real).
172
172
 
@@ -179,7 +179,7 @@ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
179
179
  # 2) replay (host mode, localhost) across a concurrency sweep
180
180
  FLASH_SANDBOX_URL=http://localhost:8080 \
181
181
  benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
182
- --mode pi-host --sweep 1,5,25
182
+ --mode pi-host --concurrency-sweep 1,5,25
183
183
 
184
184
  # container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
185
185
  FLASH_SANDBOX_URL=http://localhost:8080 \
@@ -193,6 +193,18 @@ run lacked an instance id) plus the count of assistant messages already in the
193
193
  request — so it is correct at any concurrency. A `MISSES` column in the summary
194
194
  flags any divergence (a request beyond the recorded turns).
195
195
 
196
+ The standalone replay server can also **mock realistic streaming** for
197
+ latency-sensitive benchmarks. Pass a real tokenizer and a per-token delay; the
198
+ first token is emitted immediately (prefill free, TTFT≈0) and each subsequent
199
+ token is spaced by `--inter-token-time` ms. Output stays byte-exact and the
200
+ reported `usage` is the recorded value.
201
+
202
+ ```bash
203
+ pip install 'benchmaker[tokenizer]' # adds transformers for the tokenizer
204
+ python -m benchmaker.swebench.replay_server replay-trajectories.jsonl \
205
+ --tokenizer zai-org/GLM-4.7-Flash --inter-token-time 50
206
+ ```
207
+
196
208
  ## Examples
197
209
 
198
210
  Under [`examples/`](examples/):
@@ -200,9 +212,12 @@ Under [`examples/`](examples/):
200
212
  - `simple_get.py` — minimal library usage
201
213
  - `custom_hooks.py` — request signing + response parsing
202
214
  - `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
215
+ - `llm_from_env.py` — LLM benchmark using `from_env()`
203
216
  - `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
217
+ - `agent_trove.py` — user-defined agent benchmark
204
218
  - `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
205
219
  - `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
220
+ - `bench_sandbox.py` / `bench_sandbox.sh` — sandbox benchmarks
206
221
  - `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
207
222
  - `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
208
223
  - `config.yaml` — generic HTTP YAML config
@@ -226,9 +241,22 @@ benchmaker/ # library code
226
241
  config.py env.py # YAML config loading + .env interpolation
227
242
  core/ # engine: types, load models, runner, metrics, monitors, trace
228
243
  io/ # run output: per-run bundle + cross-run collection
229
- workloads/ # workload-types (http, llm, sandbox, agent, hf, eval)
230
- recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay) + registry
231
- swebench/ # SWE-bench coding agent + grading + harbor adapters
244
+ workloads/
245
+ http.py # HTTP workload-type
246
+ llm.py # OpenAI-compatible chat workload-type
247
+ sandbox.py # Flash Sandbox workload-type
248
+ sglang.py # SGLang native /generate workload-type
249
+ agent.py # user-defined Agent workload-type
250
+ trajectory.py # multi-turn trajectory replay workload
251
+ eval.py # correctness/accuracy evaluation
252
+ hf.py # HuggingFace dataset source
253
+ datasets.py # generic workload/dataset base classes
254
+ base.py # WorkloadType base class
255
+ recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay, sglang, trajectory-replay) + registry
256
+ swebench/
257
+ trajectory.py # convert pi logs to replay trajectories
258
+ replay_server.py # mock-LLM replay server for swebench-replay
259
+ agent.py # SWE-bench coding agent + grading + harbor adapters
232
260
  examples/ # runnable examples (incl. swebench/ coding-agent config)
233
261
  tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
234
262
  tests/ # pytest smoke tests
@@ -19,6 +19,7 @@ from benchmaker.workloads.http import HttpWorkloadType
19
19
  from benchmaker.workloads.llm import OpenAIChatWorkloadType
20
20
  from benchmaker.workloads.sandbox import SandboxWorkloadType
21
21
  from benchmaker.workloads.hf import HFDatasetWorkload
22
+ from benchmaker.workloads.rag import DeepRAGWorkload
22
23
  from benchmaker.workloads.sglang import SGLangGenerateWorkloadType
23
24
  from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
24
25
  from benchmaker.workloads.agent import (
@@ -59,7 +60,7 @@ from benchmaker.core.monitors import (
59
60
  PrometheusMonitor,
60
61
  parse_prometheus,
61
62
  )
62
- from benchmaker.core.runner import BenchRunner, BenchConfig, BenchResult
63
+ from benchmaker.core.runner import BenchLane, BenchRunner, BenchConfig, BenchResult
63
64
  from benchmaker.core.trace import (
64
65
  ReplayWorkloadType,
65
66
  TracePacedLoad,
@@ -89,6 +90,7 @@ __all__ = [
89
90
  "OpenAIChatWorkloadType",
90
91
  "SandboxWorkloadType",
91
92
  "HFDatasetWorkload",
93
+ "DeepRAGWorkload",
92
94
  "SGLangGenerateWorkloadType",
93
95
  "TrajectoryReplayWorkload",
94
96
  # agent workload (pluggable user-defined agents)
@@ -136,6 +138,7 @@ __all__ = [
136
138
  # runner
137
139
  "BenchRunner",
138
140
  "BenchConfig",
141
+ "BenchLane",
139
142
  "BenchResult",
140
143
  # trace: record & replay
141
144
  "TraceRecorder",
@@ -153,4 +156,4 @@ __all__ = [
153
156
  "write_bundle",
154
157
  ]
155
158
 
156
- __version__ = "0.1.1"
159
+ __version__ = "0.1.4"
@@ -22,7 +22,7 @@ from typing import Any, Callable, Optional
22
22
  from benchmaker.env import interpolate, load_dotenv
23
23
  from benchmaker.core.load import parse_duration, parse_rate_spec
24
24
  from benchmaker.core.monitors import FunctionMonitor, Monitor, PrometheusMonitor
25
- from benchmaker.core.runner import BenchConfig
25
+ from benchmaker.core.runner import BenchConfig, BenchLane
26
26
  from benchmaker.workloads.base import WorkloadType
27
27
  from benchmaker.workloads.datasets import (
28
28
  CallableWorkload,
@@ -31,6 +31,7 @@ from benchmaker.workloads.datasets import (
31
31
  Workload,
32
32
  )
33
33
  from benchmaker.workloads.hf import HFDatasetWorkload
34
+ from benchmaker.workloads.rag import DeepRAGWorkload
34
35
  from benchmaker.workloads.http import HttpWorkloadType
35
36
  from benchmaker.workloads.llm import OpenAIChatWorkloadType
36
37
  from benchmaker.workloads.sandbox import SandboxWorkloadType
@@ -154,6 +155,8 @@ def build_workload(spec: Any) -> Workload:
154
155
  return CallableWorkload(fn=fn, **kwargs)
155
156
  if t in ("hf", "huggingface"):
156
157
  return HFDatasetWorkload(**kwargs)
158
+ if t in ("deeprag", "deep-rag", "rag"):
159
+ return DeepRAGWorkload(**kwargs)
157
160
  if t == "trajectory":
158
161
  from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
159
162
  return TrajectoryReplayWorkload(**kwargs)
@@ -365,8 +368,12 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
365
368
  cfg = interpolate(cfg)
366
369
 
367
370
  replay_spec = cfg.get("replay")
371
+ mix_spec = cfg.get("mix")
372
+ if replay_spec is not None and mix_spec is not None:
373
+ raise ValueError("'replay' and 'mix' are mutually exclusive")
368
374
  if replay_spec is not None:
369
375
  workload_type, workload, load_model = _build_replay(replay_spec)
376
+ lanes: list[BenchLane] = []
370
377
  else:
371
378
  wt_spec = cfg.get("workload_type")
372
379
  if not wt_spec:
@@ -382,16 +389,27 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
382
389
  raise ValueError("config must define 'workload_type' or 'replay'")
383
390
 
384
391
  workload_type = build_workload_type(wt_spec)
385
- workload = build_workload(cfg.get("workload"))
386
-
387
- load_spec = cfg.get("load")
388
- if load_spec is None:
389
- raise ValueError("config must define 'load'")
390
392
  duration = cfg.get("duration") or cfg.get("duration_s")
391
393
  if duration is not None and isinstance(duration, str):
392
394
  duration = parse_duration(duration)
393
- load_model = parse_rate_spec(load_spec, duration_s=duration,
394
- max_requests=cfg.get("max_requests"))
395
+ if mix_spec is not None:
396
+ if cfg.get("load") is not None:
397
+ raise ValueError("a mixed config cannot also define top-level 'load'")
398
+ workload = StaticWorkload()
399
+ load_model = None
400
+ lanes = _build_lanes(
401
+ mix_spec,
402
+ duration_s=duration,
403
+ max_requests=cfg.get("max_requests"),
404
+ )
405
+ else:
406
+ workload = build_workload(cfg.get("workload"))
407
+ load_spec = cfg.get("load")
408
+ if load_spec is None:
409
+ raise ValueError("config must define 'load' or 'mix.lanes'")
410
+ load_model = parse_rate_spec(load_spec, duration_s=duration,
411
+ max_requests=cfg.get("max_requests"))
412
+ lanes = []
395
413
 
396
414
  pre_hooks = [resolve_callable(h) for h in (cfg.get("pre_hooks") or [])]
397
415
  post_hooks = [resolve_callable(h) for h in (cfg.get("post_hooks") or [])]
@@ -407,12 +425,22 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
407
425
  workload_type, extra_post = apply_correctness(workload_type, correctness_spec)
408
426
  post_hooks = list(post_hooks) + list(extra_post)
409
427
 
428
+ # A workload that schedules on per-request completion (e.g. interleaved
429
+ # trajectory replay) declares the post-hook it needs; install it so a YAML
430
+ # config can't silently stall waiting for a signal it never wired up.
431
+ workloads = [lane.workload for lane in lanes] if lanes else [workload]
432
+ for lane_workload in workloads:
433
+ wl_hook = lane_workload.completion_hook()
434
+ if wl_hook is not None and wl_hook not in post_hooks:
435
+ post_hooks = list(post_hooks) + [wl_hook]
436
+
410
437
  recorder = _build_recorder(cfg.get("record"))
411
438
 
412
439
  return BenchConfig(
413
440
  workload_type=workload_type,
414
441
  workload=workload,
415
442
  load=load_model,
443
+ lanes=lanes,
416
444
  pre_hooks=pre_hooks,
417
445
  post_hooks=post_hooks,
418
446
  monitors=monitors,
@@ -421,9 +449,48 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
421
449
  timeout_s=float(cfg.get("timeout_s", 60.0)),
422
450
  max_in_flight=int(cfg.get("max_in_flight", 10000)),
423
451
  progress_every_s=float(cfg.get("progress_every_s", 1.0)),
452
+ stop_on_exhausted=bool(cfg.get("stop_on_exhausted", True)),
424
453
  )
425
454
 
426
455
 
456
+ def _build_lanes(spec: Any, *, duration_s: Optional[float],
457
+ max_requests: Optional[int]) -> list[BenchLane]:
458
+ """Build independent workload/load pairs from a ``mix:`` YAML block."""
459
+ if not isinstance(spec, dict):
460
+ raise TypeError("'mix' must be a mapping with a 'lanes' list")
461
+ lane_specs = spec.get("lanes")
462
+ if not isinstance(lane_specs, list) or not lane_specs:
463
+ raise ValueError("'mix.lanes' must be a non-empty list")
464
+
465
+ lanes: list[BenchLane] = []
466
+ for index, lane_spec in enumerate(lane_specs):
467
+ if not isinstance(lane_spec, dict):
468
+ raise TypeError(f"mix.lanes[{index}] must be a mapping")
469
+ name = lane_spec.get("name")
470
+ if not isinstance(name, str) or not name.strip():
471
+ raise ValueError(f"mix.lanes[{index}].name must be a non-empty string")
472
+ if "workload" not in lane_spec:
473
+ raise ValueError(f"mix.lanes[{index}] must define a workload")
474
+ rate = lane_spec.get("rate", lane_spec.get("load"))
475
+ if rate is None:
476
+ raise ValueError(f"mix.lanes[{index}] must define rate (or load)")
477
+
478
+ lane_duration = lane_spec.get("duration", duration_s)
479
+ if isinstance(lane_duration, str):
480
+ lane_duration = parse_duration(lane_duration)
481
+ lane_max_requests = lane_spec.get("max_requests", max_requests)
482
+ lanes.append(BenchLane(
483
+ name=name,
484
+ workload=build_workload(lane_spec["workload"]),
485
+ load=parse_rate_spec(
486
+ rate,
487
+ duration_s=lane_duration,
488
+ max_requests=lane_max_requests,
489
+ ),
490
+ ))
491
+ return lanes
492
+
493
+
427
494
  def _build_recorder(spec: Any) -> Optional[TraceRecorder]:
428
495
  if spec is None:
429
496
  return None
@@ -451,4 +518,3 @@ def _build_replay(spec: Any) -> tuple[WorkloadType, Workload, Any]:
451
518
  TracePacedLoad(trace, speed=speed),
452
519
  )
453
520
 
454
-
@@ -52,64 +52,22 @@ class MetricsAggregator:
52
52
  def summary(self) -> dict:
53
53
  end = self.end_time or time.monotonic()
54
54
  wall_s = max(end - self.start_time, 1e-9)
55
- ok = [s for s in self.samples if s.ok]
56
- fail = [s for s in self.samples if not s.ok]
57
- # Split fail into transport failures vs. delivered-but-graded-wrong.
58
- wrong = [s for s in fail if s.request_ok]
59
- request_failed = [s for s in fail if not s.request_ok]
60
- latencies = [s.latency_s for s in ok]
61
-
62
- status_counts = Counter(s.status for s in self.samples)
63
- error_counts = Counter(s.error for s in fail if s.error)
64
-
65
- out: dict = {
66
- "wall_time_s": wall_s,
67
- "total_requests": len(self.samples),
68
- "success": len(ok),
69
- "failed": len(fail),
70
- "request_failed": len(request_failed),
71
- "wrong_output": len(wrong),
72
- "error_rate": (len(fail) / len(self.samples)) if self.samples else 0.0,
73
- "request_failure_rate": (
74
- (len(request_failed) / len(self.samples)) if self.samples else 0.0
75
- ),
76
- "throughput_rps": len(self.samples) / wall_s,
77
- "goodput_rps": len(ok) / wall_s,
78
- "bytes_sent": sum(s.bytes_sent for s in self.samples),
79
- "bytes_recv": sum(s.bytes_recv for s in self.samples),
80
- "status_codes": dict(status_counts),
81
- "errors": dict(error_counts),
82
- }
83
- if latencies:
84
- out["latency_s"] = {
85
- "mean": statistics.mean(latencies),
86
- "min": min(latencies),
87
- "max": max(latencies),
88
- "p50": _pct(latencies, 50),
89
- "p90": _pct(latencies, 90),
90
- "p95": _pct(latencies, 95),
91
- "p99": _pct(latencies, 99),
92
- "p999": _pct(latencies, 99.9),
93
- }
55
+ out = _summary_for_samples(self.samples, wall_s)
94
56
 
95
- # Aggregate workload-specific `extra` metrics generically: mean + percentiles.
96
- extras: dict[str, list[float]] = defaultdict(list)
97
- for s in ok:
98
- for k, v in s.extra.items():
99
- if isinstance(v, (int, float)):
100
- extras[k].append(float(v))
101
- if extras:
102
- ext_summary = {}
103
- for k, vals in extras.items():
104
- ext_summary[k] = {
105
- "mean": statistics.mean(vals),
106
- "p50": _pct(vals, 50),
107
- "p90": _pct(vals, 90),
108
- "p99": _pct(vals, 99),
109
- "min": min(vals),
110
- "max": max(vals),
111
- }
112
- out["workload_metrics"] = ext_summary
57
+ # A mixed benchmark needs each lane's SLO signal independently. Use
58
+ # the same wall-clock interval as the aggregate so lane throughput is
59
+ # directly comparable to the total, while latency and workload metrics
60
+ # remain scoped to that lane's samples.
61
+ lanes: dict[str, list[Sample]] = defaultdict(list)
62
+ for sample in self.samples:
63
+ lane = sample.meta.get("lane")
64
+ if isinstance(lane, str) and lane:
65
+ lanes[lane].append(sample)
66
+ if lanes:
67
+ out["lanes"] = {
68
+ name: _summary_for_samples(samples, wall_s)
69
+ for name, samples in sorted(lanes.items())
70
+ }
113
71
 
114
72
  # Monitor time-series: summarize each metric per monitor.
115
73
  if self.monitor_samples:
@@ -181,6 +139,22 @@ class MetricsAggregator:
181
139
  lines.append(f" {k}")
182
140
  for kk in ("mean", "p50", "p90", "p99", "max"):
183
141
  lines.append(f" {kk:<6}: {v[kk]:.4f}")
142
+ if s.get("lanes"):
143
+ lines.append("")
144
+ lines.append(" lanes")
145
+ for name, lane in s["lanes"].items():
146
+ lines.append(
147
+ f" {name}: {lane['total_requests']} requests, "
148
+ f"{lane['throughput_rps']:.2f} req/s, "
149
+ f"{lane['success']} success"
150
+ )
151
+ for metric in ("ttft_s", "itl_ms_mean", "tokens_per_s"):
152
+ values = lane.get("workload_metrics", {}).get(metric)
153
+ if values:
154
+ lines.append(
155
+ f" {metric}: p50={values['p50']:.4f}, "
156
+ f"p99={values['p99']:.4f}"
157
+ )
184
158
  if s.get("monitors"):
185
159
  for mon_name, mon in s["monitors"].items():
186
160
  lines.append("")
@@ -223,6 +197,70 @@ class MetricsAggregator:
223
197
  }) + "\n")
224
198
 
225
199
 
200
+ def _summary_for_samples(samples: list[Sample], wall_s: float) -> dict:
201
+ """Summarize a sample subset over a shared benchmark wall-clock interval."""
202
+ ok = [s for s in samples if s.ok]
203
+ fail = [s for s in samples if not s.ok]
204
+ # Split fail into transport failures vs. delivered-but-graded-wrong.
205
+ wrong = [s for s in fail if s.request_ok]
206
+ request_failed = [s for s in fail if not s.request_ok]
207
+ latencies = [s.latency_s for s in ok]
208
+
209
+ status_counts = Counter(s.status for s in samples)
210
+ error_counts = Counter(s.error for s in fail if s.error)
211
+
212
+ out: dict = {
213
+ "wall_time_s": wall_s,
214
+ "total_requests": len(samples),
215
+ "success": len(ok),
216
+ "failed": len(fail),
217
+ "request_failed": len(request_failed),
218
+ "wrong_output": len(wrong),
219
+ "error_rate": (len(fail) / len(samples)) if samples else 0.0,
220
+ "request_failure_rate": (
221
+ (len(request_failed) / len(samples)) if samples else 0.0
222
+ ),
223
+ "throughput_rps": len(samples) / wall_s,
224
+ "goodput_rps": len(ok) / wall_s,
225
+ "bytes_sent": sum(s.bytes_sent for s in samples),
226
+ "bytes_recv": sum(s.bytes_recv for s in samples),
227
+ "status_codes": dict(status_counts),
228
+ "errors": dict(error_counts),
229
+ }
230
+ if latencies:
231
+ out["latency_s"] = {
232
+ "mean": statistics.mean(latencies),
233
+ "min": min(latencies),
234
+ "max": max(latencies),
235
+ "p50": _pct(latencies, 50),
236
+ "p90": _pct(latencies, 90),
237
+ "p95": _pct(latencies, 95),
238
+ "p99": _pct(latencies, 99),
239
+ "p999": _pct(latencies, 99.9),
240
+ }
241
+
242
+ # Aggregate workload-specific `extra` metrics generically: mean + percentiles.
243
+ extras: dict[str, list[float]] = defaultdict(list)
244
+ for s in ok:
245
+ for k, v in s.extra.items():
246
+ if isinstance(v, (int, float)):
247
+ extras[k].append(float(v))
248
+ if extras:
249
+ ext_summary = {}
250
+ for k, vals in extras.items():
251
+ ext_summary[k] = {
252
+ "mean": statistics.mean(vals),
253
+ "p50": _pct(vals, 50),
254
+ "p90": _pct(vals, 90),
255
+ "p99": _pct(vals, 99),
256
+ "min": min(vals),
257
+ "max": max(vals),
258
+ }
259
+ out["workload_metrics"] = ext_summary
260
+
261
+ return out
262
+
263
+
226
264
  def _safe_meta(meta: dict) -> dict:
227
265
  out = {}
228
266
  for k, v in meta.items():