benchmaker 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchmaker-0.1.2 → benchmaker-0.1.4}/PKG-INFO +41 -11
- {benchmaker-0.1.2 → benchmaker-0.1.4}/README.md +38 -10
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/__init__.py +5 -2
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/config.py +75 -9
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/metrics.py +95 -57
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/runner.py +120 -19
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/swebench_replay.py +92 -22
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/trajectory_replay.py +45 -6
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/harbor_eval.py +22 -1
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/pi_agent.py +174 -35
- benchmaker-0.1.4/benchmaker/swebench/pi_ext/register_provider.js +65 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/remote_exec.js +6 -2
- benchmaker-0.1.4/benchmaker/swebench/pi_ext/remote_exec_all.js +231 -0
- benchmaker-0.1.4/benchmaker/swebench/replay_server.py +448 -0
- benchmaker-0.1.4/benchmaker/swebench/timeout_load.py +107 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/trajectory.py +148 -3
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/__init__.py +2 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/datasets.py +11 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/llm.py +7 -0
- benchmaker-0.1.4/benchmaker/workloads/rag.py +188 -0
- benchmaker-0.1.4/benchmaker/workloads/trajectory.py +437 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/PKG-INFO +41 -11
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/SOURCES.txt +14 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/requires.txt +3 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/pyproject.toml +2 -1
- benchmaker-0.1.4/tests/test_backfill_trajectory_status.py +93 -0
- benchmaker-0.1.4/tests/test_collect_sweep_data.py +24 -0
- benchmaker-0.1.4/tests/test_collect_trajectories.py +441 -0
- benchmaker-0.1.4/tests/test_dedupe_trajectories.py +167 -0
- benchmaker-0.1.4/tests/test_mix.py +86 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_pi_agent.py +98 -0
- benchmaker-0.1.4/tests/test_pi_agent_timeout_injection.py +47 -0
- benchmaker-0.1.4/tests/test_qos_job_config.py +56 -0
- benchmaker-0.1.4/tests/test_rag.py +120 -0
- benchmaker-0.1.4/tests/test_replay_server.py +454 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_swebench_replay_recipe.py +23 -3
- benchmaker-0.1.4/tests/test_timeout_load.py +177 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_trajectory.py +184 -0
- benchmaker-0.1.4/tests/test_trajectory_interleave.py +284 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_trajectory_replay.py +43 -0
- benchmaker-0.1.2/benchmaker/swebench/replay_server.py +0 -206
- benchmaker-0.1.2/benchmaker/workloads/trajectory.py +0 -209
- benchmaker-0.1.2/tests/test_replay_server.py +0 -133
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/cli.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/__init__.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/load.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/monitors.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/trace.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/core/types.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/env.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/io/__init__.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/io/bundle.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/io/collect.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/__init__.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/_cli_shared.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/_factory.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/base.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/http.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/llm.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/sandbox.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/sglang.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/recipes/swebench.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/__init__.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/_flash_hardening.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/grading.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/harbor_agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/observability.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/max_turns.js +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/base.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/eval.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/hf.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/http.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/sandbox.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker/workloads/sglang.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/dependency_links.txt +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/entry_points.txt +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/benchmaker.egg-info/top_level.txt +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/setup.cfg +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_agent_warmup.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_bundle.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_coding_agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_eval.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_flash_hardening.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_hf.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_observability.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_passthrough_meta.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_recipes_cli.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_sandbox_duration.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_sglang.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_smoke.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.4}/tests/test_trace.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchmaker
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
|
|
5
5
|
Author: Xiaozhe Yao
|
|
6
6
|
License: MIT
|
|
@@ -18,6 +18,8 @@ Requires-Dist: rich>=13; extra == "rich"
|
|
|
18
18
|
Provides-Extra: hf
|
|
19
19
|
Requires-Dist: datasets>=2.18; extra == "hf"
|
|
20
20
|
Requires-Dist: transformers>=4.40; extra == "hf"
|
|
21
|
+
Provides-Extra: tokenizer
|
|
22
|
+
Requires-Dist: transformers>=4.40; extra == "tokenizer"
|
|
21
23
|
Provides-Extra: dev
|
|
22
24
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
23
25
|
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
@@ -73,8 +75,8 @@ asyncio.run(main())
|
|
|
73
75
|
```
|
|
74
76
|
|
|
75
77
|
Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
|
|
76
|
-
`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
|
|
77
|
-
`trajectory-replay`):
|
|
78
|
+
`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
|
|
79
|
+
`swebench-replay`, `sglang`, `trajectory-replay`):
|
|
78
80
|
|
|
79
81
|
```bash
|
|
80
82
|
benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
|
|
@@ -185,16 +187,16 @@ Full docs live in [`docs/`](docs/):
|
|
|
185
187
|
- [Correctness / accuracy eval](docs/eval.md) — grade responses against references
|
|
186
188
|
- [CLI & YAML reference](docs/cli-and-yaml.md)
|
|
187
189
|
- [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
|
|
188
|
-
-
|
|
189
|
-
-
|
|
190
|
-
|
|
190
|
+
- [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
|
|
191
|
+
- [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
|
|
192
|
+
- [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
|
|
191
193
|
|
|
192
194
|
## Deterministic replay (`swebench-replay`)
|
|
193
195
|
|
|
194
196
|
Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
|
|
195
197
|
real pi + sandbox + verifier pipeline still runs, only the model is served back
|
|
196
198
|
from recorded outputs, so re-runs are deterministic and free of model
|
|
197
|
-
cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
|
|
199
|
+
cost/variance. Vary `--concurrency` (or `--concurrency-sweep`) to study the rest of the
|
|
198
200
|
pipeline without the model's stochasticity as a confound. Still needs
|
|
199
201
|
`FLASH_SANDBOX_URL` (the sandbox + verifier are real).
|
|
200
202
|
|
|
@@ -207,7 +209,7 @@ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
|
|
|
207
209
|
# 2) replay (host mode, localhost) across a concurrency sweep
|
|
208
210
|
FLASH_SANDBOX_URL=http://localhost:8080 \
|
|
209
211
|
benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
|
|
210
|
-
--mode pi-host --sweep 1,5,25
|
|
212
|
+
--mode pi-host --concurrency-sweep 1,5,25
|
|
211
213
|
|
|
212
214
|
# container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
|
|
213
215
|
FLASH_SANDBOX_URL=http://localhost:8080 \
|
|
@@ -221,6 +223,18 @@ run lacked an instance id) plus the count of assistant messages already in the
|
|
|
221
223
|
request — so it is correct at any concurrency. A `MISSES` column in the summary
|
|
222
224
|
flags any divergence (a request beyond the recorded turns).
|
|
223
225
|
|
|
226
|
+
The standalone replay server can also **mock realistic streaming** for
|
|
227
|
+
latency-sensitive benchmarks. Pass a real tokenizer and a per-token delay; the
|
|
228
|
+
first token is emitted immediately (prefill free, TTFT≈0) and each subsequent
|
|
229
|
+
token is spaced by `--inter-token-time` ms. Output stays byte-exact and the
|
|
230
|
+
reported `usage` is the recorded value.
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
pip install 'benchmaker[tokenizer]' # adds transformers for the tokenizer
|
|
234
|
+
python -m benchmaker.swebench.replay_server replay-trajectories.jsonl \
|
|
235
|
+
--tokenizer zai-org/GLM-4.7-Flash --inter-token-time 50
|
|
236
|
+
```
|
|
237
|
+
|
|
224
238
|
## Examples
|
|
225
239
|
|
|
226
240
|
Under [`examples/`](examples/):
|
|
@@ -228,9 +242,12 @@ Under [`examples/`](examples/):
|
|
|
228
242
|
- `simple_get.py` — minimal library usage
|
|
229
243
|
- `custom_hooks.py` — request signing + response parsing
|
|
230
244
|
- `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
|
|
245
|
+
- `llm_from_env.py` — LLM benchmark using `from_env()`
|
|
231
246
|
- `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
|
|
247
|
+
- `agent_trove.py` — user-defined agent benchmark
|
|
232
248
|
- `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
|
|
233
249
|
- `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
|
|
250
|
+
- `bench_sandbox.py` / `bench_sandbox.sh` — sandbox benchmarks
|
|
234
251
|
- `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
|
|
235
252
|
- `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
|
|
236
253
|
- `config.yaml` — generic HTTP YAML config
|
|
@@ -254,9 +271,22 @@ benchmaker/ # library code
|
|
|
254
271
|
config.py env.py # YAML config loading + .env interpolation
|
|
255
272
|
core/ # engine: types, load models, runner, metrics, monitors, trace
|
|
256
273
|
io/ # run output: per-run bundle + cross-run collection
|
|
257
|
-
workloads/
|
|
258
|
-
|
|
259
|
-
|
|
274
|
+
workloads/
|
|
275
|
+
http.py # HTTP workload-type
|
|
276
|
+
llm.py # OpenAI-compatible chat workload-type
|
|
277
|
+
sandbox.py # Flash Sandbox workload-type
|
|
278
|
+
sglang.py # SGLang native /generate workload-type
|
|
279
|
+
agent.py # user-defined Agent workload-type
|
|
280
|
+
trajectory.py # multi-turn trajectory replay workload
|
|
281
|
+
eval.py # correctness/accuracy evaluation
|
|
282
|
+
hf.py # HuggingFace dataset source
|
|
283
|
+
datasets.py # generic workload/dataset base classes
|
|
284
|
+
base.py # WorkloadType base class
|
|
285
|
+
recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay, sglang, trajectory-replay) + registry
|
|
286
|
+
swebench/
|
|
287
|
+
trajectory.py # convert pi logs to replay trajectories
|
|
288
|
+
replay_server.py # mock-LLM replay server for swebench-replay
|
|
289
|
+
agent.py # SWE-bench coding agent + grading + harbor adapters
|
|
260
290
|
examples/ # runnable examples (incl. swebench/ coding-agent config)
|
|
261
291
|
tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
|
|
262
292
|
tests/ # pytest smoke tests
|
|
@@ -45,8 +45,8 @@ asyncio.run(main())
|
|
|
45
45
|
```
|
|
46
46
|
|
|
47
47
|
Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
|
|
48
|
-
`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
|
|
49
|
-
`trajectory-replay`):
|
|
48
|
+
`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
|
|
49
|
+
`swebench-replay`, `sglang`, `trajectory-replay`):
|
|
50
50
|
|
|
51
51
|
```bash
|
|
52
52
|
benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
|
|
@@ -157,16 +157,16 @@ Full docs live in [`docs/`](docs/):
|
|
|
157
157
|
- [Correctness / accuracy eval](docs/eval.md) — grade responses against references
|
|
158
158
|
- [CLI & YAML reference](docs/cli-and-yaml.md)
|
|
159
159
|
- [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
|
|
160
|
-
-
|
|
161
|
-
-
|
|
162
|
-
|
|
160
|
+
- [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
|
|
161
|
+
- [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
|
|
162
|
+
- [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
|
|
163
163
|
|
|
164
164
|
## Deterministic replay (`swebench-replay`)
|
|
165
165
|
|
|
166
166
|
Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
|
|
167
167
|
real pi + sandbox + verifier pipeline still runs, only the model is served back
|
|
168
168
|
from recorded outputs, so re-runs are deterministic and free of model
|
|
169
|
-
cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
|
|
169
|
+
cost/variance. Vary `--concurrency` (or `--concurrency-sweep`) to study the rest of the
|
|
170
170
|
pipeline without the model's stochasticity as a confound. Still needs
|
|
171
171
|
`FLASH_SANDBOX_URL` (the sandbox + verifier are real).
|
|
172
172
|
|
|
@@ -179,7 +179,7 @@ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
|
|
|
179
179
|
# 2) replay (host mode, localhost) across a concurrency sweep
|
|
180
180
|
FLASH_SANDBOX_URL=http://localhost:8080 \
|
|
181
181
|
benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
|
|
182
|
-
--mode pi-host --sweep 1,5,25
|
|
182
|
+
--mode pi-host --concurrency-sweep 1,5,25
|
|
183
183
|
|
|
184
184
|
# container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
|
|
185
185
|
FLASH_SANDBOX_URL=http://localhost:8080 \
|
|
@@ -193,6 +193,18 @@ run lacked an instance id) plus the count of assistant messages already in the
|
|
|
193
193
|
request — so it is correct at any concurrency. A `MISSES` column in the summary
|
|
194
194
|
flags any divergence (a request beyond the recorded turns).
|
|
195
195
|
|
|
196
|
+
The standalone replay server can also **mock realistic streaming** for
|
|
197
|
+
latency-sensitive benchmarks. Pass a real tokenizer and a per-token delay; the
|
|
198
|
+
first token is emitted immediately (prefill free, TTFT≈0) and each subsequent
|
|
199
|
+
token is spaced by `--inter-token-time` ms. Output stays byte-exact and the
|
|
200
|
+
reported `usage` is the recorded value.
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
pip install 'benchmaker[tokenizer]' # adds transformers for the tokenizer
|
|
204
|
+
python -m benchmaker.swebench.replay_server replay-trajectories.jsonl \
|
|
205
|
+
--tokenizer zai-org/GLM-4.7-Flash --inter-token-time 50
|
|
206
|
+
```
|
|
207
|
+
|
|
196
208
|
## Examples
|
|
197
209
|
|
|
198
210
|
Under [`examples/`](examples/):
|
|
@@ -200,9 +212,12 @@ Under [`examples/`](examples/):
|
|
|
200
212
|
- `simple_get.py` — minimal library usage
|
|
201
213
|
- `custom_hooks.py` — request signing + response parsing
|
|
202
214
|
- `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
|
|
215
|
+
- `llm_from_env.py` — LLM benchmark using `from_env()`
|
|
203
216
|
- `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
|
|
217
|
+
- `agent_trove.py` — user-defined agent benchmark
|
|
204
218
|
- `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
|
|
205
219
|
- `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
|
|
220
|
+
- `bench_sandbox.py` / `bench_sandbox.sh` — sandbox benchmarks
|
|
206
221
|
- `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
|
|
207
222
|
- `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
|
|
208
223
|
- `config.yaml` — generic HTTP YAML config
|
|
@@ -226,9 +241,22 @@ benchmaker/ # library code
|
|
|
226
241
|
config.py env.py # YAML config loading + .env interpolation
|
|
227
242
|
core/ # engine: types, load models, runner, metrics, monitors, trace
|
|
228
243
|
io/ # run output: per-run bundle + cross-run collection
|
|
229
|
-
workloads/
|
|
230
|
-
|
|
231
|
-
|
|
244
|
+
workloads/
|
|
245
|
+
http.py # HTTP workload-type
|
|
246
|
+
llm.py # OpenAI-compatible chat workload-type
|
|
247
|
+
sandbox.py # Flash Sandbox workload-type
|
|
248
|
+
sglang.py # SGLang native /generate workload-type
|
|
249
|
+
agent.py # user-defined Agent workload-type
|
|
250
|
+
trajectory.py # multi-turn trajectory replay workload
|
|
251
|
+
eval.py # correctness/accuracy evaluation
|
|
252
|
+
hf.py # HuggingFace dataset source
|
|
253
|
+
datasets.py # generic workload/dataset base classes
|
|
254
|
+
base.py # WorkloadType base class
|
|
255
|
+
recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay, sglang, trajectory-replay) + registry
|
|
256
|
+
swebench/
|
|
257
|
+
trajectory.py # convert pi logs to replay trajectories
|
|
258
|
+
replay_server.py # mock-LLM replay server for swebench-replay
|
|
259
|
+
agent.py # SWE-bench coding agent + grading + harbor adapters
|
|
232
260
|
examples/ # runnable examples (incl. swebench/ coding-agent config)
|
|
233
261
|
tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
|
|
234
262
|
tests/ # pytest smoke tests
|
|
@@ -19,6 +19,7 @@ from benchmaker.workloads.http import HttpWorkloadType
|
|
|
19
19
|
from benchmaker.workloads.llm import OpenAIChatWorkloadType
|
|
20
20
|
from benchmaker.workloads.sandbox import SandboxWorkloadType
|
|
21
21
|
from benchmaker.workloads.hf import HFDatasetWorkload
|
|
22
|
+
from benchmaker.workloads.rag import DeepRAGWorkload
|
|
22
23
|
from benchmaker.workloads.sglang import SGLangGenerateWorkloadType
|
|
23
24
|
from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
|
|
24
25
|
from benchmaker.workloads.agent import (
|
|
@@ -59,7 +60,7 @@ from benchmaker.core.monitors import (
|
|
|
59
60
|
PrometheusMonitor,
|
|
60
61
|
parse_prometheus,
|
|
61
62
|
)
|
|
62
|
-
from benchmaker.core.runner import BenchRunner, BenchConfig, BenchResult
|
|
63
|
+
from benchmaker.core.runner import BenchLane, BenchRunner, BenchConfig, BenchResult
|
|
63
64
|
from benchmaker.core.trace import (
|
|
64
65
|
ReplayWorkloadType,
|
|
65
66
|
TracePacedLoad,
|
|
@@ -89,6 +90,7 @@ __all__ = [
|
|
|
89
90
|
"OpenAIChatWorkloadType",
|
|
90
91
|
"SandboxWorkloadType",
|
|
91
92
|
"HFDatasetWorkload",
|
|
93
|
+
"DeepRAGWorkload",
|
|
92
94
|
"SGLangGenerateWorkloadType",
|
|
93
95
|
"TrajectoryReplayWorkload",
|
|
94
96
|
# agent workload (pluggable user-defined agents)
|
|
@@ -136,6 +138,7 @@ __all__ = [
|
|
|
136
138
|
# runner
|
|
137
139
|
"BenchRunner",
|
|
138
140
|
"BenchConfig",
|
|
141
|
+
"BenchLane",
|
|
139
142
|
"BenchResult",
|
|
140
143
|
# trace: record & replay
|
|
141
144
|
"TraceRecorder",
|
|
@@ -153,4 +156,4 @@ __all__ = [
|
|
|
153
156
|
"write_bundle",
|
|
154
157
|
]
|
|
155
158
|
|
|
156
|
-
__version__ = "0.1.2"
|
|
159
|
+
__version__ = "0.1.4"
|
|
@@ -22,7 +22,7 @@ from typing import Any, Callable, Optional
|
|
|
22
22
|
from benchmaker.env import interpolate, load_dotenv
|
|
23
23
|
from benchmaker.core.load import parse_duration, parse_rate_spec
|
|
24
24
|
from benchmaker.core.monitors import FunctionMonitor, Monitor, PrometheusMonitor
|
|
25
|
-
from benchmaker.core.runner import BenchConfig
|
|
25
|
+
from benchmaker.core.runner import BenchConfig, BenchLane
|
|
26
26
|
from benchmaker.workloads.base import WorkloadType
|
|
27
27
|
from benchmaker.workloads.datasets import (
|
|
28
28
|
CallableWorkload,
|
|
@@ -31,6 +31,7 @@ from benchmaker.workloads.datasets import (
|
|
|
31
31
|
Workload,
|
|
32
32
|
)
|
|
33
33
|
from benchmaker.workloads.hf import HFDatasetWorkload
|
|
34
|
+
from benchmaker.workloads.rag import DeepRAGWorkload
|
|
34
35
|
from benchmaker.workloads.http import HttpWorkloadType
|
|
35
36
|
from benchmaker.workloads.llm import OpenAIChatWorkloadType
|
|
36
37
|
from benchmaker.workloads.sandbox import SandboxWorkloadType
|
|
@@ -154,6 +155,8 @@ def build_workload(spec: Any) -> Workload:
|
|
|
154
155
|
return CallableWorkload(fn=fn, **kwargs)
|
|
155
156
|
if t in ("hf", "huggingface"):
|
|
156
157
|
return HFDatasetWorkload(**kwargs)
|
|
158
|
+
if t in ("deeprag", "deep-rag", "rag"):
|
|
159
|
+
return DeepRAGWorkload(**kwargs)
|
|
157
160
|
if t == "trajectory":
|
|
158
161
|
from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
|
|
159
162
|
return TrajectoryReplayWorkload(**kwargs)
|
|
@@ -365,8 +368,12 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
365
368
|
cfg = interpolate(cfg)
|
|
366
369
|
|
|
367
370
|
replay_spec = cfg.get("replay")
|
|
371
|
+
mix_spec = cfg.get("mix")
|
|
372
|
+
if replay_spec is not None and mix_spec is not None:
|
|
373
|
+
raise ValueError("'replay' and 'mix' are mutually exclusive")
|
|
368
374
|
if replay_spec is not None:
|
|
369
375
|
workload_type, workload, load_model = _build_replay(replay_spec)
|
|
376
|
+
lanes: list[BenchLane] = []
|
|
370
377
|
else:
|
|
371
378
|
wt_spec = cfg.get("workload_type")
|
|
372
379
|
if not wt_spec:
|
|
@@ -382,16 +389,27 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
382
389
|
raise ValueError("config must define 'workload_type' or 'replay'")
|
|
383
390
|
|
|
384
391
|
workload_type = build_workload_type(wt_spec)
|
|
385
|
-
workload = build_workload(cfg.get("workload"))
|
|
386
|
-
|
|
387
|
-
load_spec = cfg.get("load")
|
|
388
|
-
if load_spec is None:
|
|
389
|
-
raise ValueError("config must define 'load'")
|
|
390
392
|
duration = cfg.get("duration") or cfg.get("duration_s")
|
|
391
393
|
if duration is not None and isinstance(duration, str):
|
|
392
394
|
duration = parse_duration(duration)
|
|
393
|
-
|
|
394
|
-
|
|
395
|
+
if mix_spec is not None:
|
|
396
|
+
if cfg.get("load") is not None:
|
|
397
|
+
raise ValueError("a mixed config cannot also define top-level 'load'")
|
|
398
|
+
workload = StaticWorkload()
|
|
399
|
+
load_model = None
|
|
400
|
+
lanes = _build_lanes(
|
|
401
|
+
mix_spec,
|
|
402
|
+
duration_s=duration,
|
|
403
|
+
max_requests=cfg.get("max_requests"),
|
|
404
|
+
)
|
|
405
|
+
else:
|
|
406
|
+
workload = build_workload(cfg.get("workload"))
|
|
407
|
+
load_spec = cfg.get("load")
|
|
408
|
+
if load_spec is None:
|
|
409
|
+
raise ValueError("config must define 'load' or 'mix.lanes'")
|
|
410
|
+
load_model = parse_rate_spec(load_spec, duration_s=duration,
|
|
411
|
+
max_requests=cfg.get("max_requests"))
|
|
412
|
+
lanes = []
|
|
395
413
|
|
|
396
414
|
pre_hooks = [resolve_callable(h) for h in (cfg.get("pre_hooks") or [])]
|
|
397
415
|
post_hooks = [resolve_callable(h) for h in (cfg.get("post_hooks") or [])]
|
|
@@ -407,12 +425,22 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
407
425
|
workload_type, extra_post = apply_correctness(workload_type, correctness_spec)
|
|
408
426
|
post_hooks = list(post_hooks) + list(extra_post)
|
|
409
427
|
|
|
428
|
+
# A workload that schedules on per-request completion (e.g. interleaved
|
|
429
|
+
# trajectory replay) declares the post-hook it needs; install it so a YAML
|
|
430
|
+
# config can't silently stall waiting for a signal it never wired up.
|
|
431
|
+
workloads = [lane.workload for lane in lanes] if lanes else [workload]
|
|
432
|
+
for lane_workload in workloads:
|
|
433
|
+
wl_hook = lane_workload.completion_hook()
|
|
434
|
+
if wl_hook is not None and wl_hook not in post_hooks:
|
|
435
|
+
post_hooks = list(post_hooks) + [wl_hook]
|
|
436
|
+
|
|
410
437
|
recorder = _build_recorder(cfg.get("record"))
|
|
411
438
|
|
|
412
439
|
return BenchConfig(
|
|
413
440
|
workload_type=workload_type,
|
|
414
441
|
workload=workload,
|
|
415
442
|
load=load_model,
|
|
443
|
+
lanes=lanes,
|
|
416
444
|
pre_hooks=pre_hooks,
|
|
417
445
|
post_hooks=post_hooks,
|
|
418
446
|
monitors=monitors,
|
|
@@ -421,9 +449,48 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
421
449
|
timeout_s=float(cfg.get("timeout_s", 60.0)),
|
|
422
450
|
max_in_flight=int(cfg.get("max_in_flight", 10000)),
|
|
423
451
|
progress_every_s=float(cfg.get("progress_every_s", 1.0)),
|
|
452
|
+
stop_on_exhausted=bool(cfg.get("stop_on_exhausted", True)),
|
|
424
453
|
)
|
|
425
454
|
|
|
426
455
|
|
|
456
|
+
def _build_lanes(spec: Any, *, duration_s: Optional[float],
|
|
457
|
+
max_requests: Optional[int]) -> list[BenchLane]:
|
|
458
|
+
"""Build independent workload/load pairs from a ``mix:`` YAML block."""
|
|
459
|
+
if not isinstance(spec, dict):
|
|
460
|
+
raise TypeError("'mix' must be a mapping with a 'lanes' list")
|
|
461
|
+
lane_specs = spec.get("lanes")
|
|
462
|
+
if not isinstance(lane_specs, list) or not lane_specs:
|
|
463
|
+
raise ValueError("'mix.lanes' must be a non-empty list")
|
|
464
|
+
|
|
465
|
+
lanes: list[BenchLane] = []
|
|
466
|
+
for index, lane_spec in enumerate(lane_specs):
|
|
467
|
+
if not isinstance(lane_spec, dict):
|
|
468
|
+
raise TypeError(f"mix.lanes[{index}] must be a mapping")
|
|
469
|
+
name = lane_spec.get("name")
|
|
470
|
+
if not isinstance(name, str) or not name.strip():
|
|
471
|
+
raise ValueError(f"mix.lanes[{index}].name must be a non-empty string")
|
|
472
|
+
if "workload" not in lane_spec:
|
|
473
|
+
raise ValueError(f"mix.lanes[{index}] must define a workload")
|
|
474
|
+
rate = lane_spec.get("rate", lane_spec.get("load"))
|
|
475
|
+
if rate is None:
|
|
476
|
+
raise ValueError(f"mix.lanes[{index}] must define rate (or load)")
|
|
477
|
+
|
|
478
|
+
lane_duration = lane_spec.get("duration", duration_s)
|
|
479
|
+
if isinstance(lane_duration, str):
|
|
480
|
+
lane_duration = parse_duration(lane_duration)
|
|
481
|
+
lane_max_requests = lane_spec.get("max_requests", max_requests)
|
|
482
|
+
lanes.append(BenchLane(
|
|
483
|
+
name=name,
|
|
484
|
+
workload=build_workload(lane_spec["workload"]),
|
|
485
|
+
load=parse_rate_spec(
|
|
486
|
+
rate,
|
|
487
|
+
duration_s=lane_duration,
|
|
488
|
+
max_requests=lane_max_requests,
|
|
489
|
+
),
|
|
490
|
+
))
|
|
491
|
+
return lanes
|
|
492
|
+
|
|
493
|
+
|
|
427
494
|
def _build_recorder(spec: Any) -> Optional[TraceRecorder]:
|
|
428
495
|
if spec is None:
|
|
429
496
|
return None
|
|
@@ -451,4 +518,3 @@ def _build_replay(spec: Any) -> tuple[WorkloadType, Workload, Any]:
|
|
|
451
518
|
TracePacedLoad(trace, speed=speed),
|
|
452
519
|
)
|
|
453
520
|
|
|
454
|
-
|
|
@@ -52,64 +52,22 @@ class MetricsAggregator:
|
|
|
52
52
|
def summary(self) -> dict:
|
|
53
53
|
end = self.end_time or time.monotonic()
|
|
54
54
|
wall_s = max(end - self.start_time, 1e-9)
|
|
55
|
-
|
|
56
|
-
fail = [s for s in self.samples if not s.ok]
|
|
57
|
-
# Split fail into transport failures vs. delivered-but-graded-wrong.
|
|
58
|
-
wrong = [s for s in fail if s.request_ok]
|
|
59
|
-
request_failed = [s for s in fail if not s.request_ok]
|
|
60
|
-
latencies = [s.latency_s for s in ok]
|
|
61
|
-
|
|
62
|
-
status_counts = Counter(s.status for s in self.samples)
|
|
63
|
-
error_counts = Counter(s.error for s in fail if s.error)
|
|
64
|
-
|
|
65
|
-
out: dict = {
|
|
66
|
-
"wall_time_s": wall_s,
|
|
67
|
-
"total_requests": len(self.samples),
|
|
68
|
-
"success": len(ok),
|
|
69
|
-
"failed": len(fail),
|
|
70
|
-
"request_failed": len(request_failed),
|
|
71
|
-
"wrong_output": len(wrong),
|
|
72
|
-
"error_rate": (len(fail) / len(self.samples)) if self.samples else 0.0,
|
|
73
|
-
"request_failure_rate": (
|
|
74
|
-
(len(request_failed) / len(self.samples)) if self.samples else 0.0
|
|
75
|
-
),
|
|
76
|
-
"throughput_rps": len(self.samples) / wall_s,
|
|
77
|
-
"goodput_rps": len(ok) / wall_s,
|
|
78
|
-
"bytes_sent": sum(s.bytes_sent for s in self.samples),
|
|
79
|
-
"bytes_recv": sum(s.bytes_recv for s in self.samples),
|
|
80
|
-
"status_codes": dict(status_counts),
|
|
81
|
-
"errors": dict(error_counts),
|
|
82
|
-
}
|
|
83
|
-
if latencies:
|
|
84
|
-
out["latency_s"] = {
|
|
85
|
-
"mean": statistics.mean(latencies),
|
|
86
|
-
"min": min(latencies),
|
|
87
|
-
"max": max(latencies),
|
|
88
|
-
"p50": _pct(latencies, 50),
|
|
89
|
-
"p90": _pct(latencies, 90),
|
|
90
|
-
"p95": _pct(latencies, 95),
|
|
91
|
-
"p99": _pct(latencies, 99),
|
|
92
|
-
"p999": _pct(latencies, 99.9),
|
|
93
|
-
}
|
|
55
|
+
out = _summary_for_samples(self.samples, wall_s)
|
|
94
56
|
|
|
95
|
-
#
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
"min": min(vals),
|
|
110
|
-
"max": max(vals),
|
|
111
|
-
}
|
|
112
|
-
out["workload_metrics"] = ext_summary
|
|
57
|
+
# A mixed benchmark needs each lane's SLO signal independently. Use
|
|
58
|
+
# the same wall-clock interval as the aggregate so lane throughput is
|
|
59
|
+
# directly comparable to the total, while latency and workload metrics
|
|
60
|
+
# remain scoped to that lane's samples.
|
|
61
|
+
lanes: dict[str, list[Sample]] = defaultdict(list)
|
|
62
|
+
for sample in self.samples:
|
|
63
|
+
lane = sample.meta.get("lane")
|
|
64
|
+
if isinstance(lane, str) and lane:
|
|
65
|
+
lanes[lane].append(sample)
|
|
66
|
+
if lanes:
|
|
67
|
+
out["lanes"] = {
|
|
68
|
+
name: _summary_for_samples(samples, wall_s)
|
|
69
|
+
for name, samples in sorted(lanes.items())
|
|
70
|
+
}
|
|
113
71
|
|
|
114
72
|
# Monitor time-series: summarize each metric per monitor.
|
|
115
73
|
if self.monitor_samples:
|
|
@@ -181,6 +139,22 @@ class MetricsAggregator:
|
|
|
181
139
|
lines.append(f" {k}")
|
|
182
140
|
for kk in ("mean", "p50", "p90", "p99", "max"):
|
|
183
141
|
lines.append(f" {kk:<6}: {v[kk]:.4f}")
|
|
142
|
+
if s.get("lanes"):
|
|
143
|
+
lines.append("")
|
|
144
|
+
lines.append(" lanes")
|
|
145
|
+
for name, lane in s["lanes"].items():
|
|
146
|
+
lines.append(
|
|
147
|
+
f" {name}: {lane['total_requests']} requests, "
|
|
148
|
+
f"{lane['throughput_rps']:.2f} req/s, "
|
|
149
|
+
f"{lane['success']} success"
|
|
150
|
+
)
|
|
151
|
+
for metric in ("ttft_s", "itl_ms_mean", "tokens_per_s"):
|
|
152
|
+
values = lane.get("workload_metrics", {}).get(metric)
|
|
153
|
+
if values:
|
|
154
|
+
lines.append(
|
|
155
|
+
f" {metric}: p50={values['p50']:.4f}, "
|
|
156
|
+
f"p99={values['p99']:.4f}"
|
|
157
|
+
)
|
|
184
158
|
if s.get("monitors"):
|
|
185
159
|
for mon_name, mon in s["monitors"].items():
|
|
186
160
|
lines.append("")
|
|
@@ -223,6 +197,70 @@ class MetricsAggregator:
|
|
|
223
197
|
}) + "\n")
|
|
224
198
|
|
|
225
199
|
|
|
200
|
+
def _summary_for_samples(samples: list[Sample], wall_s: float) -> dict:
|
|
201
|
+
"""Summarize a sample subset over a shared benchmark wall-clock interval."""
|
|
202
|
+
ok = [s for s in samples if s.ok]
|
|
203
|
+
fail = [s for s in samples if not s.ok]
|
|
204
|
+
# Split fail into transport failures vs. delivered-but-graded-wrong.
|
|
205
|
+
wrong = [s for s in fail if s.request_ok]
|
|
206
|
+
request_failed = [s for s in fail if not s.request_ok]
|
|
207
|
+
latencies = [s.latency_s for s in ok]
|
|
208
|
+
|
|
209
|
+
status_counts = Counter(s.status for s in samples)
|
|
210
|
+
error_counts = Counter(s.error for s in fail if s.error)
|
|
211
|
+
|
|
212
|
+
out: dict = {
|
|
213
|
+
"wall_time_s": wall_s,
|
|
214
|
+
"total_requests": len(samples),
|
|
215
|
+
"success": len(ok),
|
|
216
|
+
"failed": len(fail),
|
|
217
|
+
"request_failed": len(request_failed),
|
|
218
|
+
"wrong_output": len(wrong),
|
|
219
|
+
"error_rate": (len(fail) / len(samples)) if samples else 0.0,
|
|
220
|
+
"request_failure_rate": (
|
|
221
|
+
(len(request_failed) / len(samples)) if samples else 0.0
|
|
222
|
+
),
|
|
223
|
+
"throughput_rps": len(samples) / wall_s,
|
|
224
|
+
"goodput_rps": len(ok) / wall_s,
|
|
225
|
+
"bytes_sent": sum(s.bytes_sent for s in samples),
|
|
226
|
+
"bytes_recv": sum(s.bytes_recv for s in samples),
|
|
227
|
+
"status_codes": dict(status_counts),
|
|
228
|
+
"errors": dict(error_counts),
|
|
229
|
+
}
|
|
230
|
+
if latencies:
|
|
231
|
+
out["latency_s"] = {
|
|
232
|
+
"mean": statistics.mean(latencies),
|
|
233
|
+
"min": min(latencies),
|
|
234
|
+
"max": max(latencies),
|
|
235
|
+
"p50": _pct(latencies, 50),
|
|
236
|
+
"p90": _pct(latencies, 90),
|
|
237
|
+
"p95": _pct(latencies, 95),
|
|
238
|
+
"p99": _pct(latencies, 99),
|
|
239
|
+
"p999": _pct(latencies, 99.9),
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
# Aggregate workload-specific `extra` metrics generically: mean + percentiles.
|
|
243
|
+
extras: dict[str, list[float]] = defaultdict(list)
|
|
244
|
+
for s in ok:
|
|
245
|
+
for k, v in s.extra.items():
|
|
246
|
+
if isinstance(v, (int, float)):
|
|
247
|
+
extras[k].append(float(v))
|
|
248
|
+
if extras:
|
|
249
|
+
ext_summary = {}
|
|
250
|
+
for k, vals in extras.items():
|
|
251
|
+
ext_summary[k] = {
|
|
252
|
+
"mean": statistics.mean(vals),
|
|
253
|
+
"p50": _pct(vals, 50),
|
|
254
|
+
"p90": _pct(vals, 90),
|
|
255
|
+
"p99": _pct(vals, 99),
|
|
256
|
+
"min": min(vals),
|
|
257
|
+
"max": max(vals),
|
|
258
|
+
}
|
|
259
|
+
out["workload_metrics"] = ext_summary
|
|
260
|
+
|
|
261
|
+
return out
|
|
262
|
+
|
|
263
|
+
|
|
226
264
|
def _safe_meta(meta: dict) -> dict:
|
|
227
265
|
out = {}
|
|
228
266
|
for k, v in meta.items():
|