benchmaker 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. {benchmaker-0.1.2 → benchmaker-0.1.3}/PKG-INFO +40 -11
  2. {benchmaker-0.1.2 → benchmaker-0.1.3}/README.md +37 -10
  3. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/__init__.py +1 -1
  4. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/config.py +7 -0
  5. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/swebench_replay.py +66 -22
  6. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/trajectory_replay.py +45 -6
  7. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/harbor_eval.py +6 -1
  8. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/pi_agent.py +174 -35
  9. benchmaker-0.1.3/benchmaker/swebench/pi_ext/register_provider.js +65 -0
  10. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/pi_ext/remote_exec.js +6 -2
  11. benchmaker-0.1.3/benchmaker/swebench/pi_ext/remote_exec_all.js +231 -0
  12. benchmaker-0.1.3/benchmaker/swebench/replay_server.py +448 -0
  13. benchmaker-0.1.3/benchmaker/swebench/timeout_load.py +107 -0
  14. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/trajectory.py +148 -3
  15. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/datasets.py +11 -0
  16. benchmaker-0.1.3/benchmaker/workloads/trajectory.py +437 -0
  17. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/PKG-INFO +40 -11
  18. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/SOURCES.txt +9 -0
  19. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/requires.txt +3 -0
  20. {benchmaker-0.1.2 → benchmaker-0.1.3}/pyproject.toml +2 -1
  21. benchmaker-0.1.3/tests/test_backfill_trajectory_status.py +93 -0
  22. benchmaker-0.1.3/tests/test_collect_trajectories.py +441 -0
  23. benchmaker-0.1.3/tests/test_dedupe_trajectories.py +167 -0
  24. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_pi_agent.py +98 -0
  25. benchmaker-0.1.3/tests/test_pi_agent_timeout_injection.py +47 -0
  26. benchmaker-0.1.3/tests/test_replay_server.py +454 -0
  27. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_swebench_replay_recipe.py +23 -3
  28. benchmaker-0.1.3/tests/test_timeout_load.py +177 -0
  29. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_trajectory.py +184 -0
  30. benchmaker-0.1.3/tests/test_trajectory_interleave.py +284 -0
  31. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_trajectory_replay.py +43 -0
  32. benchmaker-0.1.2/benchmaker/swebench/replay_server.py +0 -206
  33. benchmaker-0.1.2/benchmaker/workloads/trajectory.py +0 -209
  34. benchmaker-0.1.2/tests/test_replay_server.py +0 -133
  35. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/cli.py +0 -0
  36. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/__init__.py +0 -0
  37. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/load.py +0 -0
  38. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/metrics.py +0 -0
  39. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/monitors.py +0 -0
  40. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/runner.py +0 -0
  41. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/trace.py +0 -0
  42. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/types.py +0 -0
  43. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/env.py +0 -0
  44. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/io/__init__.py +0 -0
  45. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/io/bundle.py +0 -0
  46. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/io/collect.py +0 -0
  47. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/__init__.py +0 -0
  48. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/_cli_shared.py +0 -0
  49. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/_factory.py +0 -0
  50. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/base.py +0 -0
  51. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/http.py +0 -0
  52. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/llm.py +0 -0
  53. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/sandbox.py +0 -0
  54. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/sglang.py +0 -0
  55. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/swebench.py +0 -0
  56. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/__init__.py +0 -0
  57. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/_flash_hardening.py +0 -0
  58. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/agent.py +0 -0
  59. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/grading.py +0 -0
  60. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/harbor_agent.py +0 -0
  61. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/observability.py +0 -0
  62. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/pi_ext/max_turns.js +0 -0
  63. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/__init__.py +0 -0
  64. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/agent.py +0 -0
  65. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/base.py +0 -0
  66. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/eval.py +0 -0
  67. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/hf.py +0 -0
  68. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/http.py +0 -0
  69. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/llm.py +0 -0
  70. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/sandbox.py +0 -0
  71. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/sglang.py +0 -0
  72. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/dependency_links.txt +0 -0
  73. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/entry_points.txt +0 -0
  74. {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/top_level.txt +0 -0
  75. {benchmaker-0.1.2 → benchmaker-0.1.3}/setup.cfg +0 -0
  76. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_agent.py +0 -0
  77. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_agent_warmup.py +0 -0
  78. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_bundle.py +0 -0
  79. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_coding_agent.py +0 -0
  80. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_eval.py +0 -0
  81. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_flash_hardening.py +0 -0
  82. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_hf.py +0 -0
  83. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_observability.py +0 -0
  84. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_passthrough_meta.py +0 -0
  85. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_recipes_cli.py +0 -0
  86. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_sandbox_duration.py +0 -0
  87. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_sglang.py +0 -0
  88. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_smoke.py +0 -0
  89. {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_trace.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmaker
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
5
5
  Author: Xiaozhe Yao
6
6
  License: MIT
@@ -18,6 +18,8 @@ Requires-Dist: rich>=13; extra == "rich"
18
18
  Provides-Extra: hf
19
19
  Requires-Dist: datasets>=2.18; extra == "hf"
20
20
  Requires-Dist: transformers>=4.40; extra == "hf"
21
+ Provides-Extra: tokenizer
22
+ Requires-Dist: transformers>=4.40; extra == "tokenizer"
21
23
  Provides-Extra: dev
22
24
  Requires-Dist: pytest>=7; extra == "dev"
23
25
  Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
@@ -73,8 +75,8 @@ asyncio.run(main())
73
75
  ```
74
76
 
75
77
  Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
76
- `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`, `sglang`,
77
- `trajectory-replay`):
78
+ `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
79
+ `swebench-replay`, `sglang`, `trajectory-replay`):
78
80
 
79
81
  ```bash
80
82
  benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
@@ -185,16 +187,15 @@ Full docs live in [`docs/`](docs/):
185
187
  - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
186
188
  - [CLI & YAML reference](docs/cli-and-yaml.md)
187
189
  - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
188
- - `benchmaker sglang` — native SGLang `/generate` benchmark (see [`docs/sglang.md`](docs/sglang.md)).
189
- - `benchmaker trajectory-replay` — multi-turn prefix-cache parity replay of
190
- trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
190
+ - [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
191
+ - [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
191
192
 
192
193
  ## Deterministic replay (`swebench-replay`)
193
194
 
194
195
  Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
195
196
  real pi + sandbox + verifier pipeline still runs, only the model is served back
196
197
  from recorded outputs, so re-runs are deterministic and free of model
197
- cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
198
+ cost/variance. Vary `--concurrency` (or `--concurrency-sweep`) to study the rest of the
198
199
  pipeline without the model's stochasticity as a confound. Still needs
199
200
  `FLASH_SANDBOX_URL` (the sandbox + verifier are real).
200
201
 
@@ -207,7 +208,7 @@ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
207
208
  # 2) replay (host mode, localhost) across a concurrency sweep
208
209
  FLASH_SANDBOX_URL=http://localhost:8080 \
209
210
  benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
210
- --mode pi-host --sweep 1,5,25
211
+ --mode pi-host --concurrency-sweep 1,5,25
211
212
 
212
213
  # container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
213
214
  FLASH_SANDBOX_URL=http://localhost:8080 \
@@ -221,6 +222,18 @@ run lacked an instance id) plus the count of assistant messages already in the
221
222
  request — so it is correct at any concurrency. A `MISSES` column in the summary
222
223
  flags any divergence (a request beyond the recorded turns).
223
224
 
225
+ The standalone replay server can also **mock realistic streaming** for
226
+ latency-sensitive benchmarks. Pass a real tokenizer and a per-token delay; the
227
+ first token is emitted immediately (prefill free, TTFT≈0) and each subsequent
228
+ token is spaced by `--inter-token-time` ms. Output stays byte-exact and the
229
+ reported `usage` is the recorded value.
230
+
231
+ ```bash
232
+ pip install 'benchmaker[tokenizer]' # adds transformers for the tokenizer
233
+ python -m benchmaker.swebench.replay_server replay-trajectories.jsonl \
234
+ --tokenizer zai-org/GLM-4.7-Flash --inter-token-time 50
235
+ ```
236
+
224
237
  ## Examples
225
238
 
226
239
  Under [`examples/`](examples/):
@@ -228,9 +241,12 @@ Under [`examples/`](examples/):
228
241
  - `simple_get.py` — minimal library usage
229
242
  - `custom_hooks.py` — request signing + response parsing
230
243
  - `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
244
+ - `llm_from_env.py` — LLM benchmark using `from_env()`
231
245
  - `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
246
+ - `agent_trove.py` — user-defined agent benchmark
232
247
  - `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
233
248
  - `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
249
+ - `bench_sandbox.py` / `bench_sandbox.sh` — sandbox benchmarks
234
250
  - `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
235
251
  - `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
236
252
  - `config.yaml` — generic HTTP YAML config
@@ -254,9 +270,22 @@ benchmaker/ # library code
254
270
  config.py env.py # YAML config loading + .env interpolation
255
271
  core/ # engine: types, load models, runner, metrics, monitors, trace
256
272
  io/ # run output: per-run bundle + cross-run collection
257
- workloads/ # workload-types (http, llm, sandbox, agent, hf, eval)
258
- recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay) + registry
259
- swebench/ # SWE-bench coding agent + grading + harbor adapters
273
+ workloads/
274
+ http.py # HTTP workload-type
275
+ llm.py # OpenAI-compatible chat workload-type
276
+ sandbox.py # Flash Sandbox workload-type
277
+ sglang.py # SGLang native /generate workload-type
278
+ agent.py # user-defined Agent workload-type
279
+ trajectory.py # multi-turn trajectory replay workload
280
+ eval.py # correctness/accuracy evaluation
281
+ hf.py # HuggingFace dataset source
282
+ datasets.py # generic workload/dataset base classes
283
+ base.py # WorkloadType base class
284
+ recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay, sglang, trajectory-replay) + registry
285
+ swebench/
286
+ trajectory.py # convert pi logs to replay trajectories
287
+ replay_server.py # mock-LLM replay server for swebench-replay
288
+ agent.py # SWE-bench coding agent + grading + harbor adapters
260
289
  examples/ # runnable examples (incl. swebench/ coding-agent config)
261
290
  tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
262
291
  tests/ # pytest smoke tests
@@ -45,8 +45,8 @@ asyncio.run(main())
45
45
  ```
46
46
 
47
47
  Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
48
- `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`, `sglang`,
49
- `trajectory-replay`):
48
+ `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
49
+ `swebench-replay`, `sglang`, `trajectory-replay`):
50
50
 
51
51
  ```bash
52
52
  benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
@@ -157,16 +157,15 @@ Full docs live in [`docs/`](docs/):
157
157
  - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
158
158
  - [CLI & YAML reference](docs/cli-and-yaml.md)
159
159
  - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
160
- - `benchmaker sglang` — native SGLang `/generate` benchmark (see [`docs/sglang.md`](docs/sglang.md)).
161
- - `benchmaker trajectory-replay` — multi-turn prefix-cache parity replay of
162
- trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
160
+ - [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
161
+ - [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
163
162
 
164
163
  ## Deterministic replay (`swebench-replay`)
165
164
 
166
165
  Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
167
166
  real pi + sandbox + verifier pipeline still runs, only the model is served back
168
167
  from recorded outputs, so re-runs are deterministic and free of model
169
- cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
168
+ cost/variance. Vary `--concurrency` (or `--concurrency-sweep`) to study the rest of the
170
169
  pipeline without the model's stochasticity as a confound. Still needs
171
170
  `FLASH_SANDBOX_URL` (the sandbox + verifier are real).
172
171
 
@@ -179,7 +178,7 @@ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
179
178
  # 2) replay (host mode, localhost) across a concurrency sweep
180
179
  FLASH_SANDBOX_URL=http://localhost:8080 \
181
180
  benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
182
- --mode pi-host --sweep 1,5,25
181
+ --mode pi-host --concurrency-sweep 1,5,25
183
182
 
184
183
  # container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
185
184
  FLASH_SANDBOX_URL=http://localhost:8080 \
@@ -193,6 +192,18 @@ run lacked an instance id) plus the count of assistant messages already in the
193
192
  request — so it is correct at any concurrency. A `MISSES` column in the summary
194
193
  flags any divergence (a request beyond the recorded turns).
195
194
 
195
+ The standalone replay server can also **mock realistic streaming** for
196
+ latency-sensitive benchmarks. Pass a real tokenizer and a per-token delay; the
197
+ first token is emitted immediately (prefill free, TTFT≈0) and each subsequent
198
+ token is spaced by `--inter-token-time` ms. Output stays byte-exact and the
199
+ reported `usage` is the recorded value.
200
+
201
+ ```bash
202
+ pip install 'benchmaker[tokenizer]' # adds transformers for the tokenizer
203
+ python -m benchmaker.swebench.replay_server replay-trajectories.jsonl \
204
+ --tokenizer zai-org/GLM-4.7-Flash --inter-token-time 50
205
+ ```
206
+
196
207
  ## Examples
197
208
 
198
209
  Under [`examples/`](examples/):
@@ -200,9 +211,12 @@ Under [`examples/`](examples/):
200
211
  - `simple_get.py` — minimal library usage
201
212
  - `custom_hooks.py` — request signing + response parsing
202
213
  - `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
214
+ - `llm_from_env.py` — LLM benchmark using `from_env()`
203
215
  - `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
216
+ - `agent_trove.py` — user-defined agent benchmark
204
217
  - `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
205
218
  - `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
219
+ - `bench_sandbox.py` / `bench_sandbox.sh` — sandbox benchmarks
206
220
  - `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
207
221
  - `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
208
222
  - `config.yaml` — generic HTTP YAML config
@@ -226,9 +240,22 @@ benchmaker/ # library code
226
240
  config.py env.py # YAML config loading + .env interpolation
227
241
  core/ # engine: types, load models, runner, metrics, monitors, trace
228
242
  io/ # run output: per-run bundle + cross-run collection
229
- workloads/ # workload-types (http, llm, sandbox, agent, hf, eval)
230
- recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay) + registry
231
- swebench/ # SWE-bench coding agent + grading + harbor adapters
243
+ workloads/
244
+ http.py # HTTP workload-type
245
+ llm.py # OpenAI-compatible chat workload-type
246
+ sandbox.py # Flash Sandbox workload-type
247
+ sglang.py # SGLang native /generate workload-type
248
+ agent.py # user-defined Agent workload-type
249
+ trajectory.py # multi-turn trajectory replay workload
250
+ eval.py # correctness/accuracy evaluation
251
+ hf.py # HuggingFace dataset source
252
+ datasets.py # generic workload/dataset base classes
253
+ base.py # WorkloadType base class
254
+ recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay, sglang, trajectory-replay) + registry
255
+ swebench/
256
+ trajectory.py # convert pi logs to replay trajectories
257
+ replay_server.py # mock-LLM replay server for swebench-replay
258
+ agent.py # SWE-bench coding agent + grading + harbor adapters
232
259
  examples/ # runnable examples (incl. swebench/ coding-agent config)
233
260
  tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
234
261
  tests/ # pytest smoke tests
@@ -153,4 +153,4 @@ __all__ = [
153
153
  "write_bundle",
154
154
  ]
155
155
 
156
- __version__ = "0.1.1"
156
+ __version__ = "0.1.3"
@@ -407,6 +407,13 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
407
407
  workload_type, extra_post = apply_correctness(workload_type, correctness_spec)
408
408
  post_hooks = list(post_hooks) + list(extra_post)
409
409
 
410
+ # A workload that schedules on per-request completion (e.g. interleaved
411
+ # trajectory replay) declares the post-hook it needs; install it so a YAML
412
+ # config can't silently stall waiting for a signal it never wired up.
413
+ wl_hook = workload.completion_hook()
414
+ if wl_hook is not None and wl_hook not in post_hooks:
415
+ post_hooks = list(post_hooks) + [wl_hook]
416
+
410
417
  recorder = _build_recorder(cfg.get("record"))
411
418
 
412
419
  return BenchConfig(
@@ -5,7 +5,7 @@ Builds a replay store from recorded pi logs (or loads a prebuilt
5
5
  `replay-trajectories.jsonl`), starts the stateless replay server in-process, and
6
6
  runs the *real* harbor SWE-bench pipeline (pi + sandbox + verifier) with the
7
7
  model endpoint pointed at the replay server — at one ``--concurrency`` or a
8
- ``--sweep`` of them. The LLM is the only thing mocked; everything else runs for
8
+ ``--concurrency-sweep`` of them. The LLM is the only thing mocked; everything else runs for
9
9
  real, so re-runs are deterministic and free of model cost/variance.
10
10
 
11
11
  Still requires ``FLASH_SANDBOX_URL`` (the sandbox + verifier are real). For
@@ -54,28 +54,30 @@ def _parse_concurrencies(sweep: Optional[str], concurrency: int) -> list[int]:
54
54
  return [int(x.strip()) for x in sweep.split(",") if x.strip()]
55
55
 
56
56
 
57
- def _resolve_task_filter(task, store) -> tuple[list[str], int]:
57
+ def _resolve_task_filter(task, exclude_task, store) -> tuple[list[str], int]:
58
58
  """Which dataset tasks to run, and how many trajectories can't be targeted.
59
59
 
60
60
  Default to exactly the recorded tasks (each trajectory's instance_id) so
61
61
  harbor replays only what we have trajectories for — otherwise it would run
62
62
  the whole ``--dataset`` and every task without a recording becomes a replay
63
63
  miss. An explicit ``--task`` wins (the user is narrowing on purpose).
64
+ ``--exclude-task`` drops the named id(s) from the resolved set.
64
65
  Returns ``(task_ids, n_missing_instance_id)``."""
65
- explicit = list(task)
66
+ excluded = set(exclude_task)
67
+ explicit = [t for t in task if t not in excluded]
66
68
  if explicit:
67
69
  return explicit, 0
68
- ids = sorted({t.instance_id for t in store.values() if t.instance_id})
70
+ ids = sorted({t.instance_id for t in store.values()
71
+ if t.instance_id and t.instance_id not in excluded})
69
72
  missing = sum(1 for t in store.values() if not t.instance_id)
70
73
  return ids, missing
71
74
 
72
-
73
75
  class SWEBenchReplayRecipe(Recipe):
74
76
  name = "swebench-replay"
75
77
  help = (
76
78
  "Replay recorded SWE-bench trajectories deterministically: mock the LLM "
77
79
  "with recorded outputs, run the real pi+sandbox+verifier pipeline at one "
78
- "--concurrency or a --sweep. Requires FLASH_SANDBOX_URL."
80
+ "--concurrency or a --concurrency-sweep. Requires FLASH_SANDBOX_URL."
79
81
  )
80
82
  wants_load_options = False
81
83
 
@@ -88,12 +90,20 @@ class SWEBenchReplayRecipe(Recipe):
88
90
  help="Prebuilt replay-trajectories.jsonl (instead of --job)."),
89
91
  click.option("--concurrency", type=int, default=4, show_default=True,
90
92
  help="Concurrent trials (harbor n_concurrent_trials)."),
91
- click.option("--sweep", default=None,
93
+ click.option("--concurrency-sweep", "concurrency_sweep", default=None,
92
94
  help="Comma list of concurrencies to run in sequence, "
93
95
  "e.g. '1,5,25' (overrides --concurrency)."),
94
96
  click.option("--mode", type=click.Choice(["pi-host", "pi-container"]),
95
97
  default="pi-host", show_default=True,
96
98
  help="pi run mode (the harbor agent key)."),
99
+ click.option("--route-tools", "route_tools",
100
+ type=click.Choice(["all", "bash"]),
101
+ default="all", show_default=True,
102
+ help="pi-host: which tools to route into the sandbox. "
103
+ "'all' routes bash+read+write+edit (matches how "
104
+ "trajectories are recorded); 'bash' routes only bash "
105
+ "(file edits hit the host fs and are lost on replay). "
106
+ "Ignored for pi-container (pi runs in the sandbox)."),
97
107
  click.option("--host", default="127.0.0.1", show_default=True,
98
108
  help="Replay server bind host (use 0.0.0.0 for container mode)."),
99
109
  click.option("--port", type=int, default=9100, show_default=True,
@@ -107,11 +117,21 @@ class SWEBenchReplayRecipe(Recipe):
107
117
  "trajectory's model."),
108
118
  click.option("--dataset", default="swebench-verified", show_default=True,
109
119
  help="Harbor dataset slug."),
120
+ click.option("--exec-timeout-sec", "exec_timeout_sec", type=float,
121
+ default=None,
122
+ help="pi-host: real per-command timeout (seconds) passed "
123
+ "to environment.exec for every routed tool call "
124
+ "(default 600). Lower it to surface real sandbox "
125
+ "slowness/hangs under load. Ignored for pi-container "
126
+ "(pi runs as one process with no per-command timeout)."),
110
127
  click.option("--n-tasks", "n_tasks", type=int, default=None,
111
128
  help="Cap the number of recorded tasks to replay "
112
129
  "(applied on top of the recorded-task filter)."),
113
130
  click.option("--task", multiple=True,
114
131
  help="Restrict to specific task name(s)/glob(s). Repeatable."),
132
+ click.option("--exclude-task", "exclude_task", multiple=True,
133
+ help="Drop specific task id(s) from the replay set. "
134
+ "Repeatable."),
115
135
  click.option("--n-attempts", "n_attempts", type=int, default=1,
116
136
  show_default=True, help="Attempts per task."),
117
137
  click.option("--timeout-multiplier", "timeout_multiplier", type=float,
@@ -129,15 +149,22 @@ class SWEBenchReplayRecipe(Recipe):
129
149
  click.option("--timeline/--no-timeline", "timeline", default=True,
130
150
  show_default=True,
131
151
  help="Capture timeline/utilization/tokens into the job dir."),
152
+ click.option("--validate-observations/--no-validate-observations",
153
+ "validate_observations", default=False, show_default=True,
154
+ help="Fail-fast on environment divergence: compare each "
155
+ "step's tool-result status against the recording and "
156
+ "stop the agent at the first mismatch. Requires a "
157
+ "trajectory store recorded with tool_results."),
132
158
  click.option("--utilization-interval-sec", "utilization_interval_sec",
133
159
  type=float, default=5.0, show_default=True),
134
160
  ]
135
161
 
136
- def run(self, shared: SharedOpts, *, job, trajectories, concurrency, sweep, mode,
137
- host, port, reachable_host, model, dataset, n_tasks, task, n_attempts,
162
+ def run(self, shared: SharedOpts, *, job, trajectories, concurrency,
163
+ concurrency_sweep, mode, route_tools, host, port, reachable_host, model,
164
+ dataset, exec_timeout_sec, n_tasks, task, exclude_task, n_attempts,
138
165
  timeout_multiplier, backend_type, request_timeout_sec,
139
166
  agent_ready_timeout_sec, jobs_dir, timeline,
140
- utilization_interval_sec) -> Optional[int]:
167
+ utilization_interval_sec, validate_observations) -> Optional[int]:
141
168
  from benchmaker.swebench import harbor_eval as he
142
169
  from benchmaker.swebench import trajectory as T
143
170
 
@@ -180,7 +207,7 @@ class SWEBenchReplayRecipe(Recipe):
180
207
  raise click.UsageError("--model required (no model recorded in trajectories).")
181
208
 
182
209
  # Run exactly the recorded tasks, not the whole dataset (see helper).
183
- task_filter, n_missing = _resolve_task_filter(task, store)
210
+ task_filter, n_missing = _resolve_task_filter(task, exclude_task, store)
184
211
  if n_missing:
185
212
  click.echo(f"warning: {n_missing} trajectories have no instance_id "
186
213
  f"and cannot be targeted; they will be skipped.")
@@ -190,18 +217,34 @@ class SWEBenchReplayRecipe(Recipe):
190
217
  "cannot select which tasks to replay.")
191
218
 
192
219
  replay_url = _replay_url(host, port, reachable_host)
193
- concurrencies = _parse_concurrencies(sweep, concurrency)
220
+ concurrencies = _parse_concurrencies(concurrency_sweep, concurrency)
194
221
  click.echo(f"replay: {len(store)} trajectories, {len(task_filter)} tasks, "
195
222
  f"model={run_model}, agent={mode}, url={replay_url}, "
196
223
  f"concurrencies={concurrencies}")
197
224
 
225
+ # pi-host edits the sandbox over a bridge; the file tools (read/write/edit)
226
+ # only land in the sandbox when routed (route_tools=all), which is how the
227
+ # trajectories were recorded. With the agent default (bash-only) those
228
+ # recorded edits replay against the host fs and silently no-op. pi-container
229
+ # runs pi inside the sandbox, so the kwarg does not apply.
230
+ agent_kwargs = [f"route_tools={route_tools}"] if mode == "pi-host" else []
231
+ # Real per-command sandbox timeout. Only pi-host routes each tool call
232
+ # through environment.exec(timeout_sec=...); pi-container runs as one
233
+ # process with no per-command budget, so the flag is a no-op there.
234
+ if exec_timeout_sec is not None:
235
+ if mode == "pi-host":
236
+ agent_kwargs.append(f"exec_timeout_s={exec_timeout_sec}")
237
+ else:
238
+ click.echo("warning: --exec-timeout-sec is ignored for "
239
+ "pi-container (no per-command timeout).")
240
+
198
241
  # Static harbor config shared by every sweep iteration; only `concurrency`
199
242
  # and `job_name` vary per run (set inside `_run_one`).
200
243
  base_ns = argparse.Namespace(
201
244
  dataset=dataset, agent=mode, model=run_model,
202
245
  api_key="replay",
203
- agent_kwarg=[], agent_config_file=None,
204
- n_tasks=n_tasks, task=task_filter,
246
+ agent_kwarg=agent_kwargs, agent_config_file=None,
247
+ n_tasks=n_tasks, task=task_filter, exclude_task=None,
205
248
  n_attempts=n_attempts, timeout_multiplier=timeout_multiplier,
206
249
  force_build=False, backend_type=backend_type,
207
250
  request_timeout_sec=request_timeout_sec,
@@ -214,20 +257,21 @@ class SWEBenchReplayRecipe(Recipe):
214
257
  for c in concurrencies:
215
258
  results.append(asyncio.run(self._run_one(
216
259
  store, base_ns, c, run_model, host, port, reachable_host,
217
- timeline, utilization_interval_sec)))
260
+ timeline, utilization_interval_sec, validate_observations)))
218
261
  finally:
219
262
  if tmpdir is not None:
220
263
  tmpdir.cleanup()
221
264
 
222
265
  # Comparison table.
223
- click.echo("\nCONCURRENCY ACCURACY PASS/TOTAL MISSES JOB_DIR")
224
- for c, accuracy, n_pass, n_total, misses, job_dir in results:
266
+ click.echo("\nCONCURRENCY ACCURACY PASS/TOTAL MISSES DIVERG JOB_DIR")
267
+ for c, accuracy, n_pass, n_total, misses, diverg, job_dir in results:
225
268
  click.echo(f"{c:>11} {accuracy:>7.1%} {n_pass:>4}/{n_total:<5} "
226
- f"{misses:>6} {job_dir}")
269
+ f"{misses:>6} {diverg:>6} {job_dir}")
227
270
  return None
228
271
 
229
272
  async def _run_one(self, store, base_ns, concurrency, run_model, host, port,
230
- reachable_host, timeline, utilization_interval_sec) -> tuple:
273
+ reachable_host, timeline, utilization_interval_sec,
274
+ validate_observations) -> tuple:
231
275
  """Serve `store` on host:port and run one harbor job at `concurrency`.
232
276
 
233
277
  Binds a fresh listener per call (pass --port 0 for an ephemeral port,
@@ -240,9 +284,9 @@ class SWEBenchReplayRecipe(Recipe):
240
284
 
241
285
  from benchmaker.swebench import harbor_eval as he
242
286
  from benchmaker.swebench.observability import run_job_with_observability
243
- from benchmaker.swebench.replay_server import as_app, get_misses
287
+ from benchmaker.swebench.replay_server import as_app, get_divergences, get_misses
244
288
 
245
- app = as_app(store, model_fallback=run_model)
289
+ app = as_app(store, model_fallback=run_model, validate=validate_observations)
246
290
  runner = web.AppRunner(app)
247
291
  await runner.setup()
248
292
  site = web.TCPSite(runner, host, port)
@@ -261,7 +305,7 @@ class SWEBenchReplayRecipe(Recipe):
261
305
  rows, accuracy = he._summarise(job_result)
262
306
  n_pass = sum(1 for r in rows if r["passed"])
263
307
  return (concurrency, accuracy, n_pass, len(rows), get_misses(app),
264
- str(job.job_dir))
308
+ get_divergences(app), str(job.job_dir))
265
309
  finally:
266
310
  await runner.cleanup()
267
311
 
@@ -3,7 +3,21 @@
3
3
  Expands each trajectory into one chat request per assistant turn (growing shared
4
4
  prefix) against an OpenAI-compatible endpoint, recording the prefix-cache parity
5
5
  pair: meta.expected_prefix_tokens (tokenizer upper bound) vs extra.cached_tokens
6
- (server actual). Use `--rate closed:N` for clean prefix-cache locality.
6
+ (server actual).
7
+
8
+ Two scheduling regimes:
9
+
10
+ * **Contiguous (default)** — all of trajectory A's turns, then all of B's. Turn
11
+ k+1 is served within a few requests of turn k, so its history is reused while
12
+ still hot in the local cache. Use ``--rate closed:N`` for clean prefix-cache
13
+ locality (best case: locality preserved).
14
+ * **Interleaved** (``--concurrent-sessions N``) — keep up to N sessions active
15
+ and round-robin their turns, gating each session's turn k+1 on turn k's
16
+ completion (+ an optional ``--inter-turn-gap`` think time). Concurrent session
17
+ histories overflow the device KV pool, so a session's history is evicted
18
+ before its next turn — the multi-turn *reuse-after-eviction* regime that
19
+ stresses hierarchical / shared KV tiers. The in-flight ceiling defaults to
20
+ ``closed:N`` to match the active session count.
7
21
  """
8
22
 
9
23
  from __future__ import annotations
@@ -67,12 +81,24 @@ class TrajectoryReplayRecipe(Recipe):
67
81
  help="Cap assistant turns replayed per trajectory."),
68
82
  click.option("--max-trajectories", "max_trajectories", type=int,
69
83
  default=None, help="Cap number of trajectories replayed."),
84
+ click.option("--concurrent-sessions", "concurrent_sessions", type=int,
85
+ default=None,
86
+ help="Interleave turns across up to N concurrent "
87
+ "sessions (round-robin, each session's turn k+1 "
88
+ "gated on turn k completing) instead of replaying "
89
+ "each trajectory contiguously. Enables the "
90
+ "reuse-after-eviction regime; defaults the rate to "
91
+ "closed:N."),
92
+ click.option("--inter-turn-gap", "inter_turn_gap", default=None,
93
+ help="Per-session think time between consecutive turns "
94
+ "(interleaved mode). E.g. 'const:2s', 'exp:1.5', "
95
+ "'uniform:1s..3s'. Default: no gap."),
70
96
  ]
71
97
 
72
98
  def build(self, shared: SharedOpts, *, url, model, api_key, header, dataset,
73
99
  prompts_jsonl, split, preset, tokenizer, messages_field, id_field,
74
- model_field, max_tokens, max_turns_per_trajectory, max_trajectories
75
- ) -> BuildResult:
100
+ model_field, max_tokens, max_turns_per_trajectory, max_trajectories,
101
+ concurrent_sessions=None, inter_turn_gap=None) -> BuildResult:
76
102
  from benchmaker.workloads.llm import OpenAIChatWorkloadType
77
103
  from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
78
104
 
@@ -99,7 +125,8 @@ class TrajectoryReplayRecipe(Recipe):
99
125
  messages_field=messages_field, id_field=id_field,
100
126
  model_field=model_field, max_tokens=max_tokens,
101
127
  max_turns_per_trajectory=max_turns_per_trajectory,
102
- max_trajectories=max_trajectories, tokenizer=tokenizer)
128
+ max_trajectories=max_trajectories, tokenizer=tokenizer,
129
+ concurrent_sessions=concurrent_sessions, inter_turn_gap=inter_turn_gap)
103
130
 
104
131
  source_config = {
105
132
  "workload_type": {"type": "openai-chat", "url": wt._url,
@@ -111,14 +138,26 @@ class TrajectoryReplayRecipe(Recipe):
111
138
  "model_field": model_field, "tokenizer": tokenizer,
112
139
  "max_tokens": max_tokens,
113
140
  "max_trajectories": max_trajectories,
114
- "max_turns_per_trajectory": max_turns_per_trajectory},
141
+ "max_turns_per_trajectory": max_turns_per_trajectory,
142
+ "concurrent_sessions": concurrent_sessions,
143
+ "inter_turn_gap": inter_turn_gap},
115
144
  }
145
+
146
+ # Interleaved mode needs a per-turn completion signal to gate each
147
+ # session's next turn; wire the workload's post-hook and default the
148
+ # in-flight ceiling to the active session count.
149
+ hook = workload.completion_hook()
150
+ post_hooks: list = [hook] if hook is not None else []
151
+ default_rate = ("closed:8" if concurrent_sessions is None
152
+ else f"closed:{concurrent_sessions}")
153
+
116
154
  # Finite dataset: replay once. The workload raises StopAsyncIteration when
117
155
  # exhausted, which halts the run; default to closed-loop with a long
118
156
  # nominal duration so exhaustion (not the clock) ends it.
119
157
  return BuildResult(
120
158
  workload_type=wt, workload=workload, source_config=source_config,
121
- default_rate="closed:8", default_duration="24h")
159
+ post_hooks=post_hooks,
160
+ default_rate=default_rate, default_duration="24h")
122
161
 
123
162
 
124
163
  register(TrajectoryReplayRecipe())
@@ -209,7 +209,8 @@ def _build_job_config(args: argparse.Namespace) -> JobConfig:
209
209
  )
210
210
 
211
211
  dataset = DatasetConfig(name=args.dataset, n_tasks=args.n_tasks,
212
- task_names=args.task or None)
212
+ task_names=args.task or None,
213
+ exclude_task_names=args.exclude_task or None)
213
214
 
214
215
  # Parent directory for the run bundle (harbor writes to <jobs_dir>/<job_name>).
215
216
  # Omit when unset so harbor keeps its own default of "jobs".
@@ -301,6 +302,10 @@ def _parse_args() -> argparse.Namespace:
301
302
  help="Cap the number of dataset tasks.")
302
303
  p.add_argument("--task", action="append", default=[],
303
304
  help="Restrict to specific task name(s)/glob(s) (repeatable).")
305
+ p.add_argument("--exclude-task", action="append", default=[],
306
+ help="Skip specific task name(s)/glob(s) (repeatable). Applied "
307
+ "after --task and before the --n-tasks cap, so the cap "
308
+ "selects the first N tasks that remain after exclusion.")
304
309
  p.add_argument("--concurrency", type=int, default=4)
305
310
  p.add_argument("--n-attempts", type=int, default=1)
306
311
  p.add_argument("--timeout-multiplier", type=float, default=4.0,