benchmaker 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {benchmaker-0.1.0/benchmaker.egg-info → benchmaker-0.1.2}/PKG-INFO +74 -18
  2. benchmaker-0.1.0/PKG-INFO → benchmaker-0.1.2/README.md +63 -35
  3. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/__init__.py +11 -7
  4. benchmaker-0.1.2/benchmaker/cli.py +215 -0
  5. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/config.py +10 -4
  6. benchmaker-0.1.2/benchmaker/core/__init__.py +2 -0
  7. {benchmaker-0.1.0/benchmaker → benchmaker-0.1.2/benchmaker/core}/metrics.py +1 -1
  8. {benchmaker-0.1.0/benchmaker → benchmaker-0.1.2/benchmaker/core}/runner.py +8 -8
  9. {benchmaker-0.1.0/benchmaker → benchmaker-0.1.2/benchmaker/core}/trace.py +2 -2
  10. benchmaker-0.1.2/benchmaker/io/__init__.py +1 -0
  11. {benchmaker-0.1.0/benchmaker → benchmaker-0.1.2/benchmaker/io}/bundle.py +2 -2
  12. {benchmaker-0.1.0/benchmaker → benchmaker-0.1.2/benchmaker/io}/collect.py +1 -1
  13. benchmaker-0.1.2/benchmaker/recipes/__init__.py +46 -0
  14. benchmaker-0.1.2/benchmaker/recipes/_cli_shared.py +60 -0
  15. benchmaker-0.1.2/benchmaker/recipes/_factory.py +123 -0
  16. benchmaker-0.1.2/benchmaker/recipes/base.py +165 -0
  17. benchmaker-0.1.2/benchmaker/recipes/http.py +73 -0
  18. benchmaker-0.1.2/benchmaker/recipes/llm.py +144 -0
  19. benchmaker-0.1.2/benchmaker/recipes/sandbox.py +155 -0
  20. benchmaker-0.1.2/benchmaker/recipes/sglang.py +109 -0
  21. benchmaker-0.1.2/benchmaker/recipes/swebench.py +313 -0
  22. benchmaker-0.1.2/benchmaker/recipes/swebench_replay.py +269 -0
  23. benchmaker-0.1.2/benchmaker/recipes/trajectory_replay.py +124 -0
  24. benchmaker-0.1.2/benchmaker/swebench/__init__.py +43 -0
  25. benchmaker-0.1.2/benchmaker/swebench/_flash_hardening.py +175 -0
  26. benchmaker-0.1.2/benchmaker/swebench/agent.py +543 -0
  27. benchmaker-0.1.2/benchmaker/swebench/grading.py +169 -0
  28. benchmaker-0.1.2/benchmaker/swebench/harbor_agent.py +266 -0
  29. benchmaker-0.1.2/benchmaker/swebench/harbor_eval.py +375 -0
  30. benchmaker-0.1.2/benchmaker/swebench/observability.py +621 -0
  31. benchmaker-0.1.2/benchmaker/swebench/pi_agent.py +564 -0
  32. benchmaker-0.1.2/benchmaker/swebench/pi_ext/max_turns.js +52 -0
  33. benchmaker-0.1.2/benchmaker/swebench/pi_ext/remote_exec.js +47 -0
  34. benchmaker-0.1.2/benchmaker/swebench/replay_server.py +206 -0
  35. benchmaker-0.1.2/benchmaker/swebench/trajectory.py +289 -0
  36. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/workloads/agent.py +2 -2
  37. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/workloads/base.py +1 -1
  38. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/workloads/eval.py +2 -2
  39. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/workloads/http.py +1 -1
  40. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/workloads/llm.py +69 -6
  41. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/workloads/sandbox.py +319 -47
  42. benchmaker-0.1.2/benchmaker/workloads/sglang.py +221 -0
  43. benchmaker-0.1.2/benchmaker/workloads/trajectory.py +209 -0
  44. benchmaker-0.1.0/README.md → benchmaker-0.1.2/benchmaker.egg-info/PKG-INFO +91 -16
  45. benchmaker-0.1.2/benchmaker.egg-info/SOURCES.txt +75 -0
  46. benchmaker-0.1.2/benchmaker.egg-info/requires.txt +23 -0
  47. benchmaker-0.1.2/pyproject.toml +50 -0
  48. benchmaker-0.1.2/tests/test_agent_warmup.py +147 -0
  49. {benchmaker-0.1.0 → benchmaker-0.1.2}/tests/test_bundle.py +2 -2
  50. {benchmaker-0.1.0 → benchmaker-0.1.2}/tests/test_coding_agent.py +2 -2
  51. benchmaker-0.1.2/tests/test_flash_hardening.py +89 -0
  52. benchmaker-0.1.2/tests/test_observability.py +502 -0
  53. benchmaker-0.1.2/tests/test_passthrough_meta.py +161 -0
  54. benchmaker-0.1.2/tests/test_pi_agent.py +162 -0
  55. benchmaker-0.1.2/tests/test_recipes_cli.py +313 -0
  56. benchmaker-0.1.2/tests/test_replay_server.py +133 -0
  57. benchmaker-0.1.2/tests/test_sandbox_duration.py +40 -0
  58. benchmaker-0.1.2/tests/test_sglang.py +152 -0
  59. {benchmaker-0.1.0 → benchmaker-0.1.2}/tests/test_smoke.py +49 -2
  60. benchmaker-0.1.2/tests/test_swebench_replay_recipe.py +91 -0
  61. {benchmaker-0.1.0 → benchmaker-0.1.2}/tests/test_trace.py +2 -2
  62. benchmaker-0.1.2/tests/test_trajectory.py +223 -0
  63. benchmaker-0.1.2/tests/test_trajectory_replay.py +300 -0
  64. benchmaker-0.1.0/benchmaker/cli.py +0 -382
  65. benchmaker-0.1.0/benchmaker.egg-info/SOURCES.txt +0 -36
  66. benchmaker-0.1.0/benchmaker.egg-info/requires.txt +0 -13
  67. benchmaker-0.1.0/pyproject.toml +0 -32
  68. {benchmaker-0.1.0/benchmaker → benchmaker-0.1.2/benchmaker/core}/load.py +0 -0
  69. {benchmaker-0.1.0/benchmaker → benchmaker-0.1.2/benchmaker/core}/monitors.py +0 -0
  70. {benchmaker-0.1.0/benchmaker → benchmaker-0.1.2/benchmaker/core}/types.py +0 -0
  71. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/env.py +0 -0
  72. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/workloads/__init__.py +0 -0
  73. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/workloads/datasets.py +0 -0
  74. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/workloads/hf.py +0 -0
  75. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker.egg-info/dependency_links.txt +0 -0
  76. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker.egg-info/entry_points.txt +0 -0
  77. {benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker.egg-info/top_level.txt +0 -0
  78. {benchmaker-0.1.0 → benchmaker-0.1.2}/setup.cfg +0 -0
  79. {benchmaker-0.1.0 → benchmaker-0.1.2}/tests/test_agent.py +0 -0
  80. {benchmaker-0.1.0 → benchmaker-0.1.2}/tests/test_eval.py +0 -0
  81. {benchmaker-0.1.0 → benchmaker-0.1.2}/tests/test_hf.py +0 -0
@@ -1,23 +1,32 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmaker
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
5
5
  Author: Xiaozhe Yao
6
6
  License: MIT
7
- Requires-Python: >=3.10
7
+ Requires-Python: >=3.12
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: aiohttp>=3.9
10
10
  Requires-Dist: click>=8.1
11
+ Requires-Dist: datasets>=4.8.5
12
+ Requires-Dist: huggingface-hub>=1.16.4
13
+ Requires-Dist: pyarrow>=24.0.0
11
14
  Requires-Dist: pyyaml>=6.0
15
+ Requires-Dist: swebench>=4.1.0
12
16
  Provides-Extra: rich
13
17
  Requires-Dist: rich>=13; extra == "rich"
14
18
  Provides-Extra: hf
15
19
  Requires-Dist: datasets>=2.18; extra == "hf"
20
+ Requires-Dist: transformers>=4.40; extra == "hf"
16
21
  Provides-Extra: dev
17
22
  Requires-Dist: pytest>=7; extra == "dev"
18
23
  Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
24
+ Provides-Extra: plot
25
+ Requires-Dist: ipykernel>=7.2.0; extra == "plot"
26
+ Requires-Dist: matplotlib>=3.10.9; extra == "plot"
27
+ Requires-Dist: seaborn>=0.13.2; extra == "plot"
19
28
 
20
- # bench-maker
29
+ # benchmaker
21
30
 
22
31
  Async HTTP benchmarking with pluggable workload-types (protocols), workloads
23
32
  (datasets), load models, hooks, and optional periodic monitors.
@@ -44,7 +53,7 @@ pip install -e .
44
53
  pip install -e .[dev] # for tests
45
54
  ```
46
55
 
47
- This installs the `benchmaker` Python package and the `bench-maker` CLI.
56
+ This installs the `benchmaker` Python package and the `benchmaker` CLI.
48
57
 
49
58
  ## 30-second tour
50
59
 
@@ -63,10 +72,12 @@ async def main():
63
72
  asyncio.run(main())
64
73
  ```
65
74
 
66
- Or via the CLI:
75
+ Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
76
+ `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`, `sglang`,
77
+ `swebench-replay`, `trajectory-replay`):
67
78
 
68
79
  ```bash
69
- bench-maker quick --url https://httpbin.org/get --rate poisson:50 --duration 10s
80
+ benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
70
81
  ```
71
82
 
72
83
  ## Walkthrough: benchmarking an LLM endpoint with ShareGPT
@@ -129,16 +140,16 @@ also come from `.env` via `OpenAIChatWorkloadType.from_env(...)`.
129
140
 
130
141
  ### Rebuild or customize it yourself
131
142
 
132
- The published split is produced by `tools/prepare_sharegpt.py`, which downloads
143
+ The published split is produced by `tools/sharegpt/prepare.py`, which downloads
133
144
  the upstream JSON once into `.local/` (gitignored) and converts it to the JSONL
134
145
  shape above. Run it when you want a subset, different filtering, or a refresh:
135
146
 
136
147
  ```bash
137
148
  # Defaults: .local/sharegpt_v3_raw.json -> .local/sharegpt_v3.jsonl
138
- python tools/prepare_sharegpt.py
149
+ python tools/sharegpt/prepare.py
139
150
 
140
151
  # A quick subset for smoke tests:
141
- python tools/prepare_sharegpt.py --max-items 2000
152
+ python tools/sharegpt/prepare.py --max-items 2000
142
153
  ```
143
154
 
144
155
  The raw download is ~700 MB. Use `--min-chars` / `--max-chars` to drop empty or
@@ -147,7 +158,7 @@ row). Point any workload at the local file with `JsonlWorkload(path=...,
147
158
  field="messages")`, or on the CLI:
148
159
 
149
160
  ```bash
150
- bench-maker llm \
161
+ benchmaker llm \
151
162
  --url http://localhost:8000/v1/chat/completions \
152
163
  --model meta-llama/Llama-3.1-8B-Instruct \
153
164
  --prompts-jsonl .local/sharegpt_v3.jsonl \
@@ -157,7 +168,7 @@ bench-maker llm \
157
168
  --out-dir ./runs --label dataset=sharegpt
158
169
  ```
159
170
 
160
- To re-publish after regenerating, `tools/upload_sharegpt_hf.py` pushes the
171
+ To re-publish after regenerating, `tools/sharegpt/upload_hf.py` pushes the
161
172
  JSONL back to the Hub (needs a write token).
162
173
 
163
174
  ## Documentation
@@ -174,6 +185,41 @@ Full docs live in [`docs/`](docs/):
174
185
  - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
175
186
  - [CLI & YAML reference](docs/cli-and-yaml.md)
176
187
  - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
188
+ - `benchmaker sglang` — native SGLang `/generate` benchmark (see [`docs/sglang.md`](docs/sglang.md)).
189
+ - `benchmaker trajectory-replay` — multi-turn prefix-cache parity replay of
190
+ trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
191
+
192
+ ## Deterministic replay (`swebench-replay`)
193
+
194
+ Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
195
+ real pi + sandbox + verifier pipeline still runs, only the model is served back
196
+ from recorded outputs, so re-runs are deterministic and free of model
197
+ cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
198
+ pipeline without the model's stochasticity as a confound. Still needs
199
+ `FLASH_SANDBOX_URL` (the sandbox + verifier are real).
200
+
201
+ ```bash
202
+ # 1) (optional) convert a job's pi logs to a replay store — the recipe can also
203
+ # do this inline via --job.
204
+ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
205
+ -o replay-trajectories.jsonl
206
+
207
+ # 2) replay (host mode, localhost) across a concurrency sweep
208
+ FLASH_SANDBOX_URL=http://localhost:8080 \
209
+ benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
210
+ --mode pi-host --sweep 1,5,25
211
+
212
+ # container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
213
+ FLASH_SANDBOX_URL=http://localhost:8080 \
214
+ benchmaker swebench-replay --job jobs/2026-06-08__05-24-01_b352cb \
215
+ --mode pi-container --host 0.0.0.0 --reachable-host "$(hostname -I | awk '{print $1}')"
216
+ ```
217
+
218
+ The replay server is stateless: it picks each response by the task's identity
219
+ (the `# Task:` line, falling back to a hash of the full prompt when the recorded
220
+ run lacked an instance id) plus the count of assistant messages already in the
221
+ request — so it is correct at any concurrency. A `MISSES` column in the summary
222
+ flags any divergence (a request beyond the recorded turns).
177
223
 
178
224
  ## Examples
179
225
 
@@ -190,19 +236,29 @@ Under [`examples/`](examples/):
190
236
  - `config.yaml` — generic HTTP YAML config
191
237
  - `config_llm.yaml` — LLM YAML config with a Prometheus monitor
192
238
 
193
- Helper scripts under [`tools/`](tools/):
239
+ Helper tooling under [`tools/`](tools/), grouped by purpose:
194
240
 
195
- - `prepare_sharegpt.py` fetch ShareGPT V3 and convert to a generic JSONL
196
- - `upload_sharegpt_hf.py` — push the converted JSONL to the HF Hub (write token)
197
- - `start_local_llm.sh` example local SGLang launch command
241
+ - `sharegpt/` — `prepare.py` (fetch ShareGPT V3 JSONL) + `upload_hf.py`
242
+ (push to the HF Hub with a write token)
243
+ - `swe_images/` — mirror SWE-bench/R2E-Gym container images to ghcr
244
+ (`publish.py`) and list the published refs (`pull.py`)
245
+ - `agent_warmup/` — build the agent-warmup SFT dataset
246
+ (`python -m tools.agent_warmup.cli`)
247
+ - `start_local_llm.sh` — example local SGLang launch command
198
248
 
199
249
  ## Project layout
200
250
 
201
251
  ```
202
252
  benchmaker/ # library code
203
- entrypoints/ # CLI (bench-maker)
204
- examples/ # runnable examples
205
- tools/ # one-off helper scripts (dataset prep, etc.)
253
+ __init__.py # public API (re-exports); cli.py — the `benchmaker` CLI
254
+ config.py env.py # YAML config loading + .env interpolation
255
+ core/ # engine: types, load models, runner, metrics, monitors, trace
256
+ io/ # run output: per-run bundle + cross-run collection
257
+ workloads/ # workload-types (http, llm, sglang, sandbox, trajectory, agent, hf, eval)
258
+ recipes/ # CLI recipes (http, llm, sandbox, sglang, swebench, swebench-replay, trajectory-replay) + registry
259
+ swebench/ # SWE-bench coding agent + grading + harbor adapters
260
+ examples/ # runnable examples (incl. swebench/ coding-agent config)
261
+ tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
206
262
  tests/ # pytest smoke tests
207
263
  docs/ # reference docs
208
264
  ```
@@ -1,23 +1,4 @@
1
- Metadata-Version: 2.4
2
- Name: benchmaker
3
- Version: 0.1.0
4
- Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
5
- Author: Xiaozhe Yao
6
- License: MIT
7
- Requires-Python: >=3.10
8
- Description-Content-Type: text/markdown
9
- Requires-Dist: aiohttp>=3.9
10
- Requires-Dist: click>=8.1
11
- Requires-Dist: pyyaml>=6.0
12
- Provides-Extra: rich
13
- Requires-Dist: rich>=13; extra == "rich"
14
- Provides-Extra: hf
15
- Requires-Dist: datasets>=2.18; extra == "hf"
16
- Provides-Extra: dev
17
- Requires-Dist: pytest>=7; extra == "dev"
18
- Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
19
-
20
- # bench-maker
1
+ # benchmaker
21
2
 
22
3
  Async HTTP benchmarking with pluggable workload-types (protocols), workloads
23
4
  (datasets), load models, hooks, and optional periodic monitors.
@@ -44,7 +25,7 @@ pip install -e .
44
25
  pip install -e .[dev] # for tests
45
26
  ```
46
27
 
47
- This installs the `benchmaker` Python package and the `bench-maker` CLI.
28
+ This installs the `benchmaker` Python package and the `benchmaker` CLI.
48
29
 
49
30
  ## 30-second tour
50
31
 
@@ -63,10 +44,12 @@ async def main():
63
44
  asyncio.run(main())
64
45
  ```
65
46
 
66
- Or via the CLI:
47
+ Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
48
+ `benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`, `sglang`,
49
+ `swebench-replay`, `trajectory-replay`):
67
50
 
68
51
  ```bash
69
- bench-maker quick --url https://httpbin.org/get --rate poisson:50 --duration 10s
52
+ benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
70
53
  ```
71
54
 
72
55
  ## Walkthrough: benchmarking an LLM endpoint with ShareGPT
@@ -129,16 +112,16 @@ also come from `.env` via `OpenAIChatWorkloadType.from_env(...)`.
129
112
 
130
113
  ### Rebuild or customize it yourself
131
114
 
132
- The published split is produced by `tools/prepare_sharegpt.py`, which downloads
115
+ The published split is produced by `tools/sharegpt/prepare.py`, which downloads
133
116
  the upstream JSON once into `.local/` (gitignored) and converts it to the JSONL
134
117
  shape above. Run it when you want a subset, different filtering, or a refresh:
135
118
 
136
119
  ```bash
137
120
  # Defaults: .local/sharegpt_v3_raw.json -> .local/sharegpt_v3.jsonl
138
- python tools/prepare_sharegpt.py
121
+ python tools/sharegpt/prepare.py
139
122
 
140
123
  # A quick subset for smoke tests:
141
- python tools/prepare_sharegpt.py --max-items 2000
124
+ python tools/sharegpt/prepare.py --max-items 2000
142
125
  ```
143
126
 
144
127
  The raw download is ~700 MB. Use `--min-chars` / `--max-chars` to drop empty or
@@ -147,7 +130,7 @@ row). Point any workload at the local file with `JsonlWorkload(path=...,
147
130
  field="messages")`, or on the CLI:
148
131
 
149
132
  ```bash
150
- bench-maker llm \
133
+ benchmaker llm \
151
134
  --url http://localhost:8000/v1/chat/completions \
152
135
  --model meta-llama/Llama-3.1-8B-Instruct \
153
136
  --prompts-jsonl .local/sharegpt_v3.jsonl \
@@ -157,7 +140,7 @@ bench-maker llm \
157
140
  --out-dir ./runs --label dataset=sharegpt
158
141
  ```
159
142
 
160
- To re-publish after regenerating, `tools/upload_sharegpt_hf.py` pushes the
143
+ To re-publish after regenerating, `tools/sharegpt/upload_hf.py` pushes the
161
144
  JSONL back to the Hub (needs a write token).
162
145
 
163
146
  ## Documentation
@@ -174,6 +157,41 @@ Full docs live in [`docs/`](docs/):
174
157
  - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
175
158
  - [CLI & YAML reference](docs/cli-and-yaml.md)
176
159
  - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
160
+ - `benchmaker sglang` — native SGLang `/generate` benchmark (see [`docs/sglang.md`](docs/sglang.md)).
161
+ - `benchmaker trajectory-replay` — multi-turn prefix-cache parity replay of
162
+ trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
163
+
164
+ ## Deterministic replay (`swebench-replay`)
165
+
166
+ Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
167
+ real pi + sandbox + verifier pipeline still runs, only the model is served back
168
+ from recorded outputs, so re-runs are deterministic and free of model
169
+ cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
170
+ pipeline without the model's stochasticity as a confound. Still needs
171
+ `FLASH_SANDBOX_URL` (the sandbox + verifier are real).
172
+
173
+ ```bash
174
+ # 1) (optional) convert a job's pi logs to a replay store — the recipe can also
175
+ # do this inline via --job.
176
+ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
177
+ -o replay-trajectories.jsonl
178
+
179
+ # 2) replay (host mode, localhost) across a concurrency sweep
180
+ FLASH_SANDBOX_URL=http://localhost:8080 \
181
+ benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
182
+ --mode pi-host --sweep 1,5,25
183
+
184
+ # container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
185
+ FLASH_SANDBOX_URL=http://localhost:8080 \
186
+ benchmaker swebench-replay --job jobs/2026-06-08__05-24-01_b352cb \
187
+ --mode pi-container --host 0.0.0.0 --reachable-host "$(hostname -I | awk '{print $1}')"
188
+ ```
189
+
190
+ The replay server is stateless: it picks each response by the task's identity
191
+ (the `# Task:` line, falling back to a hash of the full prompt when the recorded
192
+ run lacked an instance id) plus the count of assistant messages already in the
193
+ request — so it is correct at any concurrency. A `MISSES` column in the summary
194
+ flags any divergence (a request beyond the recorded turns).
177
195
 
178
196
  ## Examples
179
197
 
@@ -190,19 +208,29 @@ Under [`examples/`](examples/):
190
208
  - `config.yaml` — generic HTTP YAML config
191
209
  - `config_llm.yaml` — LLM YAML config with a Prometheus monitor
192
210
 
193
- Helper scripts under [`tools/`](tools/):
211
+ Helper tooling under [`tools/`](tools/), grouped by purpose:
194
212
 
195
- - `prepare_sharegpt.py` fetch ShareGPT V3 and convert to a generic JSONL
196
- - `upload_sharegpt_hf.py` — push the converted JSONL to the HF Hub (write token)
197
- - `start_local_llm.sh` example local SGLang launch command
213
+ - `sharegpt/` — `prepare.py` (fetch ShareGPT V3 JSONL) + `upload_hf.py`
214
+ (push to the HF Hub with a write token)
215
+ - `swe_images/` — mirror SWE-bench/R2E-Gym container images to ghcr
216
+ (`publish.py`) and list the published refs (`pull.py`)
217
+ - `agent_warmup/` — build the agent-warmup SFT dataset
218
+ (`python -m tools.agent_warmup.cli`)
219
+ - `start_local_llm.sh` — example local SGLang launch command
198
220
 
199
221
  ## Project layout
200
222
 
201
223
  ```
202
224
  benchmaker/ # library code
203
- entrypoints/ # CLI (bench-maker)
204
- examples/ # runnable examples
205
- tools/ # one-off helper scripts (dataset prep, etc.)
225
+ __init__.py # public API (re-exports); cli.py — the `benchmaker` CLI
226
+ config.py env.py # YAML config loading + .env interpolation
227
+ core/ # engine: types, load models, runner, metrics, monitors, trace
228
+ io/ # run output: per-run bundle + cross-run collection
229
+ workloads/ # workload-types (http, llm, sglang, sandbox, trajectory, agent, hf, eval)
230
+ recipes/ # CLI recipes (http, llm, sandbox, sglang, swebench, swebench-replay, trajectory-replay) + registry
231
+ swebench/ # SWE-bench coding agent + grading + harbor adapters
232
+ examples/ # runnable examples (incl. swebench/ coding-agent config)
233
+ tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
206
234
  tests/ # pytest smoke tests
207
235
  docs/ # reference docs
208
236
  ```
@@ -1,6 +1,6 @@
1
1
  """benchmaker: async HTTP benchmarking with pluggable workload-types + workloads (datasets)."""
2
2
 
3
- from benchmaker.types import (
3
+ from benchmaker.core.types import (
4
4
  Request,
5
5
  Response,
6
6
  Sample,
@@ -19,6 +19,8 @@ from benchmaker.workloads.http import HttpWorkloadType
19
19
  from benchmaker.workloads.llm import OpenAIChatWorkloadType
20
20
  from benchmaker.workloads.sandbox import SandboxWorkloadType
21
21
  from benchmaker.workloads.hf import HFDatasetWorkload
22
+ from benchmaker.workloads.sglang import SGLangGenerateWorkloadType
23
+ from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
22
24
  from benchmaker.workloads.agent import (
23
25
  Agent,
24
26
  AgentContext,
@@ -41,7 +43,7 @@ from benchmaker.workloads.eval import (
41
43
  judge_llm,
42
44
  openai_chat_judge,
43
45
  )
44
- from benchmaker.load import (
46
+ from benchmaker.core.load import (
45
47
  LoadModel,
46
48
  ConstantRPS,
47
49
  PoissonRPS,
@@ -51,21 +53,21 @@ from benchmaker.load import (
51
53
  parse_rate_spec,
52
54
  )
53
55
  from benchmaker.env import interpolate, load_dotenv
54
- from benchmaker.monitors import (
56
+ from benchmaker.core.monitors import (
55
57
  Monitor,
56
58
  FunctionMonitor,
57
59
  PrometheusMonitor,
58
60
  parse_prometheus,
59
61
  )
60
- from benchmaker.runner import BenchRunner, BenchConfig, BenchResult
61
- from benchmaker.trace import (
62
+ from benchmaker.core.runner import BenchRunner, BenchConfig, BenchResult
63
+ from benchmaker.core.trace import (
62
64
  ReplayWorkloadType,
63
65
  TracePacedLoad,
64
66
  TraceRecorder,
65
67
  TraceWorkload,
66
68
  load_trace,
67
69
  )
68
- from benchmaker.bundle import (
70
+ from benchmaker.io.bundle import (
69
71
  BUNDLE_VERSION,
70
72
  RunMeta,
71
73
  default_run_id,
@@ -87,6 +89,8 @@ __all__ = [
87
89
  "OpenAIChatWorkloadType",
88
90
  "SandboxWorkloadType",
89
91
  "HFDatasetWorkload",
92
+ "SGLangGenerateWorkloadType",
93
+ "TrajectoryReplayWorkload",
90
94
  # agent workload (pluggable user-defined agents)
91
95
  "Agent",
92
96
  "AgentContext",
@@ -149,4 +153,4 @@ __all__ = [
149
153
  "write_bundle",
150
154
  ]
151
155
 
152
- __version__ = "0.1.0"
156
+ __version__ = "0.1.1"
@@ -0,0 +1,215 @@
1
+ """benchmaker CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ import sys
9
+
10
+ import click
11
+ import yaml
12
+
13
+ from benchmaker.config import build_config
14
+ from benchmaker.core.runner import BenchRunner
15
+ from benchmaker.recipes import all_recipes
16
+ from benchmaker.recipes._cli_shared import (
17
+ output_options as _output_options,
18
+ parse_headers as _parse_headers,
19
+ write_bundle_if_requested as _write_bundle_if_requested,
20
+ )
21
+ from benchmaker.recipes._factory import make_command
22
+
23
+
24
+ # ---------------------------------------------------------------- main
25
+
26
+
27
+ @click.group()
28
+ @click.option("--log-level", default="INFO",
29
+ type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False),
30
+ help="Logging level (default: INFO).")
31
+ def main(log_level: str) -> None:
32
+ """[benchmaker]: async HTTP benchmarking with pluggable workloads."""
33
+ level = log_level.upper()
34
+ logging.basicConfig(
35
+ level=level,
36
+ format="%(asctime)s [%(name)s] %(message)s",
37
+ datefmt="%H:%M:%S",
38
+ )
39
+ # Chatty third-party loggers (one INFO line per HTTP request / hub fetch)
40
+ # drown out our own output. Pin them to WARNING unless DEBUG was requested.
41
+ if level != "DEBUG":
42
+ for noisy in ("httpx", "httpcore", "urllib3", "huggingface_hub",
43
+ "filelock", "fsspec", "datasets", "aiohttp"):
44
+ logging.getLogger(noisy).setLevel(logging.WARNING)
45
+
46
+
47
+ @main.command()
48
+ @click.argument("config_path", type=click.Path(exists=True, dir_okay=False))
49
+ @_output_options
50
+ @click.option("--dotenv", type=click.Path(), default=".env",
51
+ help="Path to .env file to load (default: .env). "
52
+ "Use --dotenv '' to disable.")
53
+ @click.option("--record", "record_path", type=click.Path(), default=None,
54
+ help="Write a JSONL request trace (with relative timestamps) to "
55
+ "this path. A later run can replay it deterministically via "
56
+ "a 'replay:' config block. Overrides any 'record:' in YAML.")
57
+ @click.option("--replay", "replay_path", type=click.Path(exists=True, dir_okay=False),
58
+ default=None,
59
+ help="Replay a previously recorded trace at the same relative "
60
+ "timings. Overrides 'workload_type' / 'workload' / 'load' "
61
+ "(and any 'replay:' in YAML).")
62
+ @click.option("--replay-speed", type=float, default=None,
63
+ help="Speed multiplier for --replay (default 1.0).")
64
+ @click.option("--quiet", is_flag=True, help="Suppress progress output.")
65
+ def run(config_path: str, out_dir: str | None, run_id: str | None,
66
+ labels: tuple[str, ...], notes: str, dotenv: str,
67
+ record_path: str | None, replay_path: str | None,
68
+ replay_speed: float | None, quiet: bool) -> None:
69
+ """Run a benchmark from a YAML config file.
70
+
71
+ Environment variables (loaded from `.env` by default) are interpolated
72
+ into the YAML using `${VAR}` or `${VAR:-default}` syntax.
73
+ """
74
+ with open(config_path) as f:
75
+ raw_cfg = yaml.safe_load(f)
76
+
77
+ if record_path is not None:
78
+ raw_cfg = {**raw_cfg, "record": {"path": record_path}}
79
+ if replay_path is not None:
80
+ replay_cfg: dict = {"path": replay_path}
81
+ if replay_speed is not None:
82
+ replay_cfg["speed"] = replay_speed
83
+ raw_cfg = {**raw_cfg, "replay": replay_cfg}
84
+
85
+ bench_cfg = build_config(raw_cfg, dotenv_path=(dotenv or None))
86
+ if quiet:
87
+ bench_cfg.progress_every_s = 0.0
88
+
89
+ runner = BenchRunner(bench_cfg)
90
+ asyncio.run(runner.run())
91
+ runner.metrics.render(sys.stdout)
92
+ _write_bundle_if_requested(runner, raw_cfg, out_dir, run_id, labels, notes)
93
+
94
+
95
+ @main.command()
96
+ @click.option("--url", required=True, help="Target URL.")
97
+ @click.option("--method", default="GET")
98
+ @click.option("--header", "-H", multiple=True, help="Header 'Name: value'.")
99
+ @click.option("--json-body", default=None, help="JSON body string.")
100
+ @click.option("--data", default=None, help="Raw body string.")
101
+ @click.option("--rate", default="10", help="Load spec, e.g. '100', 'poisson:100', "
102
+ "'closed:32', 'ramp:10..500:30s'.")
103
+ @click.option("--duration", default="10s", help="Run duration (e.g. '30s', '2m').")
104
+ @click.option("--max-requests", type=int, default=None)
105
+ @click.option("--timeout", "timeout_s", default=60.0, type=float)
106
+ @click.option("--connection-limit", default=1000, type=int)
107
+ @_output_options
108
+ @click.option("--quiet", is_flag=True)
109
+ def quick(url: str, method: str, header: tuple[str, ...], json_body: str | None,
110
+ data: str | None, rate: str, duration: str, max_requests: int | None,
111
+ timeout_s: float, connection_limit: int,
112
+ out_dir: str | None, run_id: str | None,
113
+ labels: tuple[str, ...], notes: str, quiet: bool) -> None:
114
+ """[deprecated] One-liner HTTP benchmark — use `benchmaker http` instead."""
115
+ sys.stderr.write(
116
+ "[benchmaker] 'quick' is deprecated; use 'benchmaker http'.\n"
117
+ )
118
+ cfg: dict = {
119
+ "workload_type": {
120
+ "type": "http",
121
+ "url": url,
122
+ "method": method,
123
+ "headers": _parse_headers(header),
124
+ "timeout_s": timeout_s,
125
+ },
126
+ "load": rate,
127
+ "duration": duration,
128
+ "max_requests": max_requests,
129
+ "timeout_s": timeout_s,
130
+ "connection_limit": connection_limit,
131
+ }
132
+ if json_body is not None:
133
+ cfg["workload"] = {"type": "static", "items": [json.loads(json_body)]}
134
+ elif data is not None:
135
+ cfg["workload"] = {"type": "static", "items": [data.encode("utf-8")]}
136
+
137
+ bench_cfg = build_config(cfg)
138
+ if quiet:
139
+ bench_cfg.progress_every_s = 0.0
140
+
141
+ runner = BenchRunner(bench_cfg)
142
+ asyncio.run(runner.run())
143
+ runner.metrics.render(sys.stdout)
144
+ _write_bundle_if_requested(runner, cfg, out_dir, run_id, labels, notes)
145
+
146
+
147
+ # ---------------------------------------------------------------- collect
148
+
149
+
150
+ @main.command()
151
+ @click.argument("paths", nargs=-1, required=True,
152
+ type=click.Path(exists=True, file_okay=False))
153
+ @click.option("--format", "fmt", type=click.Choice(["md", "csv", "json"]),
154
+ default="md", show_default=True,
155
+ help="Output format. 'md' is a Markdown table, 'csv' is comma-separated, "
156
+ "'json' is a JSON array of row dicts.")
157
+ @click.option("--metric", "metrics", multiple=True,
158
+ help="Extra dotted-path metric to add as a column "
159
+ "(e.g. 'workload_metrics.ttft_s.p50'). Repeatable.")
160
+ @click.option("--columns", default=None,
161
+ help="Comma-separated list of column names to keep (after metrics are added). "
162
+ "Overrides the default column set.")
163
+ @click.option("--sort-by", default=None,
164
+ help="Column name to sort rows by (ascending).")
165
+ @click.option("--label", "label_keys", multiple=True,
166
+ help="Promote a meta.labels[<key>] entry into its own column. Repeatable.")
167
+ @click.option("--recursive/--no-recursive", default=True,
168
+ help="When a path is a directory of run-dirs, descend one level to find them.")
169
+ def collect(paths: tuple[str, ...], fmt: str, metrics: tuple[str, ...],
170
+ columns: str | None, sort_by: str | None,
171
+ label_keys: tuple[str, ...], recursive: bool) -> None:
172
+ """Collect summaries from one or more run-dirs into a table.
173
+
174
+ Each PATH may be a run directory (containing meta.json + summary.json) or a
175
+ directory of such run-dirs. With --recursive (default), a non-bundle
176
+ directory is scanned for immediate subdirectories that are bundles.
177
+ """
178
+ from benchmaker.io.collect import collect_table, format_table, find_bundles
179
+
180
+ bundle_dirs: list[str] = []
181
+ for p in paths:
182
+ bundle_dirs.extend(find_bundles(p, recursive=recursive))
183
+ if not bundle_dirs:
184
+ raise click.UsageError(
185
+ f"No run bundles found under: {', '.join(paths)}. "
186
+ "Run bundles must contain meta.json and summary.json."
187
+ )
188
+
189
+ rows, column_names = collect_table(
190
+ bundle_dirs,
191
+ extra_metrics=list(metrics),
192
+ label_keys=list(label_keys),
193
+ )
194
+ if columns:
195
+ column_names = [c.strip() for c in columns.split(",") if c.strip()]
196
+ if sort_by:
197
+ rows.sort(key=lambda r: (r.get(sort_by) is None, r.get(sort_by)))
198
+
199
+ sys.stdout.write(format_table(rows, column_names, fmt))
200
+ if fmt != "json":
201
+ sys.stdout.write("\n")
202
+
203
+
204
+ # ---------------------------------------------------------------- recipes
205
+ #
206
+ # Each registered recipe (http, llm, sandbox, swebench, ...) is exposed as a
207
+ # `benchmaker <recipe> --args` subcommand, built from the recipe's options plus
208
+ # the shared load/output options. See benchmaker/recipes/.
209
+
210
+ for _recipe in all_recipes():
211
+ main.add_command(make_command(_recipe))
212
+
213
+
214
+ if __name__ == "__main__":
215
+ main()
@@ -20,9 +20,9 @@ import importlib
20
20
  from typing import Any, Callable, Optional
21
21
 
22
22
  from benchmaker.env import interpolate, load_dotenv
23
- from benchmaker.load import parse_duration, parse_rate_spec
24
- from benchmaker.monitors import FunctionMonitor, Monitor, PrometheusMonitor
25
- from benchmaker.runner import BenchConfig
23
+ from benchmaker.core.load import parse_duration, parse_rate_spec
24
+ from benchmaker.core.monitors import FunctionMonitor, Monitor, PrometheusMonitor
25
+ from benchmaker.core.runner import BenchConfig
26
26
  from benchmaker.workloads.base import WorkloadType
27
27
  from benchmaker.workloads.datasets import (
28
28
  CallableWorkload,
@@ -47,7 +47,7 @@ from benchmaker.workloads.eval import (
47
47
  openai_chat_judge,
48
48
  regex_match,
49
49
  )
50
- from benchmaker.trace import (
50
+ from benchmaker.core.trace import (
51
51
  ReplayWorkloadType,
52
52
  TracePacedLoad,
53
53
  TraceRecorder,
@@ -88,6 +88,9 @@ def build_workload_type(spec: dict) -> WorkloadType:
88
88
  return HttpWorkloadType(**kwargs)
89
89
  if t in ("openai", "openai-chat", "llm-chat", "llm"):
90
90
  return OpenAIChatWorkloadType(**kwargs)
91
+ if t in ("sglang", "sglang-generate"):
92
+ from benchmaker.workloads.sglang import SGLangGenerateWorkloadType
93
+ return SGLangGenerateWorkloadType(**kwargs)
91
94
  if t in ("sandbox", "flash-sandbox"):
92
95
  return SandboxWorkloadType(**kwargs)
93
96
  if t == "agent":
@@ -151,6 +154,9 @@ def build_workload(spec: Any) -> Workload:
151
154
  return CallableWorkload(fn=fn, **kwargs)
152
155
  if t in ("hf", "huggingface"):
153
156
  return HFDatasetWorkload(**kwargs)
157
+ if t == "trajectory":
158
+ from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
159
+ return TrajectoryReplayWorkload(**kwargs)
154
160
  raise ValueError(f"Unknown workload type {t!r}")
155
161
 
156
162
 
@@ -0,0 +1,2 @@
1
+ """benchmaker engine internals: data types, load models, the run loop,
2
+ metrics aggregation, periodic monitors, and trace record/replay."""
@@ -9,7 +9,7 @@ from collections import Counter, defaultdict
9
9
  from dataclasses import dataclass, field
10
10
  from typing import Optional, TextIO
11
11
 
12
- from benchmaker.types import Sample
12
+ from benchmaker.core.types import Sample
13
13
 
14
14
 
15
15
  def _pct(xs: list[float], p: float) -> float: