benchmaker 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchmaker-0.1.2 → benchmaker-0.1.3}/PKG-INFO +40 -11
- {benchmaker-0.1.2 → benchmaker-0.1.3}/README.md +37 -10
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/__init__.py +1 -1
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/config.py +7 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/swebench_replay.py +66 -22
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/trajectory_replay.py +45 -6
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/harbor_eval.py +6 -1
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/pi_agent.py +174 -35
- benchmaker-0.1.3/benchmaker/swebench/pi_ext/register_provider.js +65 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/pi_ext/remote_exec.js +6 -2
- benchmaker-0.1.3/benchmaker/swebench/pi_ext/remote_exec_all.js +231 -0
- benchmaker-0.1.3/benchmaker/swebench/replay_server.py +448 -0
- benchmaker-0.1.3/benchmaker/swebench/timeout_load.py +107 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/trajectory.py +148 -3
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/datasets.py +11 -0
- benchmaker-0.1.3/benchmaker/workloads/trajectory.py +437 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/PKG-INFO +40 -11
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/SOURCES.txt +9 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/requires.txt +3 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/pyproject.toml +2 -1
- benchmaker-0.1.3/tests/test_backfill_trajectory_status.py +93 -0
- benchmaker-0.1.3/tests/test_collect_trajectories.py +441 -0
- benchmaker-0.1.3/tests/test_dedupe_trajectories.py +167 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_pi_agent.py +98 -0
- benchmaker-0.1.3/tests/test_pi_agent_timeout_injection.py +47 -0
- benchmaker-0.1.3/tests/test_replay_server.py +454 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_swebench_replay_recipe.py +23 -3
- benchmaker-0.1.3/tests/test_timeout_load.py +177 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_trajectory.py +184 -0
- benchmaker-0.1.3/tests/test_trajectory_interleave.py +284 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_trajectory_replay.py +43 -0
- benchmaker-0.1.2/benchmaker/swebench/replay_server.py +0 -206
- benchmaker-0.1.2/benchmaker/workloads/trajectory.py +0 -209
- benchmaker-0.1.2/tests/test_replay_server.py +0 -133
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/cli.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/__init__.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/load.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/metrics.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/monitors.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/runner.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/trace.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/core/types.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/env.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/io/__init__.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/io/bundle.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/io/collect.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/__init__.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/_cli_shared.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/_factory.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/base.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/http.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/llm.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/sandbox.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/sglang.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/recipes/swebench.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/__init__.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/_flash_hardening.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/grading.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/harbor_agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/observability.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/swebench/pi_ext/max_turns.js +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/__init__.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/base.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/eval.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/hf.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/http.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/llm.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/sandbox.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker/workloads/sglang.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/dependency_links.txt +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/entry_points.txt +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/benchmaker.egg-info/top_level.txt +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/setup.cfg +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_agent_warmup.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_bundle.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_coding_agent.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_eval.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_flash_hardening.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_hf.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_observability.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_passthrough_meta.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_recipes_cli.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_sandbox_duration.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_sglang.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_smoke.py +0 -0
- {benchmaker-0.1.2 → benchmaker-0.1.3}/tests/test_trace.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchmaker
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
|
|
5
5
|
Author: Xiaozhe Yao
|
|
6
6
|
License: MIT
|
|
@@ -18,6 +18,8 @@ Requires-Dist: rich>=13; extra == "rich"
|
|
|
18
18
|
Provides-Extra: hf
|
|
19
19
|
Requires-Dist: datasets>=2.18; extra == "hf"
|
|
20
20
|
Requires-Dist: transformers>=4.40; extra == "hf"
|
|
21
|
+
Provides-Extra: tokenizer
|
|
22
|
+
Requires-Dist: transformers>=4.40; extra == "tokenizer"
|
|
21
23
|
Provides-Extra: dev
|
|
22
24
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
23
25
|
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
@@ -73,8 +75,8 @@ asyncio.run(main())
|
|
|
73
75
|
```
|
|
74
76
|
|
|
75
77
|
Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
|
|
76
|
-
`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
|
|
77
|
-
`trajectory-replay`):
|
|
78
|
+
`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
|
|
79
|
+
`swebench-replay`, `sglang`, `trajectory-replay`):
|
|
78
80
|
|
|
79
81
|
```bash
|
|
80
82
|
benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
|
|
@@ -185,16 +187,15 @@ Full docs live in [`docs/`](docs/):
|
|
|
185
187
|
- [Correctness / accuracy eval](docs/eval.md) — grade responses against references
|
|
186
188
|
- [CLI & YAML reference](docs/cli-and-yaml.md)
|
|
187
189
|
- [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
|
|
188
|
-
-
|
|
189
|
-
-
|
|
190
|
-
trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
|
|
190
|
+
- [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
|
|
191
|
+
- [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
|
|
191
192
|
|
|
192
193
|
## Deterministic replay (`swebench-replay`)
|
|
193
194
|
|
|
194
195
|
Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
|
|
195
196
|
real pi + sandbox + verifier pipeline still runs, only the model is served back
|
|
196
197
|
from recorded outputs, so re-runs are deterministic and free of model
|
|
197
|
-
cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
|
|
198
|
+
cost/variance. Vary `--concurrency` (or `--concurrency-sweep`) to study the rest of the
|
|
198
199
|
pipeline without the model's stochasticity as a confound. Still needs
|
|
199
200
|
`FLASH_SANDBOX_URL` (the sandbox + verifier are real).
|
|
200
201
|
|
|
@@ -207,7 +208,7 @@ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
|
|
|
207
208
|
# 2) replay (host mode, localhost) across a concurrency sweep
|
|
208
209
|
FLASH_SANDBOX_URL=http://localhost:8080 \
|
|
209
210
|
benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
|
|
210
|
-
--mode pi-host --sweep 1,5,25
|
|
211
|
+
--mode pi-host --concurrency-sweep 1,5,25
|
|
211
212
|
|
|
212
213
|
# container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
|
|
213
214
|
FLASH_SANDBOX_URL=http://localhost:8080 \
|
|
@@ -221,6 +222,18 @@ run lacked an instance id) plus the count of assistant messages already in the
|
|
|
221
222
|
request — so it is correct at any concurrency. A `MISSES` column in the summary
|
|
222
223
|
flags any divergence (a request beyond the recorded turns).
|
|
223
224
|
|
|
225
|
+
The standalone replay server can also **mock realistic streaming** for
|
|
226
|
+
latency-sensitive benchmarks. Pass a real tokenizer and a per-token delay; the
|
|
227
|
+
first token is emitted immediately (prefill free, TTFT≈0) and each subsequent
|
|
228
|
+
token is spaced by `--inter-token-time` ms. Output stays byte-exact and the
|
|
229
|
+
reported `usage` is the recorded value.
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
pip install 'benchmaker[tokenizer]' # adds transformers for the tokenizer
|
|
233
|
+
python -m benchmaker.swebench.replay_server replay-trajectories.jsonl \
|
|
234
|
+
--tokenizer zai-org/GLM-4.7-Flash --inter-token-time 50
|
|
235
|
+
```
|
|
236
|
+
|
|
224
237
|
## Examples
|
|
225
238
|
|
|
226
239
|
Under [`examples/`](examples/):
|
|
@@ -228,9 +241,12 @@ Under [`examples/`](examples/):
|
|
|
228
241
|
- `simple_get.py` — minimal library usage
|
|
229
242
|
- `custom_hooks.py` — request signing + response parsing
|
|
230
243
|
- `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
|
|
244
|
+
- `llm_from_env.py` — LLM benchmark using `from_env()`
|
|
231
245
|
- `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
|
|
246
|
+
- `agent_trove.py` — user-defined agent benchmark
|
|
232
247
|
- `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
|
|
233
248
|
- `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
|
|
249
|
+
- `bench_sandbox.py` / `bench_sandbox.sh` — sandbox benchmarks
|
|
234
250
|
- `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
|
|
235
251
|
- `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
|
|
236
252
|
- `config.yaml` — generic HTTP YAML config
|
|
@@ -254,9 +270,22 @@ benchmaker/ # library code
|
|
|
254
270
|
config.py env.py # YAML config loading + .env interpolation
|
|
255
271
|
core/ # engine: types, load models, runner, metrics, monitors, trace
|
|
256
272
|
io/ # run output: per-run bundle + cross-run collection
|
|
257
|
-
workloads/
|
|
258
|
-
|
|
259
|
-
|
|
273
|
+
workloads/
|
|
274
|
+
http.py # HTTP workload-type
|
|
275
|
+
llm.py # OpenAI-compatible chat workload-type
|
|
276
|
+
sandbox.py # Flash Sandbox workload-type
|
|
277
|
+
sglang.py # SGLang native /generate workload-type
|
|
278
|
+
agent.py # user-defined Agent workload-type
|
|
279
|
+
trajectory.py # multi-turn trajectory replay workload
|
|
280
|
+
eval.py # correctness/accuracy evaluation
|
|
281
|
+
hf.py # HuggingFace dataset source
|
|
282
|
+
datasets.py # generic workload/dataset base classes
|
|
283
|
+
base.py # WorkloadType base class
|
|
284
|
+
recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay, sglang, trajectory-replay) + registry
|
|
285
|
+
swebench/
|
|
286
|
+
trajectory.py # convert pi logs to replay trajectories
|
|
287
|
+
replay_server.py # mock-LLM replay server for swebench-replay
|
|
288
|
+
agent.py # SWE-bench coding agent + grading + harbor adapters
|
|
260
289
|
examples/ # runnable examples (incl. swebench/ coding-agent config)
|
|
261
290
|
tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
|
|
262
291
|
tests/ # pytest smoke tests
|
|
@@ -45,8 +45,8 @@ asyncio.run(main())
|
|
|
45
45
|
```
|
|
46
46
|
|
|
47
47
|
Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
|
|
48
|
-
`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
|
|
49
|
-
`trajectory-replay`):
|
|
48
|
+
`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`,
|
|
49
|
+
`swebench-replay`, `sglang`, `trajectory-replay`):
|
|
50
50
|
|
|
51
51
|
```bash
|
|
52
52
|
benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
|
|
@@ -157,16 +157,15 @@ Full docs live in [`docs/`](docs/):
|
|
|
157
157
|
- [Correctness / accuracy eval](docs/eval.md) — grade responses against references
|
|
158
158
|
- [CLI & YAML reference](docs/cli-and-yaml.md)
|
|
159
159
|
- [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
|
|
160
|
-
-
|
|
161
|
-
-
|
|
162
|
-
trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
|
|
160
|
+
- [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
|
|
161
|
+
- [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
|
|
163
162
|
|
|
164
163
|
## Deterministic replay (`swebench-replay`)
|
|
165
164
|
|
|
166
165
|
Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
|
|
167
166
|
real pi + sandbox + verifier pipeline still runs, only the model is served back
|
|
168
167
|
from recorded outputs, so re-runs are deterministic and free of model
|
|
169
|
-
cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
|
|
168
|
+
cost/variance. Vary `--concurrency` (or `--concurrency-sweep`) to study the rest of the
|
|
170
169
|
pipeline without the model's stochasticity as a confound. Still needs
|
|
171
170
|
`FLASH_SANDBOX_URL` (the sandbox + verifier are real).
|
|
172
171
|
|
|
@@ -179,7 +178,7 @@ python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
|
|
|
179
178
|
# 2) replay (host mode, localhost) across a concurrency sweep
|
|
180
179
|
FLASH_SANDBOX_URL=http://localhost:8080 \
|
|
181
180
|
benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
|
|
182
|
-
--mode pi-host --sweep 1,5,25
|
|
181
|
+
--mode pi-host --concurrency-sweep 1,5,25
|
|
183
182
|
|
|
184
183
|
# container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
|
|
185
184
|
FLASH_SANDBOX_URL=http://localhost:8080 \
|
|
@@ -193,6 +192,18 @@ run lacked an instance id) plus the count of assistant messages already in the
|
|
|
193
192
|
request — so it is correct at any concurrency. A `MISSES` column in the summary
|
|
194
193
|
flags any divergence (a request beyond the recorded turns).
|
|
195
194
|
|
|
195
|
+
The standalone replay server can also **mock realistic streaming** for
|
|
196
|
+
latency-sensitive benchmarks. Pass a real tokenizer and a per-token delay; the
|
|
197
|
+
first token is emitted immediately (prefill free, TTFT≈0) and each subsequent
|
|
198
|
+
token is spaced by `--inter-token-time` ms. Output stays byte-exact and the
|
|
199
|
+
reported `usage` is the recorded value.
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
pip install 'benchmaker[tokenizer]' # adds transformers for the tokenizer
|
|
203
|
+
python -m benchmaker.swebench.replay_server replay-trajectories.jsonl \
|
|
204
|
+
--tokenizer zai-org/GLM-4.7-Flash --inter-token-time 50
|
|
205
|
+
```
|
|
206
|
+
|
|
196
207
|
## Examples
|
|
197
208
|
|
|
198
209
|
Under [`examples/`](examples/):
|
|
@@ -200,9 +211,12 @@ Under [`examples/`](examples/):
|
|
|
200
211
|
- `simple_get.py` — minimal library usage
|
|
201
212
|
- `custom_hooks.py` — request signing + response parsing
|
|
202
213
|
- `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
|
|
214
|
+
- `llm_from_env.py` — LLM benchmark using `from_env()`
|
|
203
215
|
- `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
|
|
216
|
+
- `agent_trove.py` — user-defined agent benchmark
|
|
204
217
|
- `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
|
|
205
218
|
- `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
|
|
219
|
+
- `bench_sandbox.py` / `bench_sandbox.sh` — sandbox benchmarks
|
|
206
220
|
- `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
|
|
207
221
|
- `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
|
|
208
222
|
- `config.yaml` — generic HTTP YAML config
|
|
@@ -226,9 +240,22 @@ benchmaker/ # library code
|
|
|
226
240
|
config.py env.py # YAML config loading + .env interpolation
|
|
227
241
|
core/ # engine: types, load models, runner, metrics, monitors, trace
|
|
228
242
|
io/ # run output: per-run bundle + cross-run collection
|
|
229
|
-
workloads/
|
|
230
|
-
|
|
231
|
-
|
|
243
|
+
workloads/
|
|
244
|
+
http.py # HTTP workload-type
|
|
245
|
+
llm.py # OpenAI-compatible chat workload-type
|
|
246
|
+
sandbox.py # Flash Sandbox workload-type
|
|
247
|
+
sglang.py # SGLang native /generate workload-type
|
|
248
|
+
agent.py # user-defined Agent workload-type
|
|
249
|
+
trajectory.py # multi-turn trajectory replay workload
|
|
250
|
+
eval.py # correctness/accuracy evaluation
|
|
251
|
+
hf.py # HuggingFace dataset source
|
|
252
|
+
datasets.py # generic workload/dataset base classes
|
|
253
|
+
base.py # WorkloadType base class
|
|
254
|
+
recipes/ # CLI recipes (http, llm, sandbox, swebench, swebench-replay, sglang, trajectory-replay) + registry
|
|
255
|
+
swebench/
|
|
256
|
+
trajectory.py # convert pi logs to replay trajectories
|
|
257
|
+
replay_server.py # mock-LLM replay server for swebench-replay
|
|
258
|
+
agent.py # SWE-bench coding agent + grading + harbor adapters
|
|
232
259
|
examples/ # runnable examples (incl. swebench/ coding-agent config)
|
|
233
260
|
tools/ # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
|
|
234
261
|
tests/ # pytest smoke tests
|
|
@@ -407,6 +407,13 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
407
407
|
workload_type, extra_post = apply_correctness(workload_type, correctness_spec)
|
|
408
408
|
post_hooks = list(post_hooks) + list(extra_post)
|
|
409
409
|
|
|
410
|
+
# A workload that schedules on per-request completion (e.g. interleaved
|
|
411
|
+
# trajectory replay) declares the post-hook it needs; install it so a YAML
|
|
412
|
+
# config can't silently stall waiting for a signal it never wired up.
|
|
413
|
+
wl_hook = workload.completion_hook()
|
|
414
|
+
if wl_hook is not None and wl_hook not in post_hooks:
|
|
415
|
+
post_hooks = list(post_hooks) + [wl_hook]
|
|
416
|
+
|
|
410
417
|
recorder = _build_recorder(cfg.get("record"))
|
|
411
418
|
|
|
412
419
|
return BenchConfig(
|
|
@@ -5,7 +5,7 @@ Builds a replay store from recorded pi logs (or loads a prebuilt
|
|
|
5
5
|
`replay-trajectories.jsonl`), starts the stateless replay server in-process, and
|
|
6
6
|
runs the *real* harbor SWE-bench pipeline (pi + sandbox + verifier) with the
|
|
7
7
|
model endpoint pointed at the replay server — at one ``--concurrency`` or a
|
|
8
|
-
``--sweep`` of them. The LLM is the only thing mocked; everything else runs for
|
|
8
|
+
``--concurrency-sweep`` of them. The LLM is the only thing mocked; everything else runs for
|
|
9
9
|
real, so re-runs are deterministic and free of model cost/variance.
|
|
10
10
|
|
|
11
11
|
Still requires ``FLASH_SANDBOX_URL`` (the sandbox + verifier are real). For
|
|
@@ -54,28 +54,30 @@ def _parse_concurrencies(sweep: Optional[str], concurrency: int) -> list[int]:
|
|
|
54
54
|
return [int(x.strip()) for x in sweep.split(",") if x.strip()]
|
|
55
55
|
|
|
56
56
|
|
|
57
|
-
def _resolve_task_filter(task, store) -> tuple[list[str], int]:
|
|
57
|
+
def _resolve_task_filter(task, exclude_task, store) -> tuple[list[str], int]:
|
|
58
58
|
"""Which dataset tasks to run, and how many trajectories can't be targeted.
|
|
59
59
|
|
|
60
60
|
Default to exactly the recorded tasks (each trajectory's instance_id) so
|
|
61
61
|
harbor replays only what we have trajectories for — otherwise it would run
|
|
62
62
|
the whole ``--dataset`` and every task without a recording becomes a replay
|
|
63
63
|
miss. An explicit ``--task`` wins (the user is narrowing on purpose).
|
|
64
|
+
``--exclude-task`` drops the named id(s) from the resolved set.
|
|
64
65
|
Returns ``(task_ids, n_missing_instance_id)``."""
|
|
65
|
-
|
|
66
|
+
excluded = set(exclude_task)
|
|
67
|
+
explicit = [t for t in task if t not in excluded]
|
|
66
68
|
if explicit:
|
|
67
69
|
return explicit, 0
|
|
68
|
-
ids = sorted({t.instance_id for t in store.values()
|
|
70
|
+
ids = sorted({t.instance_id for t in store.values()
|
|
71
|
+
if t.instance_id and t.instance_id not in excluded})
|
|
69
72
|
missing = sum(1 for t in store.values() if not t.instance_id)
|
|
70
73
|
return ids, missing
|
|
71
74
|
|
|
72
|
-
|
|
73
75
|
class SWEBenchReplayRecipe(Recipe):
|
|
74
76
|
name = "swebench-replay"
|
|
75
77
|
help = (
|
|
76
78
|
"Replay recorded SWE-bench trajectories deterministically: mock the LLM "
|
|
77
79
|
"with recorded outputs, run the real pi+sandbox+verifier pipeline at one "
|
|
78
|
-
"--concurrency or a --sweep. Requires FLASH_SANDBOX_URL."
|
|
80
|
+
"--concurrency or a --concurrency-sweep. Requires FLASH_SANDBOX_URL."
|
|
79
81
|
)
|
|
80
82
|
wants_load_options = False
|
|
81
83
|
|
|
@@ -88,12 +90,20 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
88
90
|
help="Prebuilt replay-trajectories.jsonl (instead of --job)."),
|
|
89
91
|
click.option("--concurrency", type=int, default=4, show_default=True,
|
|
90
92
|
help="Concurrent trials (harbor n_concurrent_trials)."),
|
|
91
|
-
click.option("--sweep", default=None,
|
|
93
|
+
click.option("--concurrency-sweep", "concurrency_sweep", default=None,
|
|
92
94
|
help="Comma list of concurrencies to run in sequence, "
|
|
93
95
|
"e.g. '1,5,25' (overrides --concurrency)."),
|
|
94
96
|
click.option("--mode", type=click.Choice(["pi-host", "pi-container"]),
|
|
95
97
|
default="pi-host", show_default=True,
|
|
96
98
|
help="pi run mode (the harbor agent key)."),
|
|
99
|
+
click.option("--route-tools", "route_tools",
|
|
100
|
+
type=click.Choice(["all", "bash"]),
|
|
101
|
+
default="all", show_default=True,
|
|
102
|
+
help="pi-host: which tools to route into the sandbox. "
|
|
103
|
+
"'all' routes bash+read+write+edit (matches how "
|
|
104
|
+
"trajectories are recorded); 'bash' routes only bash "
|
|
105
|
+
"(file edits hit the host fs and are lost on replay). "
|
|
106
|
+
"Ignored for pi-container (pi runs in the sandbox)."),
|
|
97
107
|
click.option("--host", default="127.0.0.1", show_default=True,
|
|
98
108
|
help="Replay server bind host (use 0.0.0.0 for container mode)."),
|
|
99
109
|
click.option("--port", type=int, default=9100, show_default=True,
|
|
@@ -107,11 +117,21 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
107
117
|
"trajectory's model."),
|
|
108
118
|
click.option("--dataset", default="swebench-verified", show_default=True,
|
|
109
119
|
help="Harbor dataset slug."),
|
|
120
|
+
click.option("--exec-timeout-sec", "exec_timeout_sec", type=float,
|
|
121
|
+
default=None,
|
|
122
|
+
help="pi-host: real per-command timeout (seconds) passed "
|
|
123
|
+
"to environment.exec for every routed tool call "
|
|
124
|
+
"(default 600). Lower it to surface real sandbox "
|
|
125
|
+
"slowness/hangs under load. Ignored for pi-container "
|
|
126
|
+
"(pi runs as one process with no per-command timeout)."),
|
|
110
127
|
click.option("--n-tasks", "n_tasks", type=int, default=None,
|
|
111
128
|
help="Cap the number of recorded tasks to replay "
|
|
112
129
|
"(applied on top of the recorded-task filter)."),
|
|
113
130
|
click.option("--task", multiple=True,
|
|
114
131
|
help="Restrict to specific task name(s)/glob(s). Repeatable."),
|
|
132
|
+
click.option("--exclude-task", "exclude_task", multiple=True,
|
|
133
|
+
help="Drop specific task id(s) from the replay set. "
|
|
134
|
+
"Repeatable."),
|
|
115
135
|
click.option("--n-attempts", "n_attempts", type=int, default=1,
|
|
116
136
|
show_default=True, help="Attempts per task."),
|
|
117
137
|
click.option("--timeout-multiplier", "timeout_multiplier", type=float,
|
|
@@ -129,15 +149,22 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
129
149
|
click.option("--timeline/--no-timeline", "timeline", default=True,
|
|
130
150
|
show_default=True,
|
|
131
151
|
help="Capture timeline/utilization/tokens into the job dir."),
|
|
152
|
+
click.option("--validate-observations/--no-validate-observations",
|
|
153
|
+
"validate_observations", default=False, show_default=True,
|
|
154
|
+
help="Fail-fast on environment divergence: compare each "
|
|
155
|
+
"step's tool-result status against the recording and "
|
|
156
|
+
"stop the agent at the first mismatch. Requires a "
|
|
157
|
+
"trajectory store recorded with tool_results."),
|
|
132
158
|
click.option("--utilization-interval-sec", "utilization_interval_sec",
|
|
133
159
|
type=float, default=5.0, show_default=True),
|
|
134
160
|
]
|
|
135
161
|
|
|
136
|
-
def run(self, shared: SharedOpts, *, job, trajectories, concurrency,
|
|
137
|
-
|
|
162
|
+
def run(self, shared: SharedOpts, *, job, trajectories, concurrency,
|
|
163
|
+
concurrency_sweep, mode, route_tools, host, port, reachable_host, model,
|
|
164
|
+
dataset, exec_timeout_sec, n_tasks, task, exclude_task, n_attempts,
|
|
138
165
|
timeout_multiplier, backend_type, request_timeout_sec,
|
|
139
166
|
agent_ready_timeout_sec, jobs_dir, timeline,
|
|
140
|
-
utilization_interval_sec) -> Optional[int]:
|
|
167
|
+
utilization_interval_sec, validate_observations) -> Optional[int]:
|
|
141
168
|
from benchmaker.swebench import harbor_eval as he
|
|
142
169
|
from benchmaker.swebench import trajectory as T
|
|
143
170
|
|
|
@@ -180,7 +207,7 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
180
207
|
raise click.UsageError("--model required (no model recorded in trajectories).")
|
|
181
208
|
|
|
182
209
|
# Run exactly the recorded tasks, not the whole dataset (see helper).
|
|
183
|
-
task_filter, n_missing = _resolve_task_filter(task, store)
|
|
210
|
+
task_filter, n_missing = _resolve_task_filter(task, exclude_task, store)
|
|
184
211
|
if n_missing:
|
|
185
212
|
click.echo(f"warning: {n_missing} trajectories have no instance_id "
|
|
186
213
|
f"and cannot be targeted; they will be skipped.")
|
|
@@ -190,18 +217,34 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
190
217
|
"cannot select which tasks to replay.")
|
|
191
218
|
|
|
192
219
|
replay_url = _replay_url(host, port, reachable_host)
|
|
193
|
-
concurrencies = _parse_concurrencies(sweep, concurrency)
|
|
220
|
+
concurrencies = _parse_concurrencies(concurrency_sweep, concurrency)
|
|
194
221
|
click.echo(f"replay: {len(store)} trajectories, {len(task_filter)} tasks, "
|
|
195
222
|
f"model={run_model}, agent={mode}, url={replay_url}, "
|
|
196
223
|
f"concurrencies={concurrencies}")
|
|
197
224
|
|
|
225
|
+
# pi-host edits the sandbox over a bridge; the file tools (read/write/edit)
|
|
226
|
+
# only land in the sandbox when routed (route_tools=all), which is how the
|
|
227
|
+
# trajectories were recorded. With the agent default (bash-only) those
|
|
228
|
+
# recorded edits replay against the host fs and silently no-op. pi-container
|
|
229
|
+
# runs pi inside the sandbox, so the kwarg does not apply.
|
|
230
|
+
agent_kwargs = [f"route_tools={route_tools}"] if mode == "pi-host" else []
|
|
231
|
+
# Real per-command sandbox timeout. Only pi-host routes each tool call
|
|
232
|
+
# through environment.exec(timeout_sec=...); pi-container runs as one
|
|
233
|
+
# process with no per-command budget, so the flag is a no-op there.
|
|
234
|
+
if exec_timeout_sec is not None:
|
|
235
|
+
if mode == "pi-host":
|
|
236
|
+
agent_kwargs.append(f"exec_timeout_s={exec_timeout_sec}")
|
|
237
|
+
else:
|
|
238
|
+
click.echo("warning: --exec-timeout-sec is ignored for "
|
|
239
|
+
"pi-container (no per-command timeout).")
|
|
240
|
+
|
|
198
241
|
# Static harbor config shared by every sweep iteration; only `concurrency`
|
|
199
242
|
# and `job_name` vary per run (set inside `_run_one`).
|
|
200
243
|
base_ns = argparse.Namespace(
|
|
201
244
|
dataset=dataset, agent=mode, model=run_model,
|
|
202
245
|
api_key="replay",
|
|
203
|
-
agent_kwarg=
|
|
204
|
-
n_tasks=n_tasks, task=task_filter,
|
|
246
|
+
agent_kwarg=agent_kwargs, agent_config_file=None,
|
|
247
|
+
n_tasks=n_tasks, task=task_filter, exclude_task=None,
|
|
205
248
|
n_attempts=n_attempts, timeout_multiplier=timeout_multiplier,
|
|
206
249
|
force_build=False, backend_type=backend_type,
|
|
207
250
|
request_timeout_sec=request_timeout_sec,
|
|
@@ -214,20 +257,21 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
214
257
|
for c in concurrencies:
|
|
215
258
|
results.append(asyncio.run(self._run_one(
|
|
216
259
|
store, base_ns, c, run_model, host, port, reachable_host,
|
|
217
|
-
timeline, utilization_interval_sec)))
|
|
260
|
+
timeline, utilization_interval_sec, validate_observations)))
|
|
218
261
|
finally:
|
|
219
262
|
if tmpdir is not None:
|
|
220
263
|
tmpdir.cleanup()
|
|
221
264
|
|
|
222
265
|
# Comparison table.
|
|
223
|
-
click.echo("\nCONCURRENCY ACCURACY PASS/TOTAL MISSES JOB_DIR")
|
|
224
|
-
for c, accuracy, n_pass, n_total, misses, job_dir in results:
|
|
266
|
+
click.echo("\nCONCURRENCY ACCURACY PASS/TOTAL MISSES DIVERG JOB_DIR")
|
|
267
|
+
for c, accuracy, n_pass, n_total, misses, diverg, job_dir in results:
|
|
225
268
|
click.echo(f"{c:>11} {accuracy:>7.1%} {n_pass:>4}/{n_total:<5} "
|
|
226
|
-
f"{misses:>6} {job_dir}")
|
|
269
|
+
f"{misses:>6} {diverg:>6} {job_dir}")
|
|
227
270
|
return None
|
|
228
271
|
|
|
229
272
|
async def _run_one(self, store, base_ns, concurrency, run_model, host, port,
|
|
230
|
-
reachable_host, timeline, utilization_interval_sec
|
|
273
|
+
reachable_host, timeline, utilization_interval_sec,
|
|
274
|
+
validate_observations) -> tuple:
|
|
231
275
|
"""Serve `store` on host:port and run one harbor job at `concurrency`.
|
|
232
276
|
|
|
233
277
|
Binds a fresh listener per call (pass --port 0 for an ephemeral port,
|
|
@@ -240,9 +284,9 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
240
284
|
|
|
241
285
|
from benchmaker.swebench import harbor_eval as he
|
|
242
286
|
from benchmaker.swebench.observability import run_job_with_observability
|
|
243
|
-
from benchmaker.swebench.replay_server import as_app, get_misses
|
|
287
|
+
from benchmaker.swebench.replay_server import as_app, get_divergences, get_misses
|
|
244
288
|
|
|
245
|
-
app = as_app(store, model_fallback=run_model)
|
|
289
|
+
app = as_app(store, model_fallback=run_model, validate=validate_observations)
|
|
246
290
|
runner = web.AppRunner(app)
|
|
247
291
|
await runner.setup()
|
|
248
292
|
site = web.TCPSite(runner, host, port)
|
|
@@ -261,7 +305,7 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
261
305
|
rows, accuracy = he._summarise(job_result)
|
|
262
306
|
n_pass = sum(1 for r in rows if r["passed"])
|
|
263
307
|
return (concurrency, accuracy, n_pass, len(rows), get_misses(app),
|
|
264
|
-
str(job.job_dir))
|
|
308
|
+
get_divergences(app), str(job.job_dir))
|
|
265
309
|
finally:
|
|
266
310
|
await runner.cleanup()
|
|
267
311
|
|
|
@@ -3,7 +3,21 @@
|
|
|
3
3
|
Expands each trajectory into one chat request per assistant turn (growing shared
|
|
4
4
|
prefix) against an OpenAI-compatible endpoint, recording the prefix-cache parity
|
|
5
5
|
pair: meta.expected_prefix_tokens (tokenizer upper bound) vs extra.cached_tokens
|
|
6
|
-
(server actual).
|
|
6
|
+
(server actual).
|
|
7
|
+
|
|
8
|
+
Two scheduling regimes:
|
|
9
|
+
|
|
10
|
+
* **Contiguous (default)** — all of trajectory A's turns, then all of B's. Turn
|
|
11
|
+
k+1 is served within a few requests of turn k, so its history is reused while
|
|
12
|
+
still hot in the local cache. Use ``--rate closed:N`` for clean prefix-cache
|
|
13
|
+
locality (best case: locality preserved).
|
|
14
|
+
* **Interleaved** (``--concurrent-sessions N``) — keep up to N sessions active
|
|
15
|
+
and round-robin their turns, gating each session's turn k+1 on turn k's
|
|
16
|
+
completion (+ an optional ``--inter-turn-gap`` think time). Concurrent session
|
|
17
|
+
histories overflow the device KV pool, so a session's history is evicted
|
|
18
|
+
before its next turn — the multi-turn *reuse-after-eviction* regime that
|
|
19
|
+
stresses hierarchical / shared KV tiers. The in-flight ceiling defaults to
|
|
20
|
+
``closed:N`` to match the active session count.
|
|
7
21
|
"""
|
|
8
22
|
|
|
9
23
|
from __future__ import annotations
|
|
@@ -67,12 +81,24 @@ class TrajectoryReplayRecipe(Recipe):
|
|
|
67
81
|
help="Cap assistant turns replayed per trajectory."),
|
|
68
82
|
click.option("--max-trajectories", "max_trajectories", type=int,
|
|
69
83
|
default=None, help="Cap number of trajectories replayed."),
|
|
84
|
+
click.option("--concurrent-sessions", "concurrent_sessions", type=int,
|
|
85
|
+
default=None,
|
|
86
|
+
help="Interleave turns across up to N concurrent "
|
|
87
|
+
"sessions (round-robin, each session's turn k+1 "
|
|
88
|
+
"gated on turn k completing) instead of replaying "
|
|
89
|
+
"each trajectory contiguously. Enables the "
|
|
90
|
+
"reuse-after-eviction regime; defaults the rate to "
|
|
91
|
+
"closed:N."),
|
|
92
|
+
click.option("--inter-turn-gap", "inter_turn_gap", default=None,
|
|
93
|
+
help="Per-session think time between consecutive turns "
|
|
94
|
+
"(interleaved mode). E.g. 'const:2s', 'exp:1.5', "
|
|
95
|
+
"'uniform:1s..3s'. Default: no gap."),
|
|
70
96
|
]
|
|
71
97
|
|
|
72
98
|
def build(self, shared: SharedOpts, *, url, model, api_key, header, dataset,
|
|
73
99
|
prompts_jsonl, split, preset, tokenizer, messages_field, id_field,
|
|
74
|
-
model_field, max_tokens, max_turns_per_trajectory, max_trajectories
|
|
75
|
-
) -> BuildResult:
|
|
100
|
+
model_field, max_tokens, max_turns_per_trajectory, max_trajectories,
|
|
101
|
+
concurrent_sessions=None, inter_turn_gap=None) -> BuildResult:
|
|
76
102
|
from benchmaker.workloads.llm import OpenAIChatWorkloadType
|
|
77
103
|
from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
|
|
78
104
|
|
|
@@ -99,7 +125,8 @@ class TrajectoryReplayRecipe(Recipe):
|
|
|
99
125
|
messages_field=messages_field, id_field=id_field,
|
|
100
126
|
model_field=model_field, max_tokens=max_tokens,
|
|
101
127
|
max_turns_per_trajectory=max_turns_per_trajectory,
|
|
102
|
-
max_trajectories=max_trajectories, tokenizer=tokenizer
|
|
128
|
+
max_trajectories=max_trajectories, tokenizer=tokenizer,
|
|
129
|
+
concurrent_sessions=concurrent_sessions, inter_turn_gap=inter_turn_gap)
|
|
103
130
|
|
|
104
131
|
source_config = {
|
|
105
132
|
"workload_type": {"type": "openai-chat", "url": wt._url,
|
|
@@ -111,14 +138,26 @@ class TrajectoryReplayRecipe(Recipe):
|
|
|
111
138
|
"model_field": model_field, "tokenizer": tokenizer,
|
|
112
139
|
"max_tokens": max_tokens,
|
|
113
140
|
"max_trajectories": max_trajectories,
|
|
114
|
-
"max_turns_per_trajectory": max_turns_per_trajectory
|
|
141
|
+
"max_turns_per_trajectory": max_turns_per_trajectory,
|
|
142
|
+
"concurrent_sessions": concurrent_sessions,
|
|
143
|
+
"inter_turn_gap": inter_turn_gap},
|
|
115
144
|
}
|
|
145
|
+
|
|
146
|
+
# Interleaved mode needs a per-turn completion signal to gate each
|
|
147
|
+
# session's next turn; wire the workload's post-hook and default the
|
|
148
|
+
# in-flight ceiling to the active session count.
|
|
149
|
+
hook = workload.completion_hook()
|
|
150
|
+
post_hooks: list = [hook] if hook is not None else []
|
|
151
|
+
default_rate = ("closed:8" if concurrent_sessions is None
|
|
152
|
+
else f"closed:{concurrent_sessions}")
|
|
153
|
+
|
|
116
154
|
# Finite dataset: replay once. The workload raises StopAsyncIteration when
|
|
117
155
|
# exhausted, which halts the run; default to closed-loop with a long
|
|
118
156
|
# nominal duration so exhaustion (not the clock) ends it.
|
|
119
157
|
return BuildResult(
|
|
120
158
|
workload_type=wt, workload=workload, source_config=source_config,
|
|
121
|
-
|
|
159
|
+
post_hooks=post_hooks,
|
|
160
|
+
default_rate=default_rate, default_duration="24h")
|
|
122
161
|
|
|
123
162
|
|
|
124
163
|
register(TrajectoryReplayRecipe())
|
|
@@ -209,7 +209,8 @@ def _build_job_config(args: argparse.Namespace) -> JobConfig:
|
|
|
209
209
|
)
|
|
210
210
|
|
|
211
211
|
dataset = DatasetConfig(name=args.dataset, n_tasks=args.n_tasks,
|
|
212
|
-
task_names=args.task or None
|
|
212
|
+
task_names=args.task or None,
|
|
213
|
+
exclude_task_names=args.exclude_task or None)
|
|
213
214
|
|
|
214
215
|
# Parent directory for the run bundle (harbor writes to <jobs_dir>/<job_name>).
|
|
215
216
|
# Omit when unset so harbor keeps its own default of "jobs".
|
|
@@ -301,6 +302,10 @@ def _parse_args() -> argparse.Namespace:
|
|
|
301
302
|
help="Cap the number of dataset tasks.")
|
|
302
303
|
p.add_argument("--task", action="append", default=[],
|
|
303
304
|
help="Restrict to specific task name(s)/glob(s) (repeatable).")
|
|
305
|
+
p.add_argument("--exclude-task", action="append", default=[],
|
|
306
|
+
help="Skip specific task name(s)/glob(s) (repeatable). Applied "
|
|
307
|
+
"after --task and before the --n-tasks cap, so the cap "
|
|
308
|
+
"selects the first N tasks that remain after exclusion.")
|
|
304
309
|
p.add_argument("--concurrency", type=int, default=4)
|
|
305
310
|
p.add_argument("--n-attempts", type=int, default=1)
|
|
306
311
|
p.add_argument("--timeout-multiplier", type=float, default=4.0,
|