benchflow 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchflow-0.2.2/.git +1 -0
- benchflow-0.2.2/.github/workflows/test.yml +38 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/.gitignore +3 -0
- benchflow-0.2.2/.pre-commit-config.yaml +22 -0
- benchflow-0.2.2/CHANGELOG.md +86 -0
- benchflow-0.2.2/CLAUDE.md +31 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/PKG-INFO +39 -100
- {benchflow-0.2.0 → benchflow-0.2.2}/README.md +36 -99
- benchflow-0.2.2/benchmarks/skillsbench-claude-glm5.yaml +10 -0
- benchflow-0.2.2/docs/architecture.md +265 -0
- benchflow-0.2.2/docs/cli-reference.md +283 -0
- benchflow-0.2.2/docs/getting-started.md +295 -0
- benchflow-0.2.2/docs/labs.md +88 -0
- benchflow-0.2.2/docs/task-authoring.md +219 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/README.md +153 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/_attack_runner.py +74 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/comparison.ipynb +172 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/Dockerfile +9 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/conftest_exploit.py +18 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/instruction.md +1 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/solution/solve.sh +26 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/task.toml +17 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test.sh +19 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py +13 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/environment/Dockerfile +3 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/instruction.md +3 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/answer.txt +1 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/solve.sh +15 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/task.toml +17 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/ground_truth.txt +1 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/test.sh +10 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/environment/Dockerfile +4 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/instruction.md +1 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/solution/solve.sh +28 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/task.toml +17 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test.sh +10 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py +12 -0
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/run_comparison.py +201 -0
- benchflow-0.2.2/labs/reward-hack-matrix/.gitignore +6 -0
- benchflow-0.2.2/labs/reward-hack-matrix/README.md +119 -0
- benchflow-0.2.2/labs/reward-hack-matrix/_runner.py +82 -0
- benchflow-0.2.2/labs/reward-hack-matrix/_worker.py +179 -0
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_hook.sh +87 -0
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_payload.py +33 -0
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/path_trojan.sh +56 -0
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/pth_injection.sh +122 -0
- benchflow-0.2.2/labs/reward-hack-matrix/fetch_corpora.sh +53 -0
- benchflow-0.2.2/labs/reward-hack-matrix/run_matrix.py +758 -0
- benchflow-0.2.2/labs/reward-hack-matrix/sweep_0.2.0_vs_0.2.2.json +7994 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/pyproject.toml +39 -2
- benchflow-0.2.2/src/benchflow/__init__.py +124 -0
- benchflow-0.2.2/src/benchflow/_acp_run.py +115 -0
- benchflow-0.2.2/src/benchflow/_agent_env.py +189 -0
- benchflow-0.2.2/src/benchflow/_agent_setup.py +121 -0
- benchflow-0.2.2/src/benchflow/_credentials.py +145 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/_env_setup.py +36 -11
- benchflow-0.2.2/src/benchflow/_sandbox.py +448 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/_scoring.py +15 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/_trajectory.py +37 -25
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/acp/client.py +10 -4
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/acp/container_transport.py +2 -1
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/acp/transport.py +3 -2
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/acp/types.py +4 -4
- benchflow-0.2.2/src/benchflow/agents/__init__.py +31 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/agents/openclaw_acp_shim.py +239 -163
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/agents/providers.py +78 -29
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/agents/registry.py +70 -13
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/cli/main.py +83 -33
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/environments.py +11 -6
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/job.py +175 -40
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/metrics.py +72 -21
- benchflow-0.2.0/src/benchflow/_models.py → benchflow-0.2.2/src/benchflow/models.py +39 -10
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/process.py +67 -33
- benchflow-0.2.2/src/benchflow/py.typed +0 -0
- benchflow-0.2.2/src/benchflow/sdk.py +677 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/skills.py +7 -3
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/task_download.py +8 -2
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/tasks.py +8 -4
- benchflow-0.2.2/src/benchflow/trajectories/__init__.py +42 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/trajectories/otel.py +5 -5
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/trajectories/proxy.py +11 -3
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/viewer.py +6 -3
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/conftest.py +0 -1
- benchflow-0.2.2/tests/examples/hello-world-task/instruction.md +5 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/examples/hello-world-task/task.toml +1 -1
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/examples/hello-world-task/tests/test.sh +2 -2
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/examples/test_claude.sh +4 -4
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/examples/test_codex.sh +2 -2
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/examples/test_gemini.sh +2 -2
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/examples/test_openclaw.sh +2 -2
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/fixtures/mock_acp_agent.py +0 -1
- benchflow-0.2.2/tests/fixtures/mock_acp_agent_interleaved.py +140 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_acp.py +67 -47
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_agent_model_decouple.py +12 -5
- benchflow-0.2.2/tests/test_agent_registry.py +62 -0
- benchflow-0.2.0/tests/test_trajectory.py → benchflow-0.2.2/tests/test_atif_trajectory.py +10 -32
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_capture_trajectory.py +77 -55
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_env_setup.py +141 -6
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_exclude_tasks.py +1 -52
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_job.py +98 -29
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_metrics.py +120 -81
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_process.py +18 -5
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_providers.py +45 -158
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_reexport.py +36 -25
- benchflow-0.2.2/tests/test_registry_invariants.py +235 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_resolve_env_helpers.py +85 -42
- benchflow-0.2.2/tests/test_sandbox_hardening.py +915 -0
- benchflow-0.2.2/tests/test_sandbox_verifier_workspace.py +167 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_scoring.py +10 -2
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_sdk_internals.py +20 -82
- benchflow-0.2.2/tests/test_sdk_lockdown.py +278 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_skills.py +4 -11
- benchflow-0.2.2/tests/test_smoke.py +157 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_subscription_auth.py +17 -35
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_tasks.py +8 -4
- benchflow-0.2.2/tests/test_verify.py +665 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_yaml_config.py +61 -1
- {benchflow-0.2.0 → benchflow-0.2.2}/uv.lock +118 -1
- benchflow-0.2.0/CHANGELOG.md +0 -112
- benchflow-0.2.0/CLAUDE.md +0 -83
- benchflow-0.2.0/docs/sdk-refactor-notes.md +0 -304
- benchflow-0.2.0/docs/sdk-reference.md +0 -252
- benchflow-0.2.0/docs/tested-agents.md +0 -51
- benchflow-0.2.0/src/benchflow/__init__.py +0 -48
- benchflow-0.2.0/src/benchflow/agents/__init__.py +0 -2
- benchflow-0.2.0/src/benchflow/sdk.py +0 -808
- benchflow-0.2.0/src/benchflow/trajectories/__init__.py +0 -24
- benchflow-0.2.0/tests/examples/hello-world-task/instruction.md +0 -5
- benchflow-0.2.0/tests/fixtures/mock_acp_agent_interleaved.py +0 -124
- benchflow-0.2.0/tests/test_credentials.py +0 -69
- benchflow-0.2.0/tests/test_dep_staging.py +0 -128
- benchflow-0.2.0/tests/test_env_mapping.py +0 -86
- {benchflow-0.2.0 → benchflow-0.2.2}/.devcontainer/Dockerfile +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/.devcontainer/devcontainer.json +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/.env.sample +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/.python-version +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/LICENSE +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/benchmarks/run_skillsbench.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/benchmarks/run_tb2.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/benchmarks/skillsbench-codex-gpt54.yaml +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/benchmarks/tb2_multiturn-codex-gpt54.yaml +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/benchmarks/tb2_single-codex-gpt54.yaml +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/agents/user_agent.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/trajectories/atif.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/trajectories/claude_code.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/__init__.py +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.2.0 → benchflow-0.2.2}/tests/test_sandbox.py +0 -0
benchflow-0.2.2/.git
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
gitdir: /workspace/.git/modules/repos/benchflow/worktrees/benchflow-main
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: test
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
|
|
15
|
+
- name: Install uv
|
|
16
|
+
uses: astral-sh/setup-uv@v3
|
|
17
|
+
with:
|
|
18
|
+
enable-cache: true
|
|
19
|
+
|
|
20
|
+
- name: Set up Python
|
|
21
|
+
run: uv python install 3.12
|
|
22
|
+
|
|
23
|
+
- name: Install dependencies
|
|
24
|
+
run: |
|
|
25
|
+
uv venv -p 3.12 .venv
|
|
26
|
+
uv pip install -e ".[dev]"
|
|
27
|
+
|
|
28
|
+
- name: Lint
|
|
29
|
+
run: .venv/bin/ruff check src tests
|
|
30
|
+
|
|
31
|
+
- name: Format check
|
|
32
|
+
run: .venv/bin/ruff format --check src tests
|
|
33
|
+
|
|
34
|
+
- name: Type check
|
|
35
|
+
run: .venv/bin/ty check
|
|
36
|
+
|
|
37
|
+
- name: Test
|
|
38
|
+
run: .venv/bin/python -m pytest tests/
|
|
@@ -130,6 +130,7 @@ celerybeat.pid
|
|
|
130
130
|
# Environments
|
|
131
131
|
.env
|
|
132
132
|
.venv
|
|
133
|
+
.venvs/
|
|
133
134
|
env/
|
|
134
135
|
venv/
|
|
135
136
|
ENV/
|
|
@@ -175,6 +176,8 @@ cython_debug/
|
|
|
175
176
|
.ref/
|
|
176
177
|
trials/
|
|
177
178
|
jobs/
|
|
179
|
+
.jobs/
|
|
178
180
|
dogfood/
|
|
179
181
|
tmp/
|
|
180
182
|
.claude/settings.local.json
|
|
183
|
+
tests/.smoke-jobs/
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Run on staged files at commit time. Mirrors what CI runs so format/lint
|
|
2
|
+
# failures are caught locally before push. Install once per clone:
|
|
3
|
+
#
|
|
4
|
+
# uv pip install -e ".[dev]" && pre-commit install
|
|
5
|
+
#
|
|
6
|
+
# Bypass with --no-verify only if you know what you're doing; CI will still
|
|
7
|
+
# gate the same checks. Pinned to ruff 0.15.7 to match pyproject.toml's
|
|
8
|
+
# ruff>=0.7.0 floor and avoid format-rule drift between hook and CI.
|
|
9
|
+
#
|
|
10
|
+
# Scoped to src/ and tests/ to match what CI actually checks
|
|
11
|
+
# (`.github/workflows/*` runs `ruff format --check src tests`). benchmarks/
|
|
12
|
+
# is intentionally excluded — out of CI scope, would silently expand the
|
|
13
|
+
# format gate beyond what's enforced upstream.
|
|
14
|
+
repos:
|
|
15
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
16
|
+
rev: v0.15.7
|
|
17
|
+
hooks:
|
|
18
|
+
- id: ruff-format
|
|
19
|
+
files: ^(src|tests)/
|
|
20
|
+
- id: ruff-check
|
|
21
|
+
files: ^(src|tests)/
|
|
22
|
+
args: [--fix]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [Unreleased]
|
|
4
|
+
|
|
5
|
+
## 0.2.2 — 2026-04-13
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
|
|
9
|
+
- **Sandbox hardening tiers 1–3** — layered defense (env scrubbing, path lockdown, workspace
|
|
10
|
+
freeze, wider snapshot, oracle privilege drop) blocking F1–F6 red-team findings.
|
|
11
|
+
- **`labs/reward-hack-matrix`** — per-trial timeout support and 0.2.2 sweep handoff scripts.
|
|
12
|
+
|
|
13
|
+
### Fixed
|
|
14
|
+
|
|
15
|
+
- Multiple sandbox bypass vectors identified in red-team testing.
|
|
16
|
+
|
|
17
|
+
## 0.2.1 — 2026-04-12
|
|
18
|
+
|
|
19
|
+
### Added
|
|
20
|
+
|
|
21
|
+
- **Sandbox hardening on by default** — `sandbox_user` now defaults to `"agent"` (was `None`/root). Blocks conftest-hook and answer-lookup exploit patterns.
|
|
22
|
+
- **Path lockdown** — new `sandbox_locked_paths` parameter makes `/solution` and `/tests` read-only before the verifier runs, blocking `.pth`-injection and similar pre-verify tampering.
|
|
23
|
+
- **Verifier failure isolation** — agent errors and verifier errors are now stored separately; a crashing verifier no longer masks the agent result.
|
|
24
|
+
- **`labs/benchjack-sandbox-hardening`** — cookbook demonstrating three exploit patterns (P1 conftest-hook, P2 answer-lookup, P7 `.pth`-injection) and their defenses.
|
|
25
|
+
|
|
26
|
+
### Fixed
|
|
27
|
+
|
|
28
|
+
- **Oracle runs as `sandbox_user`** — oracle agent now respects path lockdown instead of running as root and bypassing it.
|
|
29
|
+
- **Multi-endpoint provider routing** — providers with multiple endpoints now route by the agent's native API protocol.
|
|
30
|
+
- **Stale API key shadowing subscription auth** — emits a warning when `ANTHROPIC_API_KEY` env var is present alongside `claude login` credentials.
|
|
31
|
+
- **pytest `ini`-injection bypass** — closed a verifier hardening edge case.
|
|
32
|
+
|
|
33
|
+
### Changed
|
|
34
|
+
|
|
35
|
+
- Version is now single-sourced via `importlib.metadata`; no more duplicate version string in `__init__.py`.
|
|
36
|
+
- **User-facing docs** — new `docs/` directory with getting-started guide, CLI reference, architecture overview, task-authoring guide, and labs index. README trimmed; detailed content moved to `docs/`.
|
|
37
|
+
|
|
38
|
+
## 0.2.0 — 2026-04-09
|
|
39
|
+
|
|
40
|
+
**First public release.** A near-complete rearchitecture from the 0.1.x era. API surface has changed — assume breaking changes. Future releases will maintain compatibility within the 0.2.x line. 0.1.x users should treat this as a fresh install; see `.dev-docs/sdk-reference.md` for the new SDK.
|
|
41
|
+
|
|
42
|
+
### Added
|
|
43
|
+
|
|
44
|
+
- **Multi-agent, multi-provider, multi-auth matrix** — one YAML config, any supported agent × model × provider × auth combination.
|
|
45
|
+
- **Subscription auth support** — use `claude login`, `codex --login`, `gemini` OAuth credentials directly. No API keys required for host-based agent workflows.
|
|
46
|
+
- **Vertex AI support** — ADC auth for `google-vertex/`, `anthropic-vertex/`, `vertex-zai/` prefixed models.
|
|
47
|
+
- **Provider registry** — add a new LLM endpoint via a dict entry in `providers.py`, no code changes.
|
|
48
|
+
- **`benchmarks/` directory** with reusable YAML configs and runner scripts for TB2 and SkillsBench.
|
|
49
|
+
- **Auto task download** via `ensure_tasks()` — `terminal-bench-2` and `skillsbench` clone into `.ref/` on first run.
|
|
50
|
+
- **`benchflow tasks init`** — scaffold new tasks.
|
|
51
|
+
- **`benchflow tasks check`** — validate task structure.
|
|
52
|
+
- **`benchflow cleanup`** — delete old sandboxes with `--max-age` filtering (default 24h).
|
|
53
|
+
- **Oracle agent support** — run `solution/solve.sh` directly for task validation.
|
|
54
|
+
- **Hello-world-task example** for sanity-testing the agent pipeline.
|
|
55
|
+
- **Model generation params** via env vars (`BENCHFLOW_TEMPERATURE`, `BENCHFLOW_TOP_P`, `BENCHFLOW_MAX_TOKENS`).
|
|
56
|
+
- **OpenClaw ACP shim** with trajectory parsing and skills support.
|
|
57
|
+
- **ACP trajectory capture** — full multi-turn agent trajectories via ACP protocol.
|
|
58
|
+
|
|
59
|
+
### Changed
|
|
60
|
+
|
|
61
|
+
- **Skill loading** — agent-targeted with proper precedence; auto-distributed from `task.toml` `skills_dir`.
|
|
62
|
+
- **`openclaw-gemini` merged** into `openclaw` — provider mode selected at runtime via `BENCHFLOW_PROVIDER_NAME`.
|
|
63
|
+
|
|
64
|
+
### Fixed
|
|
65
|
+
|
|
66
|
+
- **API keys leaking in `ps aux`** — env vars now written inside the container instead of passed via Docker exec `-e`.
|
|
67
|
+
- **Subscription auth skipped without `-m`** — `benchflow run` without `--model` now checks correctly.
|
|
68
|
+
- **ADC credentials break with `sandbox_user`** (#111) — credentials written to sandbox user's home instead of `/root/`.
|
|
69
|
+
- **Daytona sandboxes not cleaned up** (#102) — auto-delete after max age.
|
|
70
|
+
- **`benchflow cleanup` ignoring `--max-age`** — was deleting everything regardless of age.
|
|
71
|
+
- **readline buffer overflow crashes trial** (#98).
|
|
72
|
+
- **OpenClaw ACP shim loses tool command text** (#96).
|
|
73
|
+
- **OpenClaw ACP shim hardcodes `anthropic/` prefix** (#95) — now routes correctly for Gemini/GLM models.
|
|
74
|
+
- **Oracle agent `PermissionError`** writing `agent/oracle.txt` (#91).
|
|
75
|
+
- **Oracle path skips `pre_agent_hooks`** (#92) — services now start before oracle runs.
|
|
76
|
+
- **Trial data parity with Harbor** (#90) — richer `result.json`, agent logs, per-phase timing.
|
|
77
|
+
- **`SDK.run()` `PermissionError`** — `jobs_dir` subdirectories created as root (#88).
|
|
78
|
+
- **Partial trajectory lost on timeout** — saved before timeout raises.
|
|
79
|
+
- **Redundant `--version` binary check** removed — was wasting 30s per trial.
|
|
80
|
+
- **Trajectory fallback** — scrapes agent-native files when ACP `session/update` is empty (#94).
|
|
81
|
+
- **`litellm` upgraded to 1.83.0** for CVE-2026-35030; transitive dep security alerts resolved (13 Dependabot alerts closed).
|
|
82
|
+
|
|
83
|
+
### Deprecated
|
|
84
|
+
|
|
85
|
+
- `BaseAgent` re-export — planned removal in 0.3.0
|
|
86
|
+
- `Trial` re-export — planned removal in 0.3.0
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# benchflow
|
|
2
|
+
|
|
3
|
+
Multi-turn agent benchmarking with ACP.
|
|
4
|
+
|
|
5
|
+
Architecture, CLI, task format: see `docs/architecture.md`, `docs/cli-reference.md`, `docs/task-authoring.md`. Internal refactor notes and SDK reference: `.dev-docs/`.
|
|
6
|
+
|
|
7
|
+
## Setup
|
|
8
|
+
|
|
9
|
+
Requires Python 3.12+. Uses `uv`.
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
uv venv -p 3.12 .venv && uv pip install -e ".[dev]"
|
|
13
|
+
.venv/bin/pre-commit install
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Test
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
.venv/bin/python -m pytest tests/ # unit (fast, no Docker)
|
|
20
|
+
.venv/bin/python -m pytest -m live tests/ # e2e (Docker + API key)
|
|
21
|
+
.venv/bin/ty check src/ # type check — also the fastest "find references" after any signature change
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
CI gates `ruff format`, `ruff check`, `pytest`, and `ty check src/`. Run all four before pushing. Live tests use Haiku 4.5 (`claude-haiku-4-5-20251001`).
|
|
25
|
+
|
|
26
|
+
## Conventions
|
|
27
|
+
|
|
28
|
+
- **Minimal fix.** Do only what was asked. "Leave as is" is a valid outcome. Generalize on the third repetition, not the first.
|
|
29
|
+
- **Registry over hardcode.** Adding an agent or provider is a dict entry in `agents/registry.py` or `providers.py` — not a new code path. The `oracle` special case in `sdk.py` exists because it bypasses the agent loop; don't add more without the same justification.
|
|
30
|
+
- **Don't rewrite passing tests.** Updating a test because the code it covers changed shape is fine. Rewriting one to match new behavior without understanding why it was written is not. No tautological tests (dataclass reads, stdlib behavior, "does it construct").
|
|
31
|
+
- **Human review before main.** Commit freely on a feature branch, open a PR. Never push to `main` directly, never force-push it.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -26,9 +26,11 @@ Requires-Dist: pyyaml>=6.0
|
|
|
26
26
|
Requires-Dist: rich>=13.0
|
|
27
27
|
Requires-Dist: typer>=0.9
|
|
28
28
|
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
29
30
|
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
30
31
|
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
31
32
|
Requires-Dist: ruff>=0.7.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: ty>=0.0.1a1; extra == 'dev'
|
|
32
34
|
Description-Content-Type: text/markdown
|
|
33
35
|
|
|
34
36
|
<div align="center">
|
|
@@ -75,85 +77,25 @@ benchflow view jobs/my-job/my-trial/
|
|
|
75
77
|
## SDK
|
|
76
78
|
|
|
77
79
|
```python
|
|
78
|
-
import asyncio
|
|
79
80
|
from benchflow import SDK, Job, JobConfig, collect_metrics
|
|
80
81
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
# Single task — API keys auto-inherited from os.environ
|
|
85
|
-
result = await sdk.run(
|
|
86
|
-
task_path="path/to/task",
|
|
87
|
-
agent="claude-agent-acp",
|
|
88
|
-
model="claude-haiku-4-5-20251001",
|
|
89
|
-
environment="daytona", # or "docker"
|
|
90
|
-
)
|
|
91
|
-
print(result.rewards) # {"reward": 1.0}
|
|
92
|
-
print(result.n_tool_calls) # 17
|
|
93
|
-
|
|
94
|
-
# Multi-turn — None = use task's instruction.md
|
|
95
|
-
result = await sdk.run(
|
|
96
|
-
task_path="path/to/task",
|
|
97
|
-
agent="claude-agent-acp",
|
|
98
|
-
prompts=[
|
|
99
|
-
None,
|
|
100
|
-
"Review your solution. Check for errors, test it, and fix any issues.",
|
|
101
|
-
],
|
|
102
|
-
environment="daytona",
|
|
103
|
-
)
|
|
104
|
-
|
|
105
|
-
# Job — run a full benchmark with concurrency and retries
|
|
106
|
-
job = Job(
|
|
107
|
-
tasks_dir="path/to/tasks",
|
|
108
|
-
jobs_dir="jobs/tb2",
|
|
109
|
-
config=JobConfig(
|
|
110
|
-
agent="claude-agent-acp",
|
|
111
|
-
model="claude-haiku-4-5-20251001",
|
|
112
|
-
environment="daytona",
|
|
113
|
-
concurrency=64,
|
|
114
|
-
),
|
|
115
|
-
)
|
|
116
|
-
result = await job.run()
|
|
117
|
-
print(f"{result.passed}/{result.total} ({result.score:.1%})")
|
|
118
|
-
|
|
119
|
-
# Metrics — aggregate results from a jobs directory
|
|
120
|
-
metrics = collect_metrics("jobs/tb2", benchmark="TB2")
|
|
121
|
-
print(metrics.summary())
|
|
122
|
-
|
|
123
|
-
asyncio.run(main())
|
|
82
|
+
result = await SDK().run(task_path="path/to/task", agent="claude-agent-acp")
|
|
83
|
+
print(result.rewards) # {"reward": 1.0}
|
|
124
84
|
```
|
|
125
85
|
|
|
86
|
+
Single task, multi-turn, full benchmark jobs, and programmatic metrics — see [docs/getting-started.md](docs/getting-started.md).
|
|
87
|
+
|
|
126
88
|
## CLI
|
|
127
89
|
|
|
128
90
|
```bash
|
|
129
|
-
|
|
130
|
-
benchflow
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
benchflow job -t tasks/ -a claude-agent-acp -c 64 -e daytona --retries 1
|
|
134
|
-
|
|
135
|
-
# List agents
|
|
136
|
-
benchflow agents
|
|
137
|
-
|
|
138
|
-
# View metrics
|
|
139
|
-
benchflow metrics jobs/tb2/ --json
|
|
140
|
-
benchflow metrics jobs/tb2/
|
|
141
|
-
|
|
142
|
-
# Evaluate a skill against tasks
|
|
143
|
-
benchflow eval -t tasks/ --skills-dir skills/ -a claude-agent-acp -e daytona
|
|
144
|
-
|
|
145
|
-
# List/install skills
|
|
146
|
-
benchflow skills
|
|
147
|
-
benchflow skills --install owner/repo@skill-name
|
|
148
|
-
|
|
149
|
-
# View trajectory
|
|
150
|
-
benchflow view jobs/tb2/my-trial/
|
|
151
|
-
|
|
152
|
-
# Create/validate tasks
|
|
153
|
-
benchflow tasks init my-task # scaffold a new task directory
|
|
154
|
-
benchflow tasks check tasks/my-task/ # validate task structure
|
|
91
|
+
benchflow run -t path/to/task -a claude-agent-acp # single task
|
|
92
|
+
benchflow job -t tasks/ -a claude-agent-acp -c 1 # benchmark job
|
|
93
|
+
benchflow metrics jobs/ # aggregate results
|
|
94
|
+
benchflow view jobs/my-job/my-trial/ # trajectory viewer
|
|
155
95
|
```
|
|
156
96
|
|
|
97
|
+
Full flag reference for all 8 subcommands: [docs/cli-reference.md](docs/cli-reference.md).
|
|
98
|
+
|
|
157
99
|
## Agents
|
|
158
100
|
|
|
159
101
|
Any [ACP-compatible agent](https://agentclientprotocol.com/get-started/agents) works. Registered agents are auto-installed in sandboxes.
|
|
@@ -163,7 +105,7 @@ benchflow agents # list registered agents
|
|
|
163
105
|
benchflow run -t task/ -a pi-acp -e daytona
|
|
164
106
|
```
|
|
165
107
|
|
|
166
|
-
See [docs/tested-agents.md](docs/tested-agents.md) for the full list of tested a…
|
|
108
|
+
See [docs/architecture.md](docs/architecture.md#registry-pattern) for the full tested agent × model/provider matrix and how to add your own.
|
|
167
109
|
|
|
168
110
|
## Environments
|
|
169
111
|
|
|
@@ -174,24 +116,9 @@ See [docs/tested-agents.md](docs/tested-agents.md) for the full list of tested a
|
|
|
174
116
|
|
|
175
117
|
## How it Works
|
|
176
118
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
| 1. Start environment (Harbor) |
|
|
181
|
-
| 2. Install ACP agent (npm) |
|
|
182
|
-
| 3. stdio pipe (exec/SSH) --------> claude-agent-acp
|
|
183
|
-
| |
|
|
184
|
-
| ACP: initialize |
|
|
185
|
-
| ACP: session/new(cwd) --------------> agent sees workspace, skills
|
|
186
|
-
| ACP: session/set_model(haiku) ------> model configured
|
|
187
|
-
| ACP: session/prompt("solve this") --> agent uses Bash, Read, Write
|
|
188
|
-
| ACP: session/update <---------------- tool calls, messages, thoughts
|
|
189
|
-
| ACP: session/prompt("test it") -----> same session, full context
|
|
190
|
-
| ACP: session/update <---------------- more tool calls
|
|
191
|
-
| |
|
|
192
|
-
| 4. Run verifier (Harbor) -----------> tests/test.sh → reward.txt
|
|
193
|
-
| 5. Stop environment |
|
|
194
|
-
```
|
|
119
|
+
BenchFlow starts a sandboxed environment, connects to the agent via ACP over a live stdio pipe, sends one or more prompts (the agent retains full context between turns), then runs the verifier and captures the full trajectory.
|
|
120
|
+
|
|
121
|
+
See [docs/architecture.md](docs/architecture.md) for SDK run phases, ACP protocol details, and the registry pattern.
|
|
195
122
|
|
|
196
123
|
## Task Format
|
|
197
124
|
|
|
@@ -208,6 +135,8 @@ my-task/
|
|
|
208
135
|
└── solution/ # optional reference solution
|
|
209
136
|
```
|
|
210
137
|
|
|
138
|
+
Full `task.toml` schema, verifier contract, and a worked example: [docs/task-authoring.md](docs/task-authoring.md).
|
|
139
|
+
|
|
211
140
|
## Results
|
|
212
141
|
|
|
213
142
|
Every run produces structured output:
|
|
@@ -227,7 +156,25 @@ jobs/{job_name}/{trial_name}/
|
|
|
227
156
|
└── reward.txt # reward value
|
|
228
157
|
```
|
|
229
158
|
|
|
230
|
-
##
|
|
159
|
+
## Benchmarks
|
|
160
|
+
|
|
161
|
+
Tasks are auto-downloaded on first run (cloned into `.ref/`).
|
|
162
|
+
|
|
163
|
+
**SkillsBench** (86 tasks — tool use, file editing, API calls):
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm5.yaml # Claude
|
|
167
|
+
python benchmarks/run_skillsbench.py benchmarks/skillsbench-codex-gpt54.yaml # Codex
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
**Terminal-Bench 2** (89 tasks — shell, git, compilers, daemons):
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
python benchmarks/run_tb2.py benchmarks/tb2_single-codex-gpt54.yaml # single-turn
|
|
174
|
+
python benchmarks/run_tb2.py benchmarks/tb2_multiturn-codex-gpt54.yaml # multi-turn
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Shipped configs use `environment: daytona` and `concurrency: 8`. For local Docker: `--env docker --concurrency 1`.
|
|
231
178
|
|
|
232
179
|
| Benchmark | Agent | Model | Score |
|
|
233
180
|
|-----------|-------|-------|-------|
|
|
@@ -247,15 +194,7 @@ Validation tasks in `.claude/skills/benchflow/tasks/` confirm agents can use the
|
|
|
247
194
|
|
|
248
195
|
## Architecture
|
|
249
196
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
- **ACP client** — multi-turn agent communication via live stdio pipe
|
|
253
|
-
- **Job orchestration** — concurrency, retries, resume, metrics
|
|
254
|
-
- **Multi-agent registry** — auto-install agents in sandboxes
|
|
255
|
-
- **Trajectory capture** — from ACP protocol
|
|
256
|
-
- **Skills** — teach agents to use BenchFlow itself
|
|
257
|
-
- **Viewer** — HTML trajectory visualization
|
|
258
|
-
- **CLI** — `run`, `job`, `agents`, `metrics`, `view`, `eval`, `skills`, `tasks`, `cleanup`
|
|
197
|
+
ACP client, job orchestration, multi-agent registry, trajectory capture, skills, viewer, and CLI — see [docs/architecture.md](docs/architecture.md).
|
|
259
198
|
|
|
260
199
|
## Citation
|
|
261
200
|
|
|
@@ -42,85 +42,25 @@ benchflow view jobs/my-job/my-trial/
|
|
|
42
42
|
## SDK
|
|
43
43
|
|
|
44
44
|
```python
|
|
45
|
-
import asyncio
|
|
46
45
|
from benchflow import SDK, Job, JobConfig, collect_metrics
|
|
47
46
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
# Single task — API keys auto-inherited from os.environ
|
|
52
|
-
result = await sdk.run(
|
|
53
|
-
task_path="path/to/task",
|
|
54
|
-
agent="claude-agent-acp",
|
|
55
|
-
model="claude-haiku-4-5-20251001",
|
|
56
|
-
environment="daytona", # or "docker"
|
|
57
|
-
)
|
|
58
|
-
print(result.rewards) # {"reward": 1.0}
|
|
59
|
-
print(result.n_tool_calls) # 17
|
|
60
|
-
|
|
61
|
-
# Multi-turn — None = use task's instruction.md
|
|
62
|
-
result = await sdk.run(
|
|
63
|
-
task_path="path/to/task",
|
|
64
|
-
agent="claude-agent-acp",
|
|
65
|
-
prompts=[
|
|
66
|
-
None,
|
|
67
|
-
"Review your solution. Check for errors, test it, and fix any issues.",
|
|
68
|
-
],
|
|
69
|
-
environment="daytona",
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
# Job — run a full benchmark with concurrency and retries
|
|
73
|
-
job = Job(
|
|
74
|
-
tasks_dir="path/to/tasks",
|
|
75
|
-
jobs_dir="jobs/tb2",
|
|
76
|
-
config=JobConfig(
|
|
77
|
-
agent="claude-agent-acp",
|
|
78
|
-
model="claude-haiku-4-5-20251001",
|
|
79
|
-
environment="daytona",
|
|
80
|
-
concurrency=64,
|
|
81
|
-
),
|
|
82
|
-
)
|
|
83
|
-
result = await job.run()
|
|
84
|
-
print(f"{result.passed}/{result.total} ({result.score:.1%})")
|
|
85
|
-
|
|
86
|
-
# Metrics — aggregate results from a jobs directory
|
|
87
|
-
metrics = collect_metrics("jobs/tb2", benchmark="TB2")
|
|
88
|
-
print(metrics.summary())
|
|
89
|
-
|
|
90
|
-
asyncio.run(main())
|
|
47
|
+
result = await SDK().run(task_path="path/to/task", agent="claude-agent-acp")
|
|
48
|
+
print(result.rewards) # {"reward": 1.0}
|
|
91
49
|
```
|
|
92
50
|
|
|
51
|
+
Single task, multi-turn, full benchmark jobs, and programmatic metrics — see [docs/getting-started.md](docs/getting-started.md).
|
|
52
|
+
|
|
93
53
|
## CLI
|
|
94
54
|
|
|
95
55
|
```bash
|
|
96
|
-
|
|
97
|
-
benchflow
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
benchflow job -t tasks/ -a claude-agent-acp -c 64 -e daytona --retries 1
|
|
101
|
-
|
|
102
|
-
# List agents
|
|
103
|
-
benchflow agents
|
|
104
|
-
|
|
105
|
-
# View metrics
|
|
106
|
-
benchflow metrics jobs/tb2/ --json
|
|
107
|
-
benchflow metrics jobs/tb2/
|
|
108
|
-
|
|
109
|
-
# Evaluate a skill against tasks
|
|
110
|
-
benchflow eval -t tasks/ --skills-dir skills/ -a claude-agent-acp -e daytona
|
|
111
|
-
|
|
112
|
-
# List/install skills
|
|
113
|
-
benchflow skills
|
|
114
|
-
benchflow skills --install owner/repo@skill-name
|
|
115
|
-
|
|
116
|
-
# View trajectory
|
|
117
|
-
benchflow view jobs/tb2/my-trial/
|
|
118
|
-
|
|
119
|
-
# Create/validate tasks
|
|
120
|
-
benchflow tasks init my-task # scaffold a new task directory
|
|
121
|
-
benchflow tasks check tasks/my-task/ # validate task structure
|
|
56
|
+
benchflow run -t path/to/task -a claude-agent-acp # single task
|
|
57
|
+
benchflow job -t tasks/ -a claude-agent-acp -c 1 # benchmark job
|
|
58
|
+
benchflow metrics jobs/ # aggregate results
|
|
59
|
+
benchflow view jobs/my-job/my-trial/ # trajectory viewer
|
|
122
60
|
```
|
|
123
61
|
|
|
62
|
+
Full flag reference for all 8 subcommands: [docs/cli-reference.md](docs/cli-reference.md).
|
|
63
|
+
|
|
124
64
|
## Agents
|
|
125
65
|
|
|
126
66
|
Any [ACP-compatible agent](https://agentclientprotocol.com/get-started/agents) works. Registered agents are auto-installed in sandboxes.
|
|
@@ -130,7 +70,7 @@ benchflow agents # list registered agents
|
|
|
130
70
|
benchflow run -t task/ -a pi-acp -e daytona
|
|
131
71
|
```
|
|
132
72
|
|
|
133
|
-
See [docs/tested-agents.md](docs/tested-agents.md) for the full list of tested a[…]
|
|
73
|
+
See [docs/architecture.md](docs/architecture.md#registry-pattern) for the full tested agent × model/provider matrix and how to add your own.
|
|
134
74
|
|
|
135
75
|
## Environments
|
|
136
76
|
|
|
@@ -141,24 +81,9 @@ See [docs/tested-agents.md](docs/tested-agents.md) for the full list of tested a
|
|
|
141
81
|
|
|
142
82
|
## How it Works
|
|
143
83
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
| 1. Start environment (Harbor) |
|
|
148
|
-
| 2. Install ACP agent (npm) |
|
|
149
|
-
| 3. stdio pipe (exec/SSH) --------> claude-agent-acp
|
|
150
|
-
| |
|
|
151
|
-
| ACP: initialize |
|
|
152
|
-
| ACP: session/new(cwd) --------------> agent sees workspace, skills
|
|
153
|
-
| ACP: session/set_model(haiku) ------> model configured
|
|
154
|
-
| ACP: session/prompt("solve this") --> agent uses Bash, Read, Write
|
|
155
|
-
| ACP: session/update <---------------- tool calls, messages, thoughts
|
|
156
|
-
| ACP: session/prompt("test it") -----> same session, full context
|
|
157
|
-
| ACP: session/update <---------------- more tool calls
|
|
158
|
-
| |
|
|
159
|
-
| 4. Run verifier (Harbor) -----------> tests/test.sh → reward.txt
|
|
160
|
-
| 5. Stop environment |
|
|
161
|
-
```
|
|
84
|
+
BenchFlow starts a sandboxed environment, connects to the agent via ACP over a live stdio pipe, sends one or more prompts (the agent retains full context between turns), then runs the verifier and captures the full trajectory.
|
|
85
|
+
|
|
86
|
+
See [docs/architecture.md](docs/architecture.md) for SDK run phases, ACP protocol details, and the registry pattern.
|
|
162
87
|
|
|
163
88
|
## Task Format
|
|
164
89
|
|
|
@@ -175,6 +100,8 @@ my-task/
|
|
|
175
100
|
└── solution/ # optional reference solution
|
|
176
101
|
```
|
|
177
102
|
|
|
103
|
+
Full `task.toml` schema, verifier contract, and a worked example: [docs/task-authoring.md](docs/task-authoring.md).
|
|
104
|
+
|
|
178
105
|
## Results
|
|
179
106
|
|
|
180
107
|
Every run produces structured output:
|
|
@@ -194,7 +121,25 @@ jobs/{job_name}/{trial_name}/
|
|
|
194
121
|
└── reward.txt # reward value
|
|
195
122
|
```
|
|
196
123
|
|
|
197
|
-
##
|
|
124
|
+
## Benchmarks
|
|
125
|
+
|
|
126
|
+
Tasks are auto-downloaded on first run (cloned into `.ref/`).
|
|
127
|
+
|
|
128
|
+
**SkillsBench** (86 tasks — tool use, file editing, API calls):
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
python benchmarks/run_skillsbench.py benchmarks/skillsbench-claude-glm5.yaml # Claude
|
|
132
|
+
python benchmarks/run_skillsbench.py benchmarks/skillsbench-codex-gpt54.yaml # Codex
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
**Terminal-Bench 2** (89 tasks — shell, git, compilers, daemons):
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
python benchmarks/run_tb2.py benchmarks/tb2_single-codex-gpt54.yaml # single-turn
|
|
139
|
+
python benchmarks/run_tb2.py benchmarks/tb2_multiturn-codex-gpt54.yaml # multi-turn
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Shipped configs use `environment: daytona` and `concurrency: 8`. For local Docker: `--env docker --concurrency 1`.
|
|
198
143
|
|
|
199
144
|
| Benchmark | Agent | Model | Score |
|
|
200
145
|
|-----------|-------|-------|-------|
|
|
@@ -214,15 +159,7 @@ Validation tasks in `.claude/skills/benchflow/tasks/` confirm agents can use the
|
|
|
214
159
|
|
|
215
160
|
## Architecture
|
|
216
161
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
- **ACP client** — multi-turn agent communication via live stdio pipe
|
|
220
|
-
- **Job orchestration** — concurrency, retries, resume, metrics
|
|
221
|
-
- **Multi-agent registry** — auto-install agents in sandboxes
|
|
222
|
-
- **Trajectory capture** — from ACP protocol
|
|
223
|
-
- **Skills** — teach agents to use BenchFlow itself
|
|
224
|
-
- **Viewer** — HTML trajectory visualization
|
|
225
|
-
- **CLI** — `run`, `job`, `agents`, `metrics`, `view`, `eval`, `skills`, `tasks`, `cleanup`
|
|
162
|
+
ACP client, job orchestration, multi-agent registry, trajectory capture, skills, viewer, and CLI — see [docs/architecture.md](docs/architecture.md).
|
|
226
163
|
|
|
227
164
|
## Citation
|
|
228
165
|
|