benchflow 0.2.2__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.2.2 → benchflow-0.3.0}/.gitignore +2 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/CHANGELOG.md +20 -0
- benchflow-0.3.0/PKG-INFO +212 -0
- benchflow-0.3.0/README.md +177 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/pyproject.toml +12 -11
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/__init__.py +37 -2
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_acp_run.py +63 -26
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_agent_env.py +9 -1
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_env_setup.py +24 -1
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_sandbox.py +337 -43
- benchflow-0.3.0/src/benchflow/_scene.py +289 -0
- benchflow-0.3.0/src/benchflow/_snapshot.py +75 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/openclaw_acp_shim.py +28 -13
- benchflow-0.3.0/src/benchflow/agents/pi_acp_launcher.py +131 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/providers.py +11 -7
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/registry.py +125 -17
- benchflow-0.3.0/src/benchflow/cli/__init__.py +1 -0
- benchflow-0.3.0/src/benchflow/cli/eval.py +371 -0
- benchflow-0.3.0/src/benchflow/cli/main.py +1039 -0
- benchflow-0.3.0/src/benchflow/demo_task/instruction.md +7 -0
- benchflow-0.3.0/src/benchflow/demo_task/task.toml +17 -0
- benchflow-0.3.0/src/benchflow/demo_task/tests/test.sh +20 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/job.py +88 -38
- benchflow-0.3.0/src/benchflow/mcp/__init__.py +5 -0
- benchflow-0.3.0/src/benchflow/mcp/hooks.py +74 -0
- benchflow-0.3.0/src/benchflow/mcp/reviewer_server.py +143 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/metrics.py +1 -2
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/process.py +0 -6
- benchflow-0.3.0/src/benchflow/runtime.py +352 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/sdk.py +89 -248
- benchflow-0.3.0/src/benchflow/skill_eval.py +696 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/skills.py +0 -3
- benchflow-0.3.0/src/benchflow/templates/__init__.py +5 -0
- benchflow-0.3.0/src/benchflow/templates/judge.py.tmpl +193 -0
- benchflow-0.3.0/src/benchflow/templates/test.sh.tmpl +12 -0
- benchflow-0.3.0/src/benchflow/trial.py +690 -0
- benchflow-0.3.0/src/benchflow/trial_yaml.py +169 -0
- benchflow-0.3.0/tests/conformance/README.md +21 -0
- benchflow-0.3.0/tests/conformance/acp_smoke/environment/Dockerfile +7 -0
- benchflow-0.3.0/tests/conformance/acp_smoke/instruction.md +7 -0
- benchflow-0.3.0/tests/conformance/acp_smoke/solution/solve.sh +4 -0
- {benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection → benchflow-0.3.0/tests/conformance/acp_smoke}/task.toml +4 -4
- benchflow-0.3.0/tests/conformance/acp_smoke/tests/test.sh +13 -0
- benchflow-0.3.0/tests/conformance/conformance-results.json +40 -0
- benchflow-0.3.0/tests/conformance/proof_multi_agent.py +165 -0
- benchflow-0.3.0/tests/conformance/proof_snapshot.py +86 -0
- benchflow-0.3.0/tests/conformance/run_conformance.py +129 -0
- benchflow-0.3.0/tests/conftest.py +78 -0
- benchflow-0.3.0/tests/examples/hello-world-task/environment/Dockerfile +7 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_acp.py +118 -2
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_agent_model_decouple.py +17 -28
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_agent_registry.py +3 -3
- benchflow-0.3.0/tests/test_agent_spec.py +80 -0
- benchflow-0.3.0/tests/test_eval_cli.py +118 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_job.py +13 -40
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_metrics.py +4 -1
- benchflow-0.3.0/tests/test_oracle.py +63 -0
- benchflow-0.3.0/tests/test_pi_acp_launcher.py +301 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_providers.py +51 -52
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_registry_invariants.py +1 -1
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_resolve_env_helpers.py +11 -16
- benchflow-0.3.0/tests/test_rewards_jsonl.py +108 -0
- benchflow-0.3.0/tests/test_runtime.py +154 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sandbox.py +0 -9
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sandbox_hardening.py +379 -93
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sandbox_verifier_workspace.py +14 -8
- benchflow-0.3.0/tests/test_scene.py +202 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_scoring.py +24 -28
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sdk_internals.py +18 -12
- benchflow-0.3.0/tests/test_skill_eval.py +404 -0
- benchflow-0.3.0/tests/test_skill_eval_dryrun.py +271 -0
- benchflow-0.3.0/tests/test_skill_eval_integration.py +338 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_skills.py +0 -12
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_tasks.py +0 -5
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_verify.py +16 -82
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_yaml_config.py +68 -5
- benchflow-0.2.2/.devcontainer/Dockerfile +0 -88
- benchflow-0.2.2/.devcontainer/devcontainer.json +0 -37
- benchflow-0.2.2/.env.sample +0 -27
- benchflow-0.2.2/.git +0 -1
- benchflow-0.2.2/.github/workflows/test.yml +0 -38
- benchflow-0.2.2/.pre-commit-config.yaml +0 -22
- benchflow-0.2.2/.python-version +0 -2
- benchflow-0.2.2/CLAUDE.md +0 -31
- benchflow-0.2.2/PKG-INFO +0 -215
- benchflow-0.2.2/README.md +0 -180
- benchflow-0.2.2/benchmarks/run_skillsbench.py +0 -25
- benchflow-0.2.2/benchmarks/run_tb2.py +0 -30
- benchflow-0.2.2/benchmarks/skillsbench-claude-glm5.yaml +0 -10
- benchflow-0.2.2/benchmarks/skillsbench-codex-gpt54.yaml +0 -10
- benchflow-0.2.2/benchmarks/tb2_multiturn-codex-gpt54.yaml +0 -10
- benchflow-0.2.2/benchmarks/tb2_single-codex-gpt54.yaml +0 -7
- benchflow-0.2.2/docs/architecture.md +0 -265
- benchflow-0.2.2/docs/cli-reference.md +0 -283
- benchflow-0.2.2/docs/getting-started.md +0 -295
- benchflow-0.2.2/docs/labs.md +0 -88
- benchflow-0.2.2/docs/task-authoring.md +0 -219
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/README.md +0 -153
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/_attack_runner.py +0 -74
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/comparison.ipynb +0 -172
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/Dockerfile +0 -9
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/conftest_exploit.py +0 -18
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/instruction.md +0 -1
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/solution/solve.sh +0 -26
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/task.toml +0 -17
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test.sh +0 -19
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py +0 -13
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/environment/Dockerfile +0 -3
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/instruction.md +0 -3
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/answer.txt +0 -1
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/solve.sh +0 -15
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/task.toml +0 -17
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/ground_truth.txt +0 -1
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/test.sh +0 -10
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/environment/Dockerfile +0 -4
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/instruction.md +0 -1
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/solution/solve.sh +0 -28
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test.sh +0 -10
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py +0 -12
- benchflow-0.2.2/labs/benchjack-sandbox-hardening/run_comparison.py +0 -201
- benchflow-0.2.2/labs/reward-hack-matrix/.gitignore +0 -6
- benchflow-0.2.2/labs/reward-hack-matrix/README.md +0 -119
- benchflow-0.2.2/labs/reward-hack-matrix/_runner.py +0 -82
- benchflow-0.2.2/labs/reward-hack-matrix/_worker.py +0 -179
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_hook.sh +0 -87
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_payload.py +0 -33
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/path_trojan.sh +0 -56
- benchflow-0.2.2/labs/reward-hack-matrix/exploits/pth_injection.sh +0 -122
- benchflow-0.2.2/labs/reward-hack-matrix/fetch_corpora.sh +0 -53
- benchflow-0.2.2/labs/reward-hack-matrix/run_matrix.py +0 -758
- benchflow-0.2.2/labs/reward-hack-matrix/sweep_0.2.0_vs_0.2.2.json +0 -7994
- benchflow-0.2.2/src/benchflow/cli/main.py +0 -542
- benchflow-0.2.2/tests/__init__.py +0 -1
- benchflow-0.2.2/tests/conftest.py +0 -16
- benchflow-0.2.2/uv.lock +0 -3302
- {benchflow-0.2.2 → benchflow-0.3.0}/LICENSE +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_agent_setup.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_credentials.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_scoring.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_trajectory.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/client.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/transport.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/user_agent.py +0 -0
- {benchflow-0.2.2/tests/examples/hello-world-task → benchflow-0.3.0/src/benchflow/demo_task}/environment/Dockerfile +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/environments.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/models.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/py.typed +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/task_download.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/tasks.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/atif.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/claude_code.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/proxy.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/viewer.py +0 -0
- {benchflow-0.2.2/src/benchflow/cli → benchflow-0.3.0/tests}/__init__.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_atif_trajectory.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_capture_trajectory.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_env_setup.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_process.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_reexport.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sdk_lockdown.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_smoke.py +0 -0
- {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_subscription_auth.py +0 -0
|
@@ -2,6 +2,26 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
## 0.2.3 — 2026-04-15
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
|
|
9
|
+
- `benchmarks/tb2_multiturn-claude-haiku45.yaml` — shipped config for the README's TB2 multi-turn Claude result.
|
|
10
|
+
- Daytona resource clamping via `BENCHFLOW_DAYTONA_MAX_CPUS` / `MAX_MEMORY_MB`.
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
- Renamed `skillsbench-claude-glm5.yaml` → `skillsbench-claude-glm51.yaml` to match the model ID.
|
|
15
|
+
- `codex --login` correction in `docs/getting-started.md`.
|
|
16
|
+
- Restricted sdist build to `src/`, `tests/`, and metadata.
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
- Verifier sandbox hardening follow-ups across several base-image and tooling edge cases.
|
|
21
|
+
- Preserve trusted verifier path entries and workspace answer files.
|
|
22
|
+
- Redirect oracle output to container log.
|
|
23
|
+
- Align YAML path resolution to config file location.
|
|
24
|
+
|
|
5
25
|
## 0.2.2 — 2026-04-13
|
|
6
26
|
|
|
7
27
|
### Added
|
benchflow-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: benchflow
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
|
+
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
|
+
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
7
|
+
Project-URL: Issues, https://github.com/benchflow-ai/benchflow/issues
|
|
8
|
+
Project-URL: Discord, https://discord.gg/mZ9Rc8q8W3
|
|
9
|
+
Project-URL: Changelog, https://github.com/benchflow-ai/benchflow/blob/main/CHANGELOG.md
|
|
10
|
+
Author-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
|
|
11
|
+
Maintainer-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
|
|
12
|
+
License: Apache-2.0
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench,terminal-bench
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Requires-Dist: anyio>=4.0
|
|
22
|
+
Requires-Dist: harbor==0.3.0
|
|
23
|
+
Requires-Dist: httpx>=0.27.0
|
|
24
|
+
Requires-Dist: pydantic>=2.0
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Requires-Dist: rich>=13.0
|
|
27
|
+
Requires-Dist: typer>=0.9
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=9.0.3; extra == 'dev'
|
|
32
|
+
Requires-Dist: ruff>=0.7.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: ty>=0.0.1a1; extra == 'dev'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
<div align="center">
|
|
37
|
+
<h1>BenchFlow</h1>
|
|
38
|
+
<p>Multi-turn agent benchmarking — Scene-based lifecycle for any ACP agent</p>
|
|
39
|
+
<a href="https://pypi.org/project/benchflow/" target="_blank">
|
|
40
|
+
<img src="https://img.shields.io/badge/PyPI-0.3.0a3-blue?style=for-the-badge&logo=pypi" alt="PyPI">
|
|
41
|
+
</a>
|
|
42
|
+
<a href="https://discord.gg/mZ9Rc8q8W3" target="_blank">
|
|
43
|
+
<img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
|
|
44
|
+
</a>
|
|
45
|
+
</div>
|
|
46
|
+
|
|
47
|
+
## What
|
|
48
|
+
|
|
49
|
+
BenchFlow runs AI agents against benchmark tasks in sandboxed environments. It supports single-agent, multi-agent, and multi-turn evaluation patterns through a Scene-based lifecycle.
|
|
50
|
+
|
|
51
|
+
- **Any ACP agent** — Gemini CLI, Claude, Codex, OpenClaw, Pi, or your own
|
|
52
|
+
- **Multi-scene trials** — skill generation → solve, coder → reviewer → revision
|
|
53
|
+
- **Cloud sandboxes** — Daytona backend for parallel execution at scale
|
|
54
|
+
- **YAML-driven** — same task folder, different trial configs for ablation
|
|
55
|
+
|
|
56
|
+
## Install
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install benchflow==0.3.0a3
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Requires Python 3.12+. For cloud sandboxes, set `DAYTONA_API_KEY`.
|
|
63
|
+
|
|
64
|
+
## Quick Start
|
|
65
|
+
|
|
66
|
+
### CLI
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Run a single task with Gemini
|
|
70
|
+
bench eval create -t tasks/my-task -a gemini -m gemini-3.1-flash-lite-preview -e daytona
|
|
71
|
+
|
|
72
|
+
# Run from YAML config (batch, concurrent)
|
|
73
|
+
bench eval create -f benchmarks/tb2-gemini-baseline.yaml
|
|
74
|
+
|
|
75
|
+
# List agents
|
|
76
|
+
bench agent list
|
|
77
|
+
|
|
78
|
+
# Check task validity
|
|
79
|
+
bench tasks check tasks/my-task
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Python
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
import benchflow as bf
|
|
86
|
+
from benchflow.trial import TrialConfig, Scene, Role, Turn
|
|
87
|
+
|
|
88
|
+
# Simplest: one agent, one task
|
|
89
|
+
result = await bf.run("gemini", task_path="tasks/my-task", model="gemini-3.1-flash-lite-preview")
|
|
90
|
+
print(result.rewards) # {"reward": 1.0}
|
|
91
|
+
|
|
92
|
+
# Scene-based: skill-gen → solve (BYOS pattern)
|
|
93
|
+
config = TrialConfig(
|
|
94
|
+
task_path=Path("tasks/my-task"),
|
|
95
|
+
scenes=[
|
|
96
|
+
Scene(name="skill-gen",
|
|
97
|
+
roles=[Role("gen", "gemini", "gemini-3.1-flash-lite-preview")],
|
|
98
|
+
turns=[Turn("gen", "Analyze the task and write a skill to /app/generated-skill.md")]),
|
|
99
|
+
Scene(name="solve",
|
|
100
|
+
roles=[Role("solver", "gemini", "gemini-3.1-flash-lite-preview")],
|
|
101
|
+
turns=[Turn("solver")]), # None prompt = use instruction.md
|
|
102
|
+
],
|
|
103
|
+
environment="daytona",
|
|
104
|
+
)
|
|
105
|
+
result = await bf.run(config)
|
|
106
|
+
|
|
107
|
+
# Multi-agent: coder + reviewer
|
|
108
|
+
config = TrialConfig(
|
|
109
|
+
task_path=Path("tasks/my-task"),
|
|
110
|
+
scenes=[
|
|
111
|
+
Scene(name="review-loop",
|
|
112
|
+
roles=[
|
|
113
|
+
Role("coder", "gemini", "gemini-3.1-flash-lite-preview"),
|
|
114
|
+
Role("reviewer", "gemini", "gemini-3.1-flash-lite-preview"),
|
|
115
|
+
],
|
|
116
|
+
turns=[
|
|
117
|
+
Turn("coder", "Solve the task. Write to /app/.outbox/reviewer.json when done."),
|
|
118
|
+
Turn("reviewer", "Review the coder's work. Write feedback to /app/.outbox/coder.json."),
|
|
119
|
+
Turn("coder", "Read the reviewer's feedback and revise your solution."),
|
|
120
|
+
]),
|
|
121
|
+
],
|
|
122
|
+
environment="daytona",
|
|
123
|
+
)
|
|
124
|
+
result = await bf.run(config)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### YAML Trial Config
|
|
128
|
+
|
|
129
|
+
```yaml
|
|
130
|
+
# trial-baseline.yaml
|
|
131
|
+
task_dir: .ref/terminal-bench-2
|
|
132
|
+
agent: gemini
|
|
133
|
+
model: gemini-3.1-flash-lite-preview
|
|
134
|
+
environment: daytona
|
|
135
|
+
concurrency: 89
|
|
136
|
+
|
|
137
|
+
# trial-byos.yaml (same tasks, different config)
|
|
138
|
+
task_dir: .ref/terminal-bench-2
|
|
139
|
+
scenes:
|
|
140
|
+
- name: skill-gen
|
|
141
|
+
roles: [{name: gen, agent: gemini, model: gemini-3.1-flash-lite-preview}]
|
|
142
|
+
turns: [{role: gen, prompt: "Generate a skill for this task..."}]
|
|
143
|
+
- name: solve
|
|
144
|
+
roles: [{name: solver, agent: gemini, model: gemini-3.1-flash-lite-preview}]
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## CLI Reference
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
bench agent list List registered agents
|
|
151
|
+
bench agent show <name> Agent details + conformance status
|
|
152
|
+
|
|
153
|
+
bench eval create Create + run evaluation (returns job-id)
|
|
154
|
+
bench eval list List completed evaluations
|
|
155
|
+
|
|
156
|
+
bench skills eval Evaluate skill via evals.json
|
|
157
|
+
|
|
158
|
+
bench tasks init <name> Scaffold new task
|
|
159
|
+
bench tasks check <dir> Validate task (--rubric for custom)
|
|
160
|
+
|
|
161
|
+
bench train create Reward-based training sweep
|
|
162
|
+
|
|
163
|
+
bench environment create Spin up sandbox from task dir
|
|
164
|
+
bench environment list List active sandboxes
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Architecture
|
|
168
|
+
|
|
169
|
+
```
|
|
170
|
+
Trial = sequence of Scenes in a shared sandbox
|
|
171
|
+
Scene = Roles + Turns (one interaction region)
|
|
172
|
+
Role = agent + model
|
|
173
|
+
Turn = one prompt for one role
|
|
174
|
+
|
|
175
|
+
bf.run(config)
|
|
176
|
+
→ Trial.create(config)
|
|
177
|
+
→ trial.setup() # resolve config, create env object
|
|
178
|
+
→ trial.start() # spin up sandbox, upload task files
|
|
179
|
+
→ for scene in config.scenes:
|
|
180
|
+
→ trial._run_scene(scene) # connect/execute/disconnect per role
|
|
181
|
+
→ trial.verify() # run verifier, score
|
|
182
|
+
→ trial.cleanup() # stop sandbox
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Registered Agents
|
|
186
|
+
|
|
187
|
+
| Agent | Command | Auth |
|
|
188
|
+
|-------|---------|------|
|
|
189
|
+
| `gemini` | `gemini --acp --yolo` | GOOGLE_API_KEY |
|
|
190
|
+
| `claude-agent-acp` | `claude-agent-acp` | ANTHROPIC_API_KEY |
|
|
191
|
+
| `codex-acp` | `codex-acp` | OPENAI_API_KEY |
|
|
192
|
+
| `openclaw` | `openclaw-acp-shim` | inferred from model |
|
|
193
|
+
| `pi-acp` | `pi-acp` | ANTHROPIC_API_KEY |
|
|
194
|
+
|
|
195
|
+
## Adding a Custom Agent
|
|
196
|
+
|
|
197
|
+
Any ACP-native agent works. Create `agent.toml`:
|
|
198
|
+
|
|
199
|
+
```toml
|
|
200
|
+
name = "my-agent"
|
|
201
|
+
launch_cmd = "my-agent --acp"
|
|
202
|
+
install_cmd = "npm install -g my-agent"
|
|
203
|
+
requires_env = ["MY_API_KEY"]
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Development
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
uv venv -p 3.12 .venv && uv pip install -e ".[dev]"
|
|
210
|
+
.venv/bin/python -m pytest tests/ # 580+ unit tests
|
|
211
|
+
.venv/bin/ty check src/ # type check
|
|
212
|
+
```
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<h1>BenchFlow</h1>
|
|
3
|
+
<p>Multi-turn agent benchmarking — Scene-based lifecycle for any ACP agent</p>
|
|
4
|
+
<a href="https://pypi.org/project/benchflow/" target="_blank">
|
|
5
|
+
<img src="https://img.shields.io/badge/PyPI-0.3.0a3-blue?style=for-the-badge&logo=pypi" alt="PyPI">
|
|
6
|
+
</a>
|
|
7
|
+
<a href="https://discord.gg/mZ9Rc8q8W3" target="_blank">
|
|
8
|
+
<img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
|
|
9
|
+
</a>
|
|
10
|
+
</div>
|
|
11
|
+
|
|
12
|
+
## What
|
|
13
|
+
|
|
14
|
+
BenchFlow runs AI agents against benchmark tasks in sandboxed environments. It supports single-agent, multi-agent, and multi-turn evaluation patterns through a Scene-based lifecycle.
|
|
15
|
+
|
|
16
|
+
- **Any ACP agent** — Gemini CLI, Claude, Codex, OpenClaw, Pi, or your own
|
|
17
|
+
- **Multi-scene trials** — skill generation → solve, coder → reviewer → revision
|
|
18
|
+
- **Cloud sandboxes** — Daytona backend for parallel execution at scale
|
|
19
|
+
- **YAML-driven** — same task folder, different trial configs for ablation
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install benchflow==0.3.0a3
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Requires Python 3.12+. For cloud sandboxes, set `DAYTONA_API_KEY`.
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
### CLI
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
# Run a single task with Gemini
|
|
35
|
+
bench eval create -t tasks/my-task -a gemini -m gemini-3.1-flash-lite-preview -e daytona
|
|
36
|
+
|
|
37
|
+
# Run from YAML config (batch, concurrent)
|
|
38
|
+
bench eval create -f benchmarks/tb2-gemini-baseline.yaml
|
|
39
|
+
|
|
40
|
+
# List agents
|
|
41
|
+
bench agent list
|
|
42
|
+
|
|
43
|
+
# Check task validity
|
|
44
|
+
bench tasks check tasks/my-task
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Python
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import benchflow as bf
|
|
51
|
+
from benchflow.trial import TrialConfig, Scene, Role, Turn
|
|
52
|
+
|
|
53
|
+
# Simplest: one agent, one task
|
|
54
|
+
result = await bf.run("gemini", task_path="tasks/my-task", model="gemini-3.1-flash-lite-preview")
|
|
55
|
+
print(result.rewards) # {"reward": 1.0}
|
|
56
|
+
|
|
57
|
+
# Scene-based: skill-gen → solve (BYOS pattern)
|
|
58
|
+
config = TrialConfig(
|
|
59
|
+
task_path=Path("tasks/my-task"),
|
|
60
|
+
scenes=[
|
|
61
|
+
Scene(name="skill-gen",
|
|
62
|
+
roles=[Role("gen", "gemini", "gemini-3.1-flash-lite-preview")],
|
|
63
|
+
turns=[Turn("gen", "Analyze the task and write a skill to /app/generated-skill.md")]),
|
|
64
|
+
Scene(name="solve",
|
|
65
|
+
roles=[Role("solver", "gemini", "gemini-3.1-flash-lite-preview")],
|
|
66
|
+
turns=[Turn("solver")]), # None prompt = use instruction.md
|
|
67
|
+
],
|
|
68
|
+
environment="daytona",
|
|
69
|
+
)
|
|
70
|
+
result = await bf.run(config)
|
|
71
|
+
|
|
72
|
+
# Multi-agent: coder + reviewer
|
|
73
|
+
config = TrialConfig(
|
|
74
|
+
task_path=Path("tasks/my-task"),
|
|
75
|
+
scenes=[
|
|
76
|
+
Scene(name="review-loop",
|
|
77
|
+
roles=[
|
|
78
|
+
Role("coder", "gemini", "gemini-3.1-flash-lite-preview"),
|
|
79
|
+
Role("reviewer", "gemini", "gemini-3.1-flash-lite-preview"),
|
|
80
|
+
],
|
|
81
|
+
turns=[
|
|
82
|
+
Turn("coder", "Solve the task. Write to /app/.outbox/reviewer.json when done."),
|
|
83
|
+
Turn("reviewer", "Review the coder's work. Write feedback to /app/.outbox/coder.json."),
|
|
84
|
+
Turn("coder", "Read the reviewer's feedback and revise your solution."),
|
|
85
|
+
]),
|
|
86
|
+
],
|
|
87
|
+
environment="daytona",
|
|
88
|
+
)
|
|
89
|
+
result = await bf.run(config)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### YAML Trial Config
|
|
93
|
+
|
|
94
|
+
```yaml
|
|
95
|
+
# trial-baseline.yaml
|
|
96
|
+
task_dir: .ref/terminal-bench-2
|
|
97
|
+
agent: gemini
|
|
98
|
+
model: gemini-3.1-flash-lite-preview
|
|
99
|
+
environment: daytona
|
|
100
|
+
concurrency: 89
|
|
101
|
+
|
|
102
|
+
# trial-byos.yaml (same tasks, different config)
|
|
103
|
+
task_dir: .ref/terminal-bench-2
|
|
104
|
+
scenes:
|
|
105
|
+
- name: skill-gen
|
|
106
|
+
roles: [{name: gen, agent: gemini, model: gemini-3.1-flash-lite-preview}]
|
|
107
|
+
turns: [{role: gen, prompt: "Generate a skill for this task..."}]
|
|
108
|
+
- name: solve
|
|
109
|
+
roles: [{name: solver, agent: gemini, model: gemini-3.1-flash-lite-preview}]
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## CLI Reference
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
bench agent list List registered agents
|
|
116
|
+
bench agent show <name> Agent details + conformance status
|
|
117
|
+
|
|
118
|
+
bench eval create Create + run evaluation (returns job-id)
|
|
119
|
+
bench eval list List completed evaluations
|
|
120
|
+
|
|
121
|
+
bench skills eval Evaluate skill via evals.json
|
|
122
|
+
|
|
123
|
+
bench tasks init <name> Scaffold new task
|
|
124
|
+
bench tasks check <dir> Validate task (--rubric for custom)
|
|
125
|
+
|
|
126
|
+
bench train create Reward-based training sweep
|
|
127
|
+
|
|
128
|
+
bench environment create Spin up sandbox from task dir
|
|
129
|
+
bench environment list List active sandboxes
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Architecture
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
Trial = sequence of Scenes in a shared sandbox
|
|
136
|
+
Scene = Roles + Turns (one interaction region)
|
|
137
|
+
Role = agent + model
|
|
138
|
+
Turn = one prompt for one role
|
|
139
|
+
|
|
140
|
+
bf.run(config)
|
|
141
|
+
→ Trial.create(config)
|
|
142
|
+
→ trial.setup() # resolve config, create env object
|
|
143
|
+
→ trial.start() # spin up sandbox, upload task files
|
|
144
|
+
→ for scene in config.scenes:
|
|
145
|
+
→ trial._run_scene(scene) # connect/execute/disconnect per role
|
|
146
|
+
→ trial.verify() # run verifier, score
|
|
147
|
+
→ trial.cleanup() # stop sandbox
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Registered Agents
|
|
151
|
+
|
|
152
|
+
| Agent | Command | Auth |
|
|
153
|
+
|-------|---------|------|
|
|
154
|
+
| `gemini` | `gemini --acp --yolo` | GOOGLE_API_KEY |
|
|
155
|
+
| `claude-agent-acp` | `claude-agent-acp` | ANTHROPIC_API_KEY |
|
|
156
|
+
| `codex-acp` | `codex-acp` | OPENAI_API_KEY |
|
|
157
|
+
| `openclaw` | `openclaw-acp-shim` | inferred from model |
|
|
158
|
+
| `pi-acp` | `pi-acp` | ANTHROPIC_API_KEY |
|
|
159
|
+
|
|
160
|
+
## Adding a Custom Agent
|
|
161
|
+
|
|
162
|
+
Any ACP-native agent works. Create `agent.toml`:
|
|
163
|
+
|
|
164
|
+
```toml
|
|
165
|
+
name = "my-agent"
|
|
166
|
+
launch_cmd = "my-agent --acp"
|
|
167
|
+
install_cmd = "npm install -g my-agent"
|
|
168
|
+
requires_env = ["MY_API_KEY"]
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Development
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
uv venv -p 3.12 .venv && uv pip install -e ".[dev]"
|
|
175
|
+
.venv/bin/python -m pytest tests/ # 580+ unit tests
|
|
176
|
+
.venv/bin/ty check src/ # type check
|
|
177
|
+
```
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "benchflow"
|
|
3
|
-
version = "0.2.2"
|
|
3
|
+
version = "0.3.0"
|
|
4
4
|
description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -37,7 +37,7 @@ classifiers = [
|
|
|
37
37
|
[project.optional-dependencies]
|
|
38
38
|
dev = [
|
|
39
39
|
"pre-commit>=3.7",
|
|
40
|
-
"pytest>=
|
|
40
|
+
"pytest>=9.0.3",
|
|
41
41
|
"pytest-asyncio>=0.24.0",
|
|
42
42
|
"ruff>=0.7.0",
|
|
43
43
|
"ty>=0.0.1a1",
|
|
@@ -45,6 +45,7 @@ dev = [
|
|
|
45
45
|
|
|
46
46
|
[project.scripts]
|
|
47
47
|
benchflow = "benchflow.cli.main:app"
|
|
48
|
+
bench = "benchflow.cli.main:app"
|
|
48
49
|
|
|
49
50
|
[project.urls]
|
|
50
51
|
Homepage = "https://github.com/benchflow-ai/benchflow"
|
|
@@ -58,20 +59,20 @@ requires = ["hatchling"]
|
|
|
58
59
|
build-backend = "hatchling.build"
|
|
59
60
|
|
|
60
61
|
[tool.hatch.build.targets.sdist]
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
"
|
|
64
|
-
"
|
|
65
|
-
"
|
|
66
|
-
".
|
|
67
|
-
"
|
|
68
|
-
".
|
|
69
|
-
"__pycache__",
|
|
62
|
+
# Allowlist: only ship what the installed package needs.
|
|
63
|
+
only-include = [
|
|
64
|
+
"src",
|
|
65
|
+
"tests",
|
|
66
|
+
"README.md",
|
|
67
|
+
"CHANGELOG.md",
|
|
68
|
+
"LICENSE",
|
|
69
|
+
"pyproject.toml",
|
|
70
70
|
]
|
|
71
71
|
|
|
72
72
|
[tool.pytest.ini_options]
|
|
73
73
|
asyncio_mode = "auto"
|
|
74
74
|
addopts = "-m 'not live'"
|
|
75
|
+
testpaths = ["tests"]
|
|
75
76
|
markers = [
|
|
76
77
|
"live: requires real Anthropic API and Docker daemon (run with -m live)",
|
|
77
78
|
]
|
|
@@ -45,7 +45,20 @@ from benchflow.environments import (
|
|
|
45
45
|
from benchflow.job import Job, JobConfig, JobResult, RetryConfig
|
|
46
46
|
from benchflow.metrics import BenchmarkMetrics, collect_metrics
|
|
47
47
|
from benchflow.models import AgentInstallError, AgentTimeoutError, RunResult
|
|
48
|
+
from benchflow.runtime import (
|
|
49
|
+
Agent,
|
|
50
|
+
Environment,
|
|
51
|
+
Runtime,
|
|
52
|
+
RuntimeConfig,
|
|
53
|
+
RuntimeResult,
|
|
54
|
+
run, # bf.run(agent, env) — the primary 0.3 API
|
|
55
|
+
)
|
|
56
|
+
from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene
|
|
57
|
+
from benchflow._snapshot import list_snapshots, restore, snapshot
|
|
48
58
|
from benchflow.sdk import SDK
|
|
59
|
+
from benchflow.trial import Trial, TrialConfig
|
|
60
|
+
from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn
|
|
61
|
+
from benchflow.trial_yaml import trial_config_from_yaml
|
|
49
62
|
from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
|
|
50
63
|
from benchflow.trajectories.otel import OTelCollector
|
|
51
64
|
from benchflow.trajectories.proxy import TrajectoryProxy
|
|
@@ -63,7 +76,6 @@ __all__ = [
|
|
|
63
76
|
"ExecResult",
|
|
64
77
|
"Task",
|
|
65
78
|
"TaskConfig",
|
|
66
|
-
"Trial",
|
|
67
79
|
"Verifier",
|
|
68
80
|
"VerifierResult",
|
|
69
81
|
# ACP
|
|
@@ -88,7 +100,30 @@ __all__ = [
|
|
|
88
100
|
"AgentInstallError",
|
|
89
101
|
"AgentTimeoutError",
|
|
90
102
|
"RunResult",
|
|
91
|
-
#
|
|
103
|
+
# Runtime (0.3 primary API)
|
|
104
|
+
"Agent",
|
|
105
|
+
"Environment",
|
|
106
|
+
"Runtime",
|
|
107
|
+
"RuntimeConfig",
|
|
108
|
+
"RuntimeResult",
|
|
109
|
+
"run",
|
|
110
|
+
# Multi-agent scene
|
|
111
|
+
"Scene",
|
|
112
|
+
"Role",
|
|
113
|
+
"Message",
|
|
114
|
+
"MessageTransport",
|
|
115
|
+
"MailboxTransport",
|
|
116
|
+
# Env snapshots
|
|
117
|
+
"snapshot",
|
|
118
|
+
"restore",
|
|
119
|
+
"list_snapshots",
|
|
120
|
+
# Trial (decomposed lifecycle)
|
|
121
|
+
"Trial",
|
|
122
|
+
"TrialConfig",
|
|
123
|
+
"TrialRole",
|
|
124
|
+
"TrialScene",
|
|
125
|
+
"Turn",
|
|
126
|
+
# SDK (backwards compat)
|
|
92
127
|
"SDK",
|
|
93
128
|
# Environments / dep staging
|
|
94
129
|
"SERVICES",
|
|
@@ -25,11 +25,16 @@ from benchflow._sandbox import build_priv_drop_cmd
|
|
|
25
25
|
from benchflow._trajectory import _capture_session_trajectory
|
|
26
26
|
from benchflow.acp.client import ACPClient
|
|
27
27
|
from benchflow.acp.container_transport import ContainerTransport
|
|
28
|
+
from benchflow.agents.providers import strip_provider_prefix
|
|
28
29
|
from benchflow.process import DaytonaProcess, DockerProcess
|
|
29
30
|
|
|
30
31
|
logger = logging.getLogger(__name__)
|
|
31
32
|
|
|
32
33
|
|
|
34
|
+
_ACP_CONNECT_MAX_RETRIES = 3
|
|
35
|
+
_ACP_CONNECT_BASE_DELAY = 2.0
|
|
36
|
+
|
|
37
|
+
|
|
33
38
|
async def connect_acp(
|
|
34
39
|
env,
|
|
35
40
|
agent: str,
|
|
@@ -41,7 +46,10 @@ async def connect_acp(
|
|
|
41
46
|
environment: str,
|
|
42
47
|
agent_cwd: str,
|
|
43
48
|
) -> tuple[ACPClient, object, str]:
|
|
44
|
-
"""Create ACP transport, connect, init session, set model. Return (client, session, agent_name)."""
|
|
49
|
+
"""Create ACP transport, connect, init session, set model. Return (client, session, agent_name).
|
|
50
|
+
|
|
51
|
+
Retries with exponential backoff on ConnectionError (Daytona SSH storms).
|
|
52
|
+
"""
|
|
45
53
|
# Resolve agent binary path for non-docker environments
|
|
46
54
|
if environment != "docker":
|
|
47
55
|
which_result = await env.exec(
|
|
@@ -58,32 +66,61 @@ async def connect_acp(
|
|
|
58
66
|
agent_launch = build_priv_drop_cmd(agent_launch, sandbox_user)
|
|
59
67
|
logger.info(f"Agent sandboxed as: {sandbox_user}")
|
|
60
68
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
container_process=live_proc,
|
|
69
|
-
command=agent_launch,
|
|
70
|
-
env=agent_env,
|
|
71
|
-
cwd=agent_cwd,
|
|
72
|
-
agent_log_path=agent_log,
|
|
73
|
-
)
|
|
74
|
-
acp_client = ACPClient(transport)
|
|
75
|
-
await acp_client.connect()
|
|
76
|
-
|
|
77
|
-
init_result = await asyncio.wait_for(acp_client.initialize(), timeout=60)
|
|
78
|
-
agent_name = init_result.agent_info.name if init_result.agent_info else agent
|
|
79
|
-
logger.info(f"ACP agent: {agent_name}")
|
|
80
|
-
|
|
81
|
-
session = await asyncio.wait_for(acp_client.session_new(cwd=agent_cwd), timeout=60)
|
|
82
|
-
logger.info(f"Session: {session.session_id}")
|
|
69
|
+
last_err: Exception | None = None
|
|
70
|
+
acp_client: ACPClient | None = None
|
|
71
|
+
for attempt in range(_ACP_CONNECT_MAX_RETRIES + 1):
|
|
72
|
+
if attempt > 0:
|
|
73
|
+
delay = _ACP_CONNECT_BASE_DELAY * (2 ** (attempt - 1))
|
|
74
|
+
logger.info(f"ACP connect retry {attempt}/{_ACP_CONNECT_MAX_RETRIES} after {delay:.0f}s")
|
|
75
|
+
await asyncio.sleep(delay)
|
|
83
76
|
|
|
84
|
-
|
|
85
|
-
|
|
77
|
+
try:
|
|
78
|
+
if environment == "docker":
|
|
79
|
+
live_proc = DockerProcess.from_harbor_env(env)
|
|
80
|
+
else:
|
|
81
|
+
live_proc = await DaytonaProcess.from_harbor_env(env)
|
|
82
|
+
|
|
83
|
+
agent_log = trial_dir / "agent" / f"{agent.replace('-', '_')}.txt"
|
|
84
|
+
transport = ContainerTransport(
|
|
85
|
+
container_process=live_proc,
|
|
86
|
+
command=agent_launch,
|
|
87
|
+
env=agent_env,
|
|
88
|
+
cwd=agent_cwd,
|
|
89
|
+
agent_log_path=agent_log,
|
|
90
|
+
)
|
|
91
|
+
acp_client = ACPClient(transport)
|
|
92
|
+
await acp_client.connect()
|
|
93
|
+
|
|
94
|
+
init_result = await asyncio.wait_for(acp_client.initialize(), timeout=60)
|
|
95
|
+
agent_name = init_result.agent_info.name if init_result.agent_info else agent
|
|
96
|
+
logger.info(f"ACP agent: {agent_name}")
|
|
97
|
+
|
|
98
|
+
session = await asyncio.wait_for(acp_client.session_new(cwd=agent_cwd), timeout=60)
|
|
99
|
+
logger.info(f"Session: {session.session_id}")
|
|
100
|
+
break
|
|
101
|
+
except ConnectionError as e:
|
|
102
|
+
# Close the failed client before retrying
|
|
103
|
+
if acp_client:
|
|
104
|
+
try:
|
|
105
|
+
await acp_client.close()
|
|
106
|
+
except Exception:
|
|
107
|
+
pass
|
|
108
|
+
acp_client = None
|
|
109
|
+
last_err = e
|
|
110
|
+
if attempt == _ACP_CONNECT_MAX_RETRIES:
|
|
111
|
+
raise
|
|
112
|
+
logger.warning(f"ACP connect failed (attempt {attempt + 1}): {e}")
|
|
113
|
+
continue
|
|
114
|
+
except Exception:
|
|
115
|
+
# Non-retryable error — close client to prevent leak
|
|
116
|
+
if acp_client:
|
|
117
|
+
try:
|
|
118
|
+
await acp_client.close()
|
|
119
|
+
except Exception:
|
|
120
|
+
pass
|
|
121
|
+
raise
|
|
86
122
|
|
|
123
|
+
if model:
|
|
87
124
|
acp_model_id = strip_provider_prefix(model)
|
|
88
125
|
try:
|
|
89
126
|
await asyncio.wait_for(acp_client.set_model(acp_model_id), timeout=60)
|
|
@@ -102,7 +139,7 @@ async def execute_prompts(
|
|
|
102
139
|
) -> tuple[list[dict], int]:
|
|
103
140
|
"""Send prompts via ACP and capture trajectory. Return (trajectory, n_tool_calls)."""
|
|
104
141
|
for i, prompt in enumerate(prompts):
|
|
105
|
-
logger.info(f"Prompt {i + 1}/{len(prompts)}: {prompt[:80]}...")
|
|
142
|
+
logger.info(f"Prompt {i + 1}/{len(prompts)}: {(prompt or '<instruction.md>')[:80]}...")
|
|
106
143
|
prompt_result = await asyncio.wait_for(
|
|
107
144
|
acp_client.prompt(prompt),
|
|
108
145
|
timeout=timeout,
|