benchflow 0.2.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. {benchflow-0.2.2 → benchflow-0.3.0}/.gitignore +2 -0
  2. {benchflow-0.2.2 → benchflow-0.3.0}/CHANGELOG.md +20 -0
  3. benchflow-0.3.0/PKG-INFO +212 -0
  4. benchflow-0.3.0/README.md +177 -0
  5. {benchflow-0.2.2 → benchflow-0.3.0}/pyproject.toml +12 -11
  6. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/__init__.py +37 -2
  7. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_acp_run.py +63 -26
  8. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_agent_env.py +9 -1
  9. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_env_setup.py +24 -1
  10. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_sandbox.py +337 -43
  11. benchflow-0.3.0/src/benchflow/_scene.py +289 -0
  12. benchflow-0.3.0/src/benchflow/_snapshot.py +75 -0
  13. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/openclaw_acp_shim.py +28 -13
  14. benchflow-0.3.0/src/benchflow/agents/pi_acp_launcher.py +131 -0
  15. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/providers.py +11 -7
  16. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/registry.py +125 -17
  17. benchflow-0.3.0/src/benchflow/cli/__init__.py +1 -0
  18. benchflow-0.3.0/src/benchflow/cli/eval.py +371 -0
  19. benchflow-0.3.0/src/benchflow/cli/main.py +1039 -0
  20. benchflow-0.3.0/src/benchflow/demo_task/instruction.md +7 -0
  21. benchflow-0.3.0/src/benchflow/demo_task/task.toml +17 -0
  22. benchflow-0.3.0/src/benchflow/demo_task/tests/test.sh +20 -0
  23. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/job.py +88 -38
  24. benchflow-0.3.0/src/benchflow/mcp/__init__.py +5 -0
  25. benchflow-0.3.0/src/benchflow/mcp/hooks.py +74 -0
  26. benchflow-0.3.0/src/benchflow/mcp/reviewer_server.py +143 -0
  27. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/metrics.py +1 -2
  28. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/process.py +0 -6
  29. benchflow-0.3.0/src/benchflow/runtime.py +352 -0
  30. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/sdk.py +89 -248
  31. benchflow-0.3.0/src/benchflow/skill_eval.py +696 -0
  32. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/skills.py +0 -3
  33. benchflow-0.3.0/src/benchflow/templates/__init__.py +5 -0
  34. benchflow-0.3.0/src/benchflow/templates/judge.py.tmpl +193 -0
  35. benchflow-0.3.0/src/benchflow/templates/test.sh.tmpl +12 -0
  36. benchflow-0.3.0/src/benchflow/trial.py +690 -0
  37. benchflow-0.3.0/src/benchflow/trial_yaml.py +169 -0
  38. benchflow-0.3.0/tests/conformance/README.md +21 -0
  39. benchflow-0.3.0/tests/conformance/acp_smoke/environment/Dockerfile +7 -0
  40. benchflow-0.3.0/tests/conformance/acp_smoke/instruction.md +7 -0
  41. benchflow-0.3.0/tests/conformance/acp_smoke/solution/solve.sh +4 -0
  42. {benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection → benchflow-0.3.0/tests/conformance/acp_smoke}/task.toml +4 -4
  43. benchflow-0.3.0/tests/conformance/acp_smoke/tests/test.sh +13 -0
  44. benchflow-0.3.0/tests/conformance/conformance-results.json +40 -0
  45. benchflow-0.3.0/tests/conformance/proof_multi_agent.py +165 -0
  46. benchflow-0.3.0/tests/conformance/proof_snapshot.py +86 -0
  47. benchflow-0.3.0/tests/conformance/run_conformance.py +129 -0
  48. benchflow-0.3.0/tests/conftest.py +78 -0
  49. benchflow-0.3.0/tests/examples/hello-world-task/environment/Dockerfile +7 -0
  50. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_acp.py +118 -2
  51. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_agent_model_decouple.py +17 -28
  52. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_agent_registry.py +3 -3
  53. benchflow-0.3.0/tests/test_agent_spec.py +80 -0
  54. benchflow-0.3.0/tests/test_eval_cli.py +118 -0
  55. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_job.py +13 -40
  56. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_metrics.py +4 -1
  57. benchflow-0.3.0/tests/test_oracle.py +63 -0
  58. benchflow-0.3.0/tests/test_pi_acp_launcher.py +301 -0
  59. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_providers.py +51 -52
  60. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_registry_invariants.py +1 -1
  61. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_resolve_env_helpers.py +11 -16
  62. benchflow-0.3.0/tests/test_rewards_jsonl.py +108 -0
  63. benchflow-0.3.0/tests/test_runtime.py +154 -0
  64. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sandbox.py +0 -9
  65. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sandbox_hardening.py +379 -93
  66. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sandbox_verifier_workspace.py +14 -8
  67. benchflow-0.3.0/tests/test_scene.py +202 -0
  68. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_scoring.py +24 -28
  69. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sdk_internals.py +18 -12
  70. benchflow-0.3.0/tests/test_skill_eval.py +404 -0
  71. benchflow-0.3.0/tests/test_skill_eval_dryrun.py +271 -0
  72. benchflow-0.3.0/tests/test_skill_eval_integration.py +338 -0
  73. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_skills.py +0 -12
  74. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_tasks.py +0 -5
  75. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_verify.py +16 -82
  76. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_yaml_config.py +68 -5
  77. benchflow-0.2.2/.devcontainer/Dockerfile +0 -88
  78. benchflow-0.2.2/.devcontainer/devcontainer.json +0 -37
  79. benchflow-0.2.2/.env.sample +0 -27
  80. benchflow-0.2.2/.git +0 -1
  81. benchflow-0.2.2/.github/workflows/test.yml +0 -38
  82. benchflow-0.2.2/.pre-commit-config.yaml +0 -22
  83. benchflow-0.2.2/.python-version +0 -2
  84. benchflow-0.2.2/CLAUDE.md +0 -31
  85. benchflow-0.2.2/PKG-INFO +0 -215
  86. benchflow-0.2.2/README.md +0 -180
  87. benchflow-0.2.2/benchmarks/run_skillsbench.py +0 -25
  88. benchflow-0.2.2/benchmarks/run_tb2.py +0 -30
  89. benchflow-0.2.2/benchmarks/skillsbench-claude-glm5.yaml +0 -10
  90. benchflow-0.2.2/benchmarks/skillsbench-codex-gpt54.yaml +0 -10
  91. benchflow-0.2.2/benchmarks/tb2_multiturn-codex-gpt54.yaml +0 -10
  92. benchflow-0.2.2/benchmarks/tb2_single-codex-gpt54.yaml +0 -7
  93. benchflow-0.2.2/docs/architecture.md +0 -265
  94. benchflow-0.2.2/docs/cli-reference.md +0 -283
  95. benchflow-0.2.2/docs/getting-started.md +0 -295
  96. benchflow-0.2.2/docs/labs.md +0 -88
  97. benchflow-0.2.2/docs/task-authoring.md +0 -219
  98. benchflow-0.2.2/labs/benchjack-sandbox-hardening/README.md +0 -153
  99. benchflow-0.2.2/labs/benchjack-sandbox-hardening/_attack_runner.py +0 -74
  100. benchflow-0.2.2/labs/benchjack-sandbox-hardening/comparison.ipynb +0 -172
  101. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/Dockerfile +0 -9
  102. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/environment/conftest_exploit.py +0 -18
  103. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/instruction.md +0 -1
  104. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/solution/solve.sh +0 -26
  105. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/task.toml +0 -17
  106. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test.sh +0 -19
  107. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern1_conftest_hook/tests/test_outputs.py +0 -13
  108. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/environment/Dockerfile +0 -3
  109. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/instruction.md +0 -3
  110. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/answer.txt +0 -1
  111. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/solution/solve.sh +0 -15
  112. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/task.toml +0 -17
  113. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/ground_truth.txt +0 -1
  114. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern2_answer_lookup/tests/test.sh +0 -10
  115. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/environment/Dockerfile +0 -4
  116. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/instruction.md +0 -1
  117. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/solution/solve.sh +0 -28
  118. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test.sh +0 -10
  119. benchflow-0.2.2/labs/benchjack-sandbox-hardening/pattern7_pth_injection/tests/test_outputs.py +0 -12
  120. benchflow-0.2.2/labs/benchjack-sandbox-hardening/run_comparison.py +0 -201
  121. benchflow-0.2.2/labs/reward-hack-matrix/.gitignore +0 -6
  122. benchflow-0.2.2/labs/reward-hack-matrix/README.md +0 -119
  123. benchflow-0.2.2/labs/reward-hack-matrix/_runner.py +0 -82
  124. benchflow-0.2.2/labs/reward-hack-matrix/_worker.py +0 -179
  125. benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_hook.sh +0 -87
  126. benchflow-0.2.2/labs/reward-hack-matrix/exploits/conftest_payload.py +0 -33
  127. benchflow-0.2.2/labs/reward-hack-matrix/exploits/path_trojan.sh +0 -56
  128. benchflow-0.2.2/labs/reward-hack-matrix/exploits/pth_injection.sh +0 -122
  129. benchflow-0.2.2/labs/reward-hack-matrix/fetch_corpora.sh +0 -53
  130. benchflow-0.2.2/labs/reward-hack-matrix/run_matrix.py +0 -758
  131. benchflow-0.2.2/labs/reward-hack-matrix/sweep_0.2.0_vs_0.2.2.json +0 -7994
  132. benchflow-0.2.2/src/benchflow/cli/main.py +0 -542
  133. benchflow-0.2.2/tests/__init__.py +0 -1
  134. benchflow-0.2.2/tests/conftest.py +0 -16
  135. benchflow-0.2.2/uv.lock +0 -3302
  136. {benchflow-0.2.2 → benchflow-0.3.0}/LICENSE +0 -0
  137. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_agent_setup.py +0 -0
  138. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_credentials.py +0 -0
  139. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_scoring.py +0 -0
  140. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/_trajectory.py +0 -0
  141. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/__init__.py +0 -0
  142. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/client.py +0 -0
  143. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/container_transport.py +0 -0
  144. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/session.py +0 -0
  145. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/transport.py +0 -0
  146. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/acp/types.py +0 -0
  147. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/__init__.py +0 -0
  148. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/agents/user_agent.py +0 -0
  149. {benchflow-0.2.2/tests/examples/hello-world-task → benchflow-0.3.0/src/benchflow/demo_task}/environment/Dockerfile +0 -0
  150. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/environments.py +0 -0
  151. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/models.py +0 -0
  152. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/py.typed +0 -0
  153. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/task_download.py +0 -0
  154. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/tasks.py +0 -0
  155. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/__init__.py +0 -0
  156. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/atif.py +0 -0
  157. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/claude_code.py +0 -0
  158. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/otel.py +0 -0
  159. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/proxy.py +0 -0
  160. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/trajectories/types.py +0 -0
  161. {benchflow-0.2.2 → benchflow-0.3.0}/src/benchflow/viewer.py +0 -0
  162. {benchflow-0.2.2/src/benchflow/cli → benchflow-0.3.0/tests}/__init__.py +0 -0
  163. {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/hello-world-task/instruction.md +0 -0
  164. {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/hello-world-task/solution/solve.sh +0 -0
  165. {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/hello-world-task/task.toml +0 -0
  166. {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/hello-world-task/tests/test.sh +0 -0
  167. {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/test_claude.sh +0 -0
  168. {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/test_codex.sh +0 -0
  169. {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/test_gemini.sh +0 -0
  170. {benchflow-0.2.2 → benchflow-0.3.0}/tests/examples/test_openclaw.sh +0 -0
  171. {benchflow-0.2.2 → benchflow-0.3.0}/tests/fixtures/mock_acp_agent.py +0 -0
  172. {benchflow-0.2.2 → benchflow-0.3.0}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
  173. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_atif_trajectory.py +0 -0
  174. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_capture_trajectory.py +0 -0
  175. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_env_setup.py +0 -0
  176. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_exclude_tasks.py +0 -0
  177. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_process.py +0 -0
  178. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_reexport.py +0 -0
  179. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_sdk_lockdown.py +0 -0
  180. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_smoke.py +0 -0
  181. {benchflow-0.2.2 → benchflow-0.3.0}/tests/test_subscription_auth.py +0 -0
@@ -181,3 +181,5 @@ dogfood/
181
181
  tmp/
182
182
  .claude/settings.local.json
183
183
  tests/.smoke-jobs/
184
+ context/
185
+ tutorials/
@@ -2,6 +2,26 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## 0.2.3 — 2026-04-15
6
+
7
+ ### Added
8
+
9
+ - `benchmarks/tb2_multiturn-claude-haiku45.yaml` — shipped config for the README's TB2 multi-turn Claude result.
10
+ - Daytona resource clamping via `BENCHFLOW_DAYTONA_MAX_CPUS` / `MAX_MEMORY_MB`.
11
+
12
+ ### Changed
13
+
14
+ - Renamed `skillsbench-claude-glm5.yaml` → `skillsbench-claude-glm51.yaml` to match the model ID.
15
+ - `codex --login` correction in `docs/getting-started.md`.
16
+ - Restricted sdist build to `src/`, `tests/`, and metadata.
17
+
18
+ ### Fixed
19
+
20
+ - Verifier sandbox hardening follow-ups across several base-image and tooling edge cases.
21
+ - Preserve trusted verifier path entries and workspace answer files.
22
+ - Redirect oracle output to container log.
23
+ - Align YAML path resolution to config file location.
24
+
5
25
  ## 0.2.2 — 2026-04-13
6
26
 
7
27
  ### Added
@@ -0,0 +1,212 @@
1
+ Metadata-Version: 2.4
2
+ Name: benchflow
3
+ Version: 0.3.0
4
+ Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
5
+ Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
6
+ Project-URL: Repository, https://github.com/benchflow-ai/benchflow
7
+ Project-URL: Issues, https://github.com/benchflow-ai/benchflow/issues
8
+ Project-URL: Discord, https://discord.gg/mZ9Rc8q8W3
9
+ Project-URL: Changelog, https://github.com/benchflow-ai/benchflow/blob/main/CHANGELOG.md
10
+ Author-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
11
+ Maintainer-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
12
+ License: Apache-2.0
13
+ License-File: LICENSE
14
+ Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench,terminal-bench
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Python: >=3.12
21
+ Requires-Dist: anyio>=4.0
22
+ Requires-Dist: harbor==0.3.0
23
+ Requires-Dist: httpx>=0.27.0
24
+ Requires-Dist: pydantic>=2.0
25
+ Requires-Dist: pyyaml>=6.0
26
+ Requires-Dist: rich>=13.0
27
+ Requires-Dist: typer>=0.9
28
+ Provides-Extra: dev
29
+ Requires-Dist: pre-commit>=3.7; extra == 'dev'
30
+ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
31
+ Requires-Dist: pytest>=9.0.3; extra == 'dev'
32
+ Requires-Dist: ruff>=0.7.0; extra == 'dev'
33
+ Requires-Dist: ty>=0.0.1a1; extra == 'dev'
34
+ Description-Content-Type: text/markdown
35
+
36
+ <div align="center">
37
+ <h1>BenchFlow</h1>
38
+ <p>Multi-turn agent benchmarking — Scene-based lifecycle for any ACP agent</p>
39
+ <a href="https://pypi.org/project/benchflow/" target="_blank">
40
+ <img src="https://img.shields.io/badge/PyPI-0.3.0a3-blue?style=for-the-badge&logo=pypi" alt="PyPI">
41
+ </a>
42
+ <a href="https://discord.gg/mZ9Rc8q8W3" target="_blank">
43
+ <img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
44
+ </a>
45
+ </div>
46
+
47
+ ## What
48
+
49
+ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. It supports single-agent, multi-agent, and multi-turn evaluation patterns through a Scene-based lifecycle.
50
+
51
+ - **Any ACP agent** — Gemini CLI, Claude, Codex, OpenClaw, Pi, or your own
52
+ - **Multi-scene trials** — skill generation → solve, coder → reviewer → revision
53
+ - **Cloud sandboxes** — Daytona backend for parallel execution at scale
54
+ - **YAML-driven** — same task folder, different trial configs for ablation
55
+
56
+ ## Install
57
+
58
+ ```bash
59
+ pip install benchflow==0.3.0
60
+ ```
61
+
62
+ Requires Python 3.12+. For cloud sandboxes, set `DAYTONA_API_KEY`.
63
+
64
+ ## Quick Start
65
+
66
+ ### CLI
67
+
68
+ ```bash
69
+ # Run a single task with Gemini
70
+ bench eval create -t tasks/my-task -a gemini -m gemini-3.1-flash-lite-preview -e daytona
71
+
72
+ # Run from YAML config (batch, concurrent)
73
+ bench eval create -f benchmarks/tb2-gemini-baseline.yaml
74
+
75
+ # List agents
76
+ bench agent list
77
+
78
+ # Check task validity
79
+ bench tasks check tasks/my-task
80
+ ```
81
+
82
+ ### Python
83
+
84
+ ```python
85
+ import benchflow as bf
86
+ from benchflow.trial import TrialConfig, Scene, Role, Turn
87
+
88
+ # Simplest: one agent, one task
89
+ result = await bf.run("gemini", task_path="tasks/my-task", model="gemini-3.1-flash-lite-preview")
90
+ print(result.rewards) # {"reward": 1.0}
91
+
92
+ # Scene-based: skill-gen → solve (BYOS pattern)
93
+ config = TrialConfig(
94
+ task_path=Path("tasks/my-task"),
95
+ scenes=[
96
+ Scene(name="skill-gen",
97
+ roles=[Role("gen", "gemini", "gemini-3.1-flash-lite-preview")],
98
+ turns=[Turn("gen", "Analyze the task and write a skill to /app/generated-skill.md")]),
99
+ Scene(name="solve",
100
+ roles=[Role("solver", "gemini", "gemini-3.1-flash-lite-preview")],
101
+ turns=[Turn("solver")]), # None prompt = use instruction.md
102
+ ],
103
+ environment="daytona",
104
+ )
105
+ result = await bf.run(config)
106
+
107
+ # Multi-agent: coder + reviewer
108
+ config = TrialConfig(
109
+ task_path=Path("tasks/my-task"),
110
+ scenes=[
111
+ Scene(name="review-loop",
112
+ roles=[
113
+ Role("coder", "gemini", "gemini-3.1-flash-lite-preview"),
114
+ Role("reviewer", "gemini", "gemini-3.1-flash-lite-preview"),
115
+ ],
116
+ turns=[
117
+ Turn("coder", "Solve the task. Write to /app/.outbox/reviewer.json when done."),
118
+ Turn("reviewer", "Review the coder's work. Write feedback to /app/.outbox/coder.json."),
119
+ Turn("coder", "Read the reviewer's feedback and revise your solution."),
120
+ ]),
121
+ ],
122
+ environment="daytona",
123
+ )
124
+ result = await bf.run(config)
125
+ ```
126
+
127
+ ### YAML Trial Config
128
+
129
+ ```yaml
130
+ # trial-baseline.yaml
131
+ task_dir: .ref/terminal-bench-2
132
+ agent: gemini
133
+ model: gemini-3.1-flash-lite-preview
134
+ environment: daytona
135
+ concurrency: 89
136
+
137
+ # trial-byos.yaml (same tasks, different config)
138
+ task_dir: .ref/terminal-bench-2
139
+ scenes:
140
+ - name: skill-gen
141
+ roles: [{name: gen, agent: gemini, model: gemini-3.1-flash-lite-preview}]
142
+ turns: [{role: gen, prompt: "Generate a skill for this task..."}]
143
+ - name: solve
144
+ roles: [{name: solver, agent: gemini, model: gemini-3.1-flash-lite-preview}]
145
+ ```
146
+
147
+ ## CLI Reference
148
+
149
+ ```
150
+ bench agent list List registered agents
151
+ bench agent show <name> Agent details + conformance status
152
+
153
+ bench eval create Create + run evaluation (returns job-id)
154
+ bench eval list List completed evaluations
155
+
156
+ bench skills eval Evaluate skill via evals.json
157
+
158
+ bench tasks init <name> Scaffold new task
159
+ bench tasks check <dir> Validate task (--rubric for custom)
160
+
161
+ bench train create Reward-based training sweep
162
+
163
+ bench environment create Spin up sandbox from task dir
164
+ bench environment list List active sandboxes
165
+ ```
166
+
167
+ ## Architecture
168
+
169
+ ```
170
+ Trial = sequence of Scenes in a shared sandbox
171
+ Scene = Roles + Turns (one interaction region)
172
+ Role = agent + model
173
+ Turn = one prompt for one role
174
+
175
+ bf.run(config)
176
+ → Trial.create(config)
177
+ → trial.setup() # resolve config, create env object
178
+ → trial.start() # spin up sandbox, upload task files
179
+ → for scene in config.scenes:
180
+ → trial._run_scene(scene) # connect/execute/disconnect per role
181
+ → trial.verify() # run verifier, score
182
+ → trial.cleanup() # stop sandbox
183
+ ```
184
+
185
+ ## Registered Agents
186
+
187
+ | Agent | Command | Auth |
188
+ |-------|---------|------|
189
+ | `gemini` | `gemini --acp --yolo` | GOOGLE_API_KEY |
190
+ | `claude-agent-acp` | `claude-agent-acp` | ANTHROPIC_API_KEY |
191
+ | `codex-acp` | `codex-acp` | OPENAI_API_KEY |
192
+ | `openclaw` | `openclaw-acp-shim` | inferred from model |
193
+ | `pi-acp` | `pi-acp` | ANTHROPIC_API_KEY |
194
+
195
+ ## Adding a Custom Agent
196
+
197
+ Any ACP-native agent works. Create `agent.toml`:
198
+
199
+ ```toml
200
+ name = "my-agent"
201
+ launch_cmd = "my-agent --acp"
202
+ install_cmd = "npm install -g my-agent"
203
+ requires_env = ["MY_API_KEY"]
204
+ ```
205
+
206
+ ## Development
207
+
208
+ ```bash
209
+ uv venv -p 3.12 .venv && uv pip install -e ".[dev]"
210
+ .venv/bin/python -m pytest tests/ # 580+ unit tests
211
+ .venv/bin/ty check src/ # type check
212
+ ```
@@ -0,0 +1,177 @@
1
+ <div align="center">
2
+ <h1>BenchFlow</h1>
3
+ <p>Multi-turn agent benchmarking — Scene-based lifecycle for any ACP agent</p>
4
+ <a href="https://pypi.org/project/benchflow/" target="_blank">
5
+ <img src="https://img.shields.io/badge/PyPI-0.3.0a3-blue?style=for-the-badge&logo=pypi" alt="PyPI">
6
+ </a>
7
+ <a href="https://discord.gg/mZ9Rc8q8W3" target="_blank">
8
+ <img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
9
+ </a>
10
+ </div>
11
+
12
+ ## What
13
+
14
+ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. It supports single-agent, multi-agent, and multi-turn evaluation patterns through a Scene-based lifecycle.
15
+
16
+ - **Any ACP agent** — Gemini CLI, Claude, Codex, OpenClaw, Pi, or your own
17
+ - **Multi-scene trials** — skill generation → solve, coder → reviewer → revision
18
+ - **Cloud sandboxes** — Daytona backend for parallel execution at scale
19
+ - **YAML-driven** — same task folder, different trial configs for ablation
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ pip install benchflow==0.3.0
25
+ ```
26
+
27
+ Requires Python 3.12+. For cloud sandboxes, set `DAYTONA_API_KEY`.
28
+
29
+ ## Quick Start
30
+
31
+ ### CLI
32
+
33
+ ```bash
34
+ # Run a single task with Gemini
35
+ bench eval create -t tasks/my-task -a gemini -m gemini-3.1-flash-lite-preview -e daytona
36
+
37
+ # Run from YAML config (batch, concurrent)
38
+ bench eval create -f benchmarks/tb2-gemini-baseline.yaml
39
+
40
+ # List agents
41
+ bench agent list
42
+
43
+ # Check task validity
44
+ bench tasks check tasks/my-task
45
+ ```
46
+
47
+ ### Python
48
+
49
+ ```python
50
+ import benchflow as bf
51
+ from benchflow.trial import TrialConfig, Scene, Role, Turn
52
+
53
+ # Simplest: one agent, one task
54
+ result = await bf.run("gemini", task_path="tasks/my-task", model="gemini-3.1-flash-lite-preview")
55
+ print(result.rewards) # {"reward": 1.0}
56
+
57
+ # Scene-based: skill-gen → solve (BYOS pattern)
58
+ config = TrialConfig(
59
+ task_path=Path("tasks/my-task"),
60
+ scenes=[
61
+ Scene(name="skill-gen",
62
+ roles=[Role("gen", "gemini", "gemini-3.1-flash-lite-preview")],
63
+ turns=[Turn("gen", "Analyze the task and write a skill to /app/generated-skill.md")]),
64
+ Scene(name="solve",
65
+ roles=[Role("solver", "gemini", "gemini-3.1-flash-lite-preview")],
66
+ turns=[Turn("solver")]), # None prompt = use instruction.md
67
+ ],
68
+ environment="daytona",
69
+ )
70
+ result = await bf.run(config)
71
+
72
+ # Multi-agent: coder + reviewer
73
+ config = TrialConfig(
74
+ task_path=Path("tasks/my-task"),
75
+ scenes=[
76
+ Scene(name="review-loop",
77
+ roles=[
78
+ Role("coder", "gemini", "gemini-3.1-flash-lite-preview"),
79
+ Role("reviewer", "gemini", "gemini-3.1-flash-lite-preview"),
80
+ ],
81
+ turns=[
82
+ Turn("coder", "Solve the task. Write to /app/.outbox/reviewer.json when done."),
83
+ Turn("reviewer", "Review the coder's work. Write feedback to /app/.outbox/coder.json."),
84
+ Turn("coder", "Read the reviewer's feedback and revise your solution."),
85
+ ]),
86
+ ],
87
+ environment="daytona",
88
+ )
89
+ result = await bf.run(config)
90
+ ```
91
+
92
+ ### YAML Trial Config
93
+
94
+ ```yaml
95
+ # trial-baseline.yaml
96
+ task_dir: .ref/terminal-bench-2
97
+ agent: gemini
98
+ model: gemini-3.1-flash-lite-preview
99
+ environment: daytona
100
+ concurrency: 89
101
+
102
+ # trial-byos.yaml (same tasks, different config)
103
+ task_dir: .ref/terminal-bench-2
104
+ scenes:
105
+ - name: skill-gen
106
+ roles: [{name: gen, agent: gemini, model: gemini-3.1-flash-lite-preview}]
107
+ turns: [{role: gen, prompt: "Generate a skill for this task..."}]
108
+ - name: solve
109
+ roles: [{name: solver, agent: gemini, model: gemini-3.1-flash-lite-preview}]
110
+ ```
111
+
112
+ ## CLI Reference
113
+
114
+ ```
115
+ bench agent list List registered agents
116
+ bench agent show <name> Agent details + conformance status
117
+
118
+ bench eval create Create + run evaluation (returns job-id)
119
+ bench eval list List completed evaluations
120
+
121
+ bench skills eval Evaluate skill via evals.json
122
+
123
+ bench tasks init <name> Scaffold new task
124
+ bench tasks check <dir> Validate task (--rubric for custom)
125
+
126
+ bench train create Reward-based training sweep
127
+
128
+ bench environment create Spin up sandbox from task dir
129
+ bench environment list List active sandboxes
130
+ ```
131
+
132
+ ## Architecture
133
+
134
+ ```
135
+ Trial = sequence of Scenes in a shared sandbox
136
+ Scene = Roles + Turns (one interaction region)
137
+ Role = agent + model
138
+ Turn = one prompt for one role
139
+
140
+ bf.run(config)
141
+ → Trial.create(config)
142
+ → trial.setup() # resolve config, create env object
143
+ → trial.start() # spin up sandbox, upload task files
144
+ → for scene in config.scenes:
145
+ → trial._run_scene(scene) # connect/execute/disconnect per role
146
+ → trial.verify() # run verifier, score
147
+ → trial.cleanup() # stop sandbox
148
+ ```
149
+
150
+ ## Registered Agents
151
+
152
+ | Agent | Command | Auth |
153
+ |-------|---------|------|
154
+ | `gemini` | `gemini --acp --yolo` | GOOGLE_API_KEY |
155
+ | `claude-agent-acp` | `claude-agent-acp` | ANTHROPIC_API_KEY |
156
+ | `codex-acp` | `codex-acp` | OPENAI_API_KEY |
157
+ | `openclaw` | `openclaw-acp-shim` | inferred from model |
158
+ | `pi-acp` | `pi-acp` | ANTHROPIC_API_KEY |
159
+
160
+ ## Adding a Custom Agent
161
+
162
+ Any ACP-native agent works. Create `agent.toml`:
163
+
164
+ ```toml
165
+ name = "my-agent"
166
+ launch_cmd = "my-agent --acp"
167
+ install_cmd = "npm install -g my-agent"
168
+ requires_env = ["MY_API_KEY"]
169
+ ```
170
+
171
+ ## Development
172
+
173
+ ```bash
174
+ uv venv -p 3.12 .venv && uv pip install -e ".[dev]"
175
+ .venv/bin/python -m pytest tests/ # 580+ unit tests
176
+ .venv/bin/ty check src/ # type check
177
+ ```
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchflow"
3
- version = "0.2.2"
3
+ version = "0.3.0"
4
4
  description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -37,7 +37,7 @@ classifiers = [
37
37
  [project.optional-dependencies]
38
38
  dev = [
39
39
  "pre-commit>=3.7",
40
- "pytest>=8.0",
40
+ "pytest>=9.0.3",
41
41
  "pytest-asyncio>=0.24.0",
42
42
  "ruff>=0.7.0",
43
43
  "ty>=0.0.1a1",
@@ -45,6 +45,7 @@ dev = [
45
45
 
46
46
  [project.scripts]
47
47
  benchflow = "benchflow.cli.main:app"
48
+ bench = "benchflow.cli.main:app"
48
49
 
49
50
  [project.urls]
50
51
  Homepage = "https://github.com/benchflow-ai/benchflow"
@@ -58,20 +59,20 @@ requires = ["hatchling"]
58
59
  build-backend = "hatchling.build"
59
60
 
60
61
  [tool.hatch.build.targets.sdist]
61
- exclude = [
62
- ".venv*",
63
- ".ref",
64
- "jobs",
65
- "dist",
66
- ".claude",
67
- ".dev-docs",
68
- ".pytest_cache",
69
- "__pycache__",
62
+ # Allowlist: restrict the sdist to package sources, tests, and project metadata.
63
+ only-include = [
64
+ "src",
65
+ "tests",
66
+ "README.md",
67
+ "CHANGELOG.md",
68
+ "LICENSE",
69
+ "pyproject.toml",
70
70
  ]
71
71
 
72
72
  [tool.pytest.ini_options]
73
73
  asyncio_mode = "auto"
74
74
  addopts = "-m 'not live'"
75
+ testpaths = ["tests"]
75
76
  markers = [
76
77
  "live: requires real Anthropic API and Docker daemon (run with -m live)",
77
78
  ]
@@ -45,7 +45,20 @@ from benchflow.environments import (
45
45
  from benchflow.job import Job, JobConfig, JobResult, RetryConfig
46
46
  from benchflow.metrics import BenchmarkMetrics, collect_metrics
47
47
  from benchflow.models import AgentInstallError, AgentTimeoutError, RunResult
48
+ from benchflow.runtime import (
49
+ Agent,
50
+ Environment,
51
+ Runtime,
52
+ RuntimeConfig,
53
+ RuntimeResult,
54
+ run, # bf.run(agent, env) — the primary 0.3 API
55
+ )
56
+ from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene
57
+ from benchflow._snapshot import list_snapshots, restore, snapshot
48
58
  from benchflow.sdk import SDK
59
+ from benchflow.trial import Trial, TrialConfig
60
+ from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn
61
+ from benchflow.trial_yaml import trial_config_from_yaml
49
62
  from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
50
63
  from benchflow.trajectories.otel import OTelCollector
51
64
  from benchflow.trajectories.proxy import TrajectoryProxy
@@ -63,7 +76,6 @@ __all__ = [
63
76
  "ExecResult",
64
77
  "Task",
65
78
  "TaskConfig",
66
- "Trial",
67
79
  "Verifier",
68
80
  "VerifierResult",
69
81
  # ACP
@@ -88,7 +100,30 @@ __all__ = [
88
100
  "AgentInstallError",
89
101
  "AgentTimeoutError",
90
102
  "RunResult",
91
- # SDK
103
+ # Runtime (0.3 primary API)
104
+ "Agent",
105
+ "Environment",
106
+ "Runtime",
107
+ "RuntimeConfig",
108
+ "RuntimeResult",
109
+ "run",
110
+ # Multi-agent scene
111
+ "Scene",
112
+ "Role",
113
+ "Message",
114
+ "MessageTransport",
115
+ "MailboxTransport",
116
+ # Env snapshots
117
+ "snapshot",
118
+ "restore",
119
+ "list_snapshots",
120
+ # Trial (decomposed lifecycle)
121
+ "Trial",
122
+ "TrialConfig",
123
+ "TrialRole",
124
+ "TrialScene",
125
+ "Turn",
126
+ # SDK (backwards compat)
92
127
  "SDK",
93
128
  # Environments / dep staging
94
129
  "SERVICES",
@@ -25,11 +25,16 @@ from benchflow._sandbox import build_priv_drop_cmd
25
25
  from benchflow._trajectory import _capture_session_trajectory
26
26
  from benchflow.acp.client import ACPClient
27
27
  from benchflow.acp.container_transport import ContainerTransport
28
+ from benchflow.agents.providers import strip_provider_prefix
28
29
  from benchflow.process import DaytonaProcess, DockerProcess
29
30
 
30
31
  logger = logging.getLogger(__name__)
31
32
 
32
33
 
34
+ _ACP_CONNECT_MAX_RETRIES = 3
35
+ _ACP_CONNECT_BASE_DELAY = 2.0
36
+
37
+
33
38
  async def connect_acp(
34
39
  env,
35
40
  agent: str,
@@ -41,7 +46,10 @@ async def connect_acp(
41
46
  environment: str,
42
47
  agent_cwd: str,
43
48
  ) -> tuple[ACPClient, object, str]:
44
- """Create ACP transport, connect, init session, set model. Return (client, session, agent_name)."""
49
+ """Create ACP transport, connect, init session, set model. Return (client, session, agent_name).
50
+
51
+ Retries with exponential backoff on ConnectionError (Daytona SSH storms).
52
+ """
45
53
  # Resolve agent binary path for non-docker environments
46
54
  if environment != "docker":
47
55
  which_result = await env.exec(
@@ -58,32 +66,61 @@ async def connect_acp(
58
66
  agent_launch = build_priv_drop_cmd(agent_launch, sandbox_user)
59
67
  logger.info(f"Agent sandboxed as: {sandbox_user}")
60
68
 
61
- if environment == "docker":
62
- live_proc = DockerProcess.from_harbor_env(env)
63
- else:
64
- live_proc = await DaytonaProcess.from_harbor_env(env)
65
-
66
- agent_log = trial_dir / "agent" / f"{agent.replace('-', '_')}.txt"
67
- transport = ContainerTransport(
68
- container_process=live_proc,
69
- command=agent_launch,
70
- env=agent_env,
71
- cwd=agent_cwd,
72
- agent_log_path=agent_log,
73
- )
74
- acp_client = ACPClient(transport)
75
- await acp_client.connect()
76
-
77
- init_result = await asyncio.wait_for(acp_client.initialize(), timeout=60)
78
- agent_name = init_result.agent_info.name if init_result.agent_info else agent
79
- logger.info(f"ACP agent: {agent_name}")
80
-
81
- session = await asyncio.wait_for(acp_client.session_new(cwd=agent_cwd), timeout=60)
82
- logger.info(f"Session: {session.session_id}")
69
+ last_err: Exception | None = None
70
+ acp_client: ACPClient | None = None
71
+ for attempt in range(_ACP_CONNECT_MAX_RETRIES + 1):
72
+ if attempt > 0:
73
+ delay = _ACP_CONNECT_BASE_DELAY * (2 ** (attempt - 1))
74
+ logger.info(f"ACP connect retry {attempt}/{_ACP_CONNECT_MAX_RETRIES} after {delay:.0f}s")
75
+ await asyncio.sleep(delay)
83
76
 
84
- if model:
85
- from benchflow.agents.providers import strip_provider_prefix
77
+ try:
78
+ if environment == "docker":
79
+ live_proc = DockerProcess.from_harbor_env(env)
80
+ else:
81
+ live_proc = await DaytonaProcess.from_harbor_env(env)
82
+
83
+ agent_log = trial_dir / "agent" / f"{agent.replace('-', '_')}.txt"
84
+ transport = ContainerTransport(
85
+ container_process=live_proc,
86
+ command=agent_launch,
87
+ env=agent_env,
88
+ cwd=agent_cwd,
89
+ agent_log_path=agent_log,
90
+ )
91
+ acp_client = ACPClient(transport)
92
+ await acp_client.connect()
93
+
94
+ init_result = await asyncio.wait_for(acp_client.initialize(), timeout=60)
95
+ agent_name = init_result.agent_info.name if init_result.agent_info else agent
96
+ logger.info(f"ACP agent: {agent_name}")
97
+
98
+ session = await asyncio.wait_for(acp_client.session_new(cwd=agent_cwd), timeout=60)
99
+ logger.info(f"Session: {session.session_id}")
100
+ break
101
+ except ConnectionError as e:
102
+ # Close the failed client before retrying
103
+ if acp_client:
104
+ try:
105
+ await acp_client.close()
106
+ except Exception:
107
+ pass
108
+ acp_client = None
109
+ last_err = e
110
+ if attempt == _ACP_CONNECT_MAX_RETRIES:
111
+ raise
112
+ logger.warning(f"ACP connect failed (attempt {attempt + 1}): {e}")
113
+ continue
114
+ except Exception:
115
+ # Non-retryable error — close client to prevent leak
116
+ if acp_client:
117
+ try:
118
+ await acp_client.close()
119
+ except Exception:
120
+ pass
121
+ raise
86
122
 
123
+ if model:
87
124
  acp_model_id = strip_provider_prefix(model)
88
125
  try:
89
126
  await asyncio.wait_for(acp_client.set_model(acp_model_id), timeout=60)
@@ -102,7 +139,7 @@ async def execute_prompts(
102
139
  ) -> tuple[list[dict], int]:
103
140
  """Send prompts via ACP and capture trajectory. Return (trajectory, n_tool_calls)."""
104
141
  for i, prompt in enumerate(prompts):
105
- logger.info(f"Prompt {i + 1}/{len(prompts)}: {prompt[:80]}...")
142
+ logger.info(f"Prompt {i + 1}/{len(prompts)}: {(prompt or '<instruction.md>')[:80]}...")
106
143
  prompt_result = await asyncio.wait_for(
107
144
  acp_client.prompt(prompt),
108
145
  timeout=timeout,