benchflow 0.3.2__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199) hide show
  1. {benchflow-0.3.2 → benchflow-0.3.4}/.gitignore +2 -1
  2. {benchflow-0.3.2 → benchflow-0.3.4}/CHANGELOG.md +40 -1
  3. benchflow-0.3.4/PKG-INFO +143 -0
  4. benchflow-0.3.4/README.md +106 -0
  5. {benchflow-0.3.2 → benchflow-0.3.4}/pyproject.toml +17 -2
  6. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/__init__.py +115 -19
  7. benchflow-0.3.4/src/benchflow/_acp_run.py +358 -0
  8. benchflow-0.3.4/src/benchflow/_agent_env.py +368 -0
  9. benchflow-0.3.4/src/benchflow/_agent_setup.py +268 -0
  10. benchflow-0.3.4/src/benchflow/_daytona_patches.py +103 -0
  11. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_env_setup.py +241 -7
  12. benchflow-0.3.4/src/benchflow/_provider_runtime.py +172 -0
  13. benchflow-0.3.4/src/benchflow/_run.py +34 -0
  14. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_sandbox.py +247 -131
  15. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_scene.py +31 -5
  16. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_snapshot.py +19 -9
  17. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_trajectory.py +32 -13
  18. benchflow-0.3.4/src/benchflow/_types.py +94 -0
  19. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/client.py +7 -2
  20. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/container_transport.py +9 -10
  21. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/session.py +45 -4
  22. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/transport.py +30 -5
  23. benchflow-0.3.4/src/benchflow/adapters/__init__.py +25 -0
  24. benchflow-0.3.4/src/benchflow/adapters/inspect_ai.py +63 -0
  25. benchflow-0.3.4/src/benchflow/adapters/ors.py +68 -0
  26. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/__init__.py +0 -4
  27. benchflow-0.3.4/src/benchflow/agents/harvey_lab_acp_shim.py +606 -0
  28. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/openclaw_acp_shim.py +5 -4
  29. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/pi_acp_launcher.py +17 -4
  30. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/providers.py +27 -14
  31. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/registry.py +270 -86
  32. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/cli/main.py +305 -219
  33. benchflow-0.3.2/src/benchflow/job.py → benchflow-0.3.4/src/benchflow/evaluation.py +150 -108
  34. benchflow-0.3.4/src/benchflow/job.py +29 -0
  35. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/mcp/reviewer_server.py +4 -6
  36. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/metrics.py +12 -3
  37. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/models.py +20 -7
  38. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/process.py +168 -4
  39. benchflow-0.3.4/src/benchflow/providers/__init__.py +25 -0
  40. benchflow-0.3.4/src/benchflow/providers/bedrock_proxy.py +534 -0
  41. benchflow-0.3.4/src/benchflow/providers/bedrock_runtime.py +665 -0
  42. benchflow-0.3.4/src/benchflow/rewards/README.md +125 -0
  43. benchflow-0.3.4/src/benchflow/rewards/__init__.py +34 -0
  44. benchflow-0.3.4/src/benchflow/rewards/builtins.py +471 -0
  45. benchflow-0.3.4/src/benchflow/rewards/events.py +26 -0
  46. benchflow-0.3.4/src/benchflow/rewards/file_readers.py +142 -0
  47. benchflow-0.3.4/src/benchflow/rewards/llm.py +186 -0
  48. benchflow-0.3.4/src/benchflow/rewards/protocol.py +33 -0
  49. benchflow-0.3.4/src/benchflow/rewards/rubric.py +76 -0
  50. benchflow-0.3.4/src/benchflow/rewards/rubric_config.py +127 -0
  51. benchflow-0.3.4/src/benchflow/rollout.py +1766 -0
  52. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/runtime.py +54 -33
  53. benchflow-0.3.4/src/benchflow/sandbox/__init__.py +9 -0
  54. benchflow-0.3.4/src/benchflow/sandbox/daytona.py +74 -0
  55. benchflow-0.3.4/src/benchflow/sandbox/docker.py +74 -0
  56. benchflow-0.3.4/src/benchflow/sandbox/protocol.py +74 -0
  57. benchflow-0.3.4/src/benchflow/sdk.py +193 -0
  58. benchflow-0.3.4/src/benchflow/self_gen.py +151 -0
  59. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/skill_eval.py +35 -15
  60. benchflow-0.3.4/src/benchflow/task_download.py +161 -0
  61. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/tasks.py +1 -1
  62. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trajectories/__init__.py +0 -5
  63. benchflow-0.3.4/src/benchflow/trial.py +39 -0
  64. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trial_yaml.py +20 -8
  65. benchflow-0.3.4/src/benchflow/user.py +101 -0
  66. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/viewer.py +21 -9
  67. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/README.md +3 -3
  68. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/proof_multi_agent.py +5 -5
  69. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/run_conformance.py +36 -5
  70. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conftest.py +6 -1
  71. {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/test_claude.sh +8 -8
  72. {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/test_codex.sh +68 -13
  73. benchflow-0.3.4/tests/examples/test_codex_custom_provider.sh +99 -0
  74. {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/test_gemini.sh +7 -7
  75. {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/test_openclaw.sh +9 -9
  76. benchflow-0.3.4/tests/fixtures/mock_acp_agent_multi_turn.py +162 -0
  77. benchflow-0.3.4/tests/fixtures/mock_openai_responses_server.py +98 -0
  78. benchflow-0.3.4/tests/integration/check_results.py +179 -0
  79. benchflow-0.3.4/tests/integration/configs/claude-agent-acp.yaml +23 -0
  80. benchflow-0.3.4/tests/integration/configs/codex-acp.yaml +23 -0
  81. benchflow-0.3.4/tests/integration/configs/gemini.yaml +23 -0
  82. benchflow-0.3.4/tests/integration/configs/harvey-lab-harness.yaml +23 -0
  83. benchflow-0.3.4/tests/integration/configs/openclaw.yaml +23 -0
  84. benchflow-0.3.4/tests/integration/configs/opencode.yaml +23 -0
  85. benchflow-0.3.4/tests/integration/configs/openhands.yaml +23 -0
  86. benchflow-0.3.4/tests/integration/configs/pi-acp.yaml +23 -0
  87. benchflow-0.3.4/tests/integration/run.sh +170 -0
  88. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_acp.py +208 -6
  89. benchflow-0.3.4/tests/test_adapters.py +218 -0
  90. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_agent_registry.py +42 -0
  91. benchflow-0.3.4/tests/test_agent_setup.py +461 -0
  92. benchflow-0.3.4/tests/test_bedrock_proxy.py +375 -0
  93. benchflow-0.3.4/tests/test_bedrock_runtime.py +405 -0
  94. benchflow-0.3.4/tests/test_capture_trajectory.py +735 -0
  95. benchflow-0.3.4/tests/test_connect_as_env.py +142 -0
  96. benchflow-0.3.4/tests/test_eng50_capabilities.py +248 -0
  97. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_env_setup.py +142 -1
  98. benchflow-0.3.4/tests/test_internet_policy.py +417 -0
  99. benchflow-0.3.4/tests/test_llm_judge.py +502 -0
  100. benchflow-0.3.4/tests/test_mock_openai_responses_server.py +73 -0
  101. benchflow-0.3.4/tests/test_notification_order_real.py +126 -0
  102. benchflow-0.3.4/tests/test_oracle.py +126 -0
  103. benchflow-0.3.4/tests/test_oracle_chokepoint.py +224 -0
  104. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_pi_acp_launcher.py +30 -3
  105. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_process.py +92 -0
  106. benchflow-0.3.4/tests/test_provider_runtime.py +224 -0
  107. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_providers.py +28 -1
  108. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_reexport.py +1 -1
  109. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_registry_invariants.py +85 -2
  110. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_resolve_env_helpers.py +121 -2
  111. benchflow-0.3.4/tests/test_rewards.py +338 -0
  112. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_rewards_jsonl.py +1 -1
  113. benchflow-0.3.4/tests/test_rubric_config.py +175 -0
  114. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_runtime.py +8 -6
  115. benchflow-0.3.4/tests/test_sandbox.py +97 -0
  116. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_sandbox_hardening.py +285 -113
  117. benchflow-0.3.4/tests/test_sandbox_protocol.py +250 -0
  118. benchflow-0.3.4/tests/test_sandbox_setup.py +110 -0
  119. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_scene.py +4 -8
  120. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_scene_outbox_trial.py +135 -12
  121. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_sdk_internals.py +208 -11
  122. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_sdk_lockdown.py +1 -1
  123. benchflow-0.3.4/tests/test_self_gen_cli.py +66 -0
  124. benchflow-0.3.4/tests/test_self_gen_orchestration.py +259 -0
  125. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_skill_eval.py +2 -2
  126. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_skill_eval_dryrun.py +133 -41
  127. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_skill_eval_integration.py +17 -7
  128. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_smoke.py +2 -1
  129. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_subscription_auth.py +5 -18
  130. benchflow-0.3.4/tests/test_task_download.py +185 -0
  131. benchflow-0.3.4/tests/test_trajectory_integration.py +261 -0
  132. benchflow-0.3.4/tests/test_trial_agent_timeout_verify.py +77 -0
  133. benchflow-0.3.4/tests/test_trial_bedrock_proxy.py +129 -0
  134. benchflow-0.3.4/tests/test_trial_install_agent_timeout.py +124 -0
  135. benchflow-0.3.4/tests/test_user.py +409 -0
  136. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_verify.py +16 -17
  137. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_yaml_config.py +29 -0
  138. benchflow-0.3.2/PKG-INFO +0 -231
  139. benchflow-0.3.2/README.md +0 -196
  140. benchflow-0.3.2/src/benchflow/_acp_run.py +0 -152
  141. benchflow-0.3.2/src/benchflow/_agent_env.py +0 -205
  142. benchflow-0.3.2/src/benchflow/_agent_setup.py +0 -121
  143. benchflow-0.3.2/src/benchflow/agents/user_agent.py +0 -62
  144. benchflow-0.3.2/src/benchflow/cli/eval.py +0 -373
  145. benchflow-0.3.2/src/benchflow/sdk.py +0 -518
  146. benchflow-0.3.2/src/benchflow/task_download.py +0 -72
  147. benchflow-0.3.2/src/benchflow/trajectories/atif.py +0 -112
  148. benchflow-0.3.2/src/benchflow/trajectories/claude_code.py +0 -249
  149. benchflow-0.3.2/src/benchflow/trial.py +0 -788
  150. benchflow-0.3.2/tests/test_capture_trajectory.py +0 -135
  151. benchflow-0.3.2/tests/test_eval_cli.py +0 -118
  152. benchflow-0.3.2/tests/test_oracle.py +0 -63
  153. benchflow-0.3.2/tests/test_sandbox.py +0 -64
  154. {benchflow-0.3.2 → benchflow-0.3.4}/LICENSE +0 -0
  155. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_credentials.py +0 -0
  156. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_scoring.py +0 -0
  157. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/__init__.py +0 -0
  158. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/types.py +0 -0
  159. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/cli/__init__.py +0 -0
  160. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/demo_task/environment/Dockerfile +0 -0
  161. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/demo_task/instruction.md +0 -0
  162. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/demo_task/task.toml +0 -0
  163. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/demo_task/tests/test.sh +0 -0
  164. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/environments.py +0 -0
  165. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/mcp/__init__.py +0 -0
  166. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/mcp/hooks.py +0 -0
  167. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/py.typed +0 -0
  168. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/skills.py +0 -0
  169. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/templates/__init__.py +0 -0
  170. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/templates/judge.py.tmpl +0 -0
  171. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/templates/test.sh.tmpl +0 -0
  172. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trajectories/otel.py +0 -0
  173. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trajectories/proxy.py +0 -0
  174. {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trajectories/types.py +0 -0
  175. {benchflow-0.3.2 → benchflow-0.3.4}/tests/__init__.py +0 -0
  176. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
  177. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/instruction.md +0 -0
  178. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
  179. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/task.toml +0 -0
  180. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/tests/test.sh +0 -0
  181. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/conformance-results.json +0 -0
  182. {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/proof_snapshot.py +0 -0
  183. {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
  184. {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/instruction.md +0 -0
  185. {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/solution/solve.sh +0 -0
  186. {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/task.toml +0 -0
  187. {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/tests/test.sh +0 -0
  188. {benchflow-0.3.2 → benchflow-0.3.4}/tests/fixtures/mock_acp_agent.py +0 -0
  189. {benchflow-0.3.2 → benchflow-0.3.4}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
  190. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_agent_model_decouple.py +0 -0
  191. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_agent_spec.py +0 -0
  192. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_atif_trajectory.py +0 -0
  193. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_exclude_tasks.py +0 -0
  194. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_job.py +0 -0
  195. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_metrics.py +0 -0
  196. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_sandbox_verifier_workspace.py +0 -0
  197. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_scoring.py +0 -0
  198. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_skills.py +0 -0
  199. {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_tasks.py +0 -0
@@ -173,7 +173,8 @@ cython_debug/
173
173
 
174
174
  .DS_Store
175
175
  # benchflow
176
- .ref/
176
+ .cache/
177
+
177
178
  trials/
178
179
  jobs/
179
180
  .jobs/
@@ -2,6 +2,45 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## 0.3.3 — 2026-05-15
6
+
7
+ ### Added
8
+
9
+ - **Harvey LAB benchmark** — converter, agent shim, and parity validation for 1,251 legal AI tasks (#239).
10
+ - **Harvey LAB Claude Sonnet judge** — switched verifier from Gemini to `claude-sonnet-4-6`, matching the original benchmark default (#264).
11
+ - **ProgramBench integration** — new benchmark adapter; TB2 removed; `.ref/` migrated to `benchmarks/` (#237).
12
+ - **CLI progress output** — `bench eval create` / `bench run` now show progress messages by default (#264).
13
+ - **Skill nudge** — optional prompt injection for skill-enhanced agent runs (#207).
14
+ - **Self-generated skill mode** for Codex agent (#233).
15
+ - **Integration test suite** for ENG-6 + `OPENAI_BASE_URL` inheritance fix (#255).
16
+ - **Modal backend support** — Dockerfile compatibility for Modal environments.
17
+ - **CITATION.cff** (#246).
18
+ - **`AGENTS.md`** — canonical contributor guide; `CLAUDE.md` deprecated (#258).
19
+
20
+ ### Changed
21
+
22
+ - **Two-field source pattern** for dataset sourcing (#252).
23
+ - **Docs overhaul** — synced from www.benchflow.ai; Mintlify config added, then the orphaned config removed (#259, #257, #226).
24
+ - **`uv sync`** for package management (#232).
25
+
26
+ ### Fixed
27
+
28
+ - Prevent `TypeError` in `metrics.collect_metrics` when reward is `None` (#243).
29
+ - Copy eval `requirements.txt` into Docker build context (#245).
30
+ - Resolve agent aliases in `bench agent show` and display aliases in `bench agent list` (#251).
31
+ - Guard ACP transports against JSON scalar logs (#236).
32
+ - Agent timeout reward fallback for Codex (#234).
33
+ - Isolate JS agent runtime installs (#231).
34
+ - Route Codex ACP through responses API (#224).
35
+ - Deploy skills and forward `solution.env` for oracle runs (#223).
36
+ - Honor no-internet tasks for agent runs; disable web tools without prompt mutation (#215).
37
+ - Propagate `OPENAI_API_KEY` for vllm provider (#3).
38
+ - Preserve arrival order of thought/message within flush windows (#214).
39
+ - Record user messages and per-turn agent text in ACP trajectory (#745).
40
+ - Chown skill-link parent dirs so the sandbox user can write into them.
41
+ - Dynamic `--rootdir` in `PYTEST_ADDOPTS` based on task workspace.
42
+ - Unique env-file path in `DaytonaPtyProcess` to avoid race conditions (#200).
43
+
5
44
  ## 0.2.3 — 2026-04-15
6
45
 
7
46
  ### Added
@@ -66,7 +105,7 @@
66
105
  - **Vertex AI support** — ADC auth for `google-vertex/`, `anthropic-vertex/`, `vertex-zai/` prefixed models.
67
106
  - **Provider registry** — add a new LLM endpoint via a dict entry in `providers.py`, no code changes.
68
107
  - **`benchmarks/` directory** with reusable YAML configs and runner scripts for TB2 and SkillsBench.
69
- - **Auto task download** via `ensure_tasks()` `terminal-bench-2` and `skillsbench` clone into `.ref/` on first run.
108
+ - **Auto task download** — YAML configs reference datasets as `org/repo/path` (e.g. `harbor-framework/terminal-bench-2`). Repos are cloned on first use and cached under `.cache/datasets/`.
70
109
  - **`benchflow tasks init`** — scaffold new tasks.
71
110
  - **`benchflow tasks check`** — validate task structure.
72
111
  - **`benchflow cleanup`** — delete old sandboxes with `--max-age` filtering (default 24h).
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: benchflow
3
+ Version: 0.3.4
4
+ Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
5
+ Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
6
+ Project-URL: Repository, https://github.com/benchflow-ai/benchflow
7
+ Project-URL: Issues, https://github.com/benchflow-ai/benchflow/issues
8
+ Project-URL: Discord, https://discord.gg/mZ9Rc8q8W3
9
+ Project-URL: Changelog, https://github.com/benchflow-ai/benchflow/blob/main/CHANGELOG.md
10
+ Author-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
11
+ Maintainer-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
12
+ License: Apache-2.0
13
+ License-File: LICENSE
14
+ Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench,terminal-bench
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Python: >=3.12
21
+ Requires-Dist: anyio>=4.0
22
+ Requires-Dist: harbor==0.3.0
23
+ Requires-Dist: httpx>=0.27.0
24
+ Requires-Dist: pydantic>=2.0
25
+ Requires-Dist: pyyaml>=6.0
26
+ Requires-Dist: rich>=13.0
27
+ Requires-Dist: typer>=0.9
28
+ Provides-Extra: bedrock
29
+ Requires-Dist: boto3>=1.40; extra == 'bedrock'
30
+ Provides-Extra: dev
31
+ Requires-Dist: pre-commit>=3.7; extra == 'dev'
32
+ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
33
+ Requires-Dist: pytest>=9.0.3; extra == 'dev'
34
+ Requires-Dist: ruff>=0.7.0; extra == 'dev'
35
+ Requires-Dist: ty>=0.0.1a1; extra == 'dev'
36
+ Description-Content-Type: text/markdown
37
+
38
+ <div align="center">
39
+ <h1>BenchFlow</h1>
40
+ <p>Multi-turn agent benchmarking — Scene-based lifecycle for any ACP agent</p>
41
+ <a href="https://pypi.org/project/benchflow/" target="_blank">
42
+ <img src="https://img.shields.io/pypi/v/benchflow?style=for-the-badge&logo=pypi" alt="PyPI">
43
+ </a>
44
+ <a href="https://discord.gg/mZ9Rc8q8W3" target="_blank">
45
+ <img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
46
+ </a>
47
+ </div>
48
+
49
+ ## What
50
+
51
+ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. Single-agent, multi-agent, and multi-round patterns share one Scene-based lifecycle.
52
+
53
+ - **Any ACP agent** — Gemini CLI, Claude Code, Codex, OpenCode, OpenHands, OpenClaw, Pi, or your own
54
+ - **Single + multi + progressive** — single-agent / multi-agent (coder + reviewer, simulated user) / multi-round with a Python `BaseUser` callback
55
+ - **Sandboxes** — Docker locally, Daytona for parallel cloud runs, Modal for serverless/GPU-backed task environments
56
+ - **Hardened verifier** — defaults block BenchJack/Meerkat-style reward-hacking; tasks opt out per-feature
57
+
58
+ ## Install
59
+
60
+ ```bash
61
+ uv tool install benchflow
62
+ ```
63
+
64
+ Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth.
65
+
66
+ ## Documentation
67
+
68
+ Start with [Getting started](./docs/getting-started.md), then [Concepts](./docs/concepts.md) for the mental model. Then by goal:
69
+
70
+ | If you want to… | Read |
71
+ |------------------|------|
72
+ | Run an eval on an existing task | [Getting started](./docs/getting-started.md) |
73
+ | Understand Rollout / Scene / Role / Verifier | [Concepts](./docs/concepts.md) |
74
+ | Author a new task | [Task authoring](./docs/task-authoring.md) |
75
+ | Multi-agent: coder + reviewer, simulated user, BYOS, stateful envs | [Use cases](./docs/use-cases.md) |
76
+ | Multi-round single-agent (progressive disclosure, oracle access) | [Progressive disclosure](./docs/progressive-disclosure.md) |
77
+ | Skill evaluation (when the artifact is a skill, not a workspace) | [Skill eval](./docs/skill-eval.md) |
78
+ | Understand the security model | [Sandbox hardening](./docs/sandbox-hardening.md) |
79
+ | CLI flags + commands | [CLI reference](./docs/reference/cli.md) |
80
+ | Python API surface | [Python API reference](./docs/reference/python-api.md) |
81
+
82
+ Notebooks and runnable example scripts live under [`docs/examples/`](./docs/examples/) so examples stay versioned with the docs that explain them.
83
+
84
+ ## Benchmark task sources
85
+
86
+ Benchmark datasets live in external Git repos and are referenced with two fields:
87
+
88
+ ```yaml
89
+ # benchmarks/skillsbench-claude-glm51.yaml
90
+ source:
91
+ repo: benchflow-ai/skillsbench # GitHub org/repo
92
+ path: tasks # optional subpath within repo
93
+ ref: main # optional branch/tag
94
+ agent: claude-agent-acp
95
+ model: claude-sonnet-4-6
96
+ ```
97
+
98
+ Run any benchmark via the CLI:
99
+
100
+ ```bash
101
+ # From a YAML config
102
+ bench eval create --config benchmarks/skillsbench-claude-glm51.yaml
103
+
104
+ # Inline — mirrors the YAML source fields
105
+ bench eval create \
106
+ --source-repo benchflow-ai/skillsbench --source-path tasks \
107
+ --agent gemini --model gemini-3.1-flash-lite-preview --sandbox daytona --concurrency 64
108
+ ```
109
+
110
+ Repos are cloned and cached locally under `.cache/datasets/` on first use.
111
+
112
+ SkillsBench itself sources BenchFlow from GitHub `main` in its
113
+ [`pyproject.toml`](https://github.com/benchflow-ai/skillsbench/blob/main/pyproject.toml).
114
+ After a BenchFlow change lands, run `uv lock --upgrade-package benchflow` in
115
+ SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
116
+
117
+ ## Featured
118
+
119
+ - **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). It is also BenchFlow's parity answer to [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) for the no-second-LLM case. See [Progressive disclosure](./docs/progressive-disclosure.md).
120
+
121
+ ## Research artifacts
122
+
123
+ Two runnable labs validate the security story:
124
+
125
+ - [`labs/benchjack-sandbox-hardening/`](./labs/benchjack-sandbox-hardening/) — end-to-end demo that 0.2.1+ blocks three [BenchJack](https://rdi.berkeley.edu/blog/trustworthy-benchmarks-cont/) exploits that flip 0.2.0's reward from 0.0 to 1.0.
126
+ - [`labs/reward-hack-matrix/`](./labs/reward-hack-matrix/) — full reward-hack sweep across real benchmarks comparing 0.2.0 vs 0.2.2.
127
+
128
+ ## Audience
129
+
130
+ - **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
131
+ - **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
132
+ - **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
133
+ - **Existing Harbor users migrating** → [Use cases — migration section](./docs/use-cases.md#migration-from-harbor) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
134
+
135
+ ## Contributing
136
+
137
+ PRs welcome. Open against `main`. CI runs ruff + tests on every PR; please run `ruff check .` and `pytest tests/` locally first.
138
+
139
+ For a release: bump `pyproject.toml` to the next stable version, tag `v<version>` on main, push the tag — CI publishes to PyPI. Then bump main to the next `.dev0`.
140
+
141
+ ## License
142
+
143
+ Apache-2.0.
@@ -0,0 +1,106 @@
1
+ <div align="center">
2
+ <h1>BenchFlow</h1>
3
+ <p>Multi-turn agent benchmarking — Scene-based lifecycle for any ACP agent</p>
4
+ <a href="https://pypi.org/project/benchflow/" target="_blank">
5
+ <img src="https://img.shields.io/pypi/v/benchflow?style=for-the-badge&logo=pypi" alt="PyPI">
6
+ </a>
7
+ <a href="https://discord.gg/mZ9Rc8q8W3" target="_blank">
8
+ <img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
9
+ </a>
10
+ </div>
11
+
12
+ ## What
13
+
14
+ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. Single-agent, multi-agent, and multi-round patterns share one Scene-based lifecycle.
15
+
16
+ - **Any ACP agent** — Gemini CLI, Claude Code, Codex, OpenCode, OpenHands, OpenClaw, Pi, or your own
17
+ - **Single + multi + progressive** — single-agent / multi-agent (coder + reviewer, simulated user) / multi-round with a Python `BaseUser` callback
18
+ - **Sandboxes** — Docker locally, Daytona for parallel cloud runs, Modal for serverless/GPU-backed task environments
19
+ - **Hardened verifier** — defaults block BenchJack/Meerkat-style reward-hacking; tasks opt out per-feature
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ uv tool install benchflow
25
+ ```
26
+
27
+ Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth.
28
+
29
+ ## Documentation
30
+
31
+ Start with [Getting started](./docs/getting-started.md), then [Concepts](./docs/concepts.md) for the mental model. Then by goal:
32
+
33
+ | If you want to… | Read |
34
+ |------------------|------|
35
+ | Run an eval on an existing task | [Getting started](./docs/getting-started.md) |
36
+ | Understand Rollout / Scene / Role / Verifier | [Concepts](./docs/concepts.md) |
37
+ | Author a new task | [Task authoring](./docs/task-authoring.md) |
38
+ | Multi-agent: coder + reviewer, simulated user, BYOS, stateful envs | [Use cases](./docs/use-cases.md) |
39
+ | Multi-round single-agent (progressive disclosure, oracle access) | [Progressive disclosure](./docs/progressive-disclosure.md) |
40
+ | Skill evaluation (when the artifact is a skill, not a workspace) | [Skill eval](./docs/skill-eval.md) |
41
+ | Understand the security model | [Sandbox hardening](./docs/sandbox-hardening.md) |
42
+ | CLI flags + commands | [CLI reference](./docs/reference/cli.md) |
43
+ | Python API surface | [Python API reference](./docs/reference/python-api.md) |
44
+
45
+ Notebooks and runnable example scripts live under [`docs/examples/`](./docs/examples/) so examples stay versioned with the docs that explain them.
46
+
47
+ ## Benchmark task sources
48
+
49
+ Benchmark datasets live in external Git repos and are referenced with two fields:
50
+
51
+ ```yaml
52
+ # benchmarks/skillsbench-claude-glm51.yaml
53
+ source:
54
+ repo: benchflow-ai/skillsbench # GitHub org/repo
55
+ path: tasks # optional subpath within repo
56
+ ref: main # optional branch/tag
57
+ agent: claude-agent-acp
58
+ model: claude-sonnet-4-6
59
+ ```
60
+
61
+ Run any benchmark via the CLI:
62
+
63
+ ```bash
64
+ # From a YAML config
65
+ bench eval create --config benchmarks/skillsbench-claude-glm51.yaml
66
+
67
+ # Inline — mirrors the YAML source fields
68
+ bench eval create \
69
+ --source-repo benchflow-ai/skillsbench --source-path tasks \
70
+ --agent gemini --model gemini-3.1-flash-lite-preview --sandbox daytona --concurrency 64
71
+ ```
72
+
73
+ Repos are cloned and cached locally under `.cache/datasets/` on first use.
74
+
75
+ SkillsBench itself sources BenchFlow from GitHub `main` in its
76
+ [`pyproject.toml`](https://github.com/benchflow-ai/skillsbench/blob/main/pyproject.toml).
77
+ After a BenchFlow change lands, run `uv lock --upgrade-package benchflow` in
78
+ SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
79
+
80
+ ## Featured
81
+
82
+ - **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). It is also BenchFlow's parity answer to [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) for the no-second-LLM case. See [Progressive disclosure](./docs/progressive-disclosure.md).
83
+
84
+ ## Research artifacts
85
+
86
+ Two runnable labs validate the security story:
87
+
88
+ - [`labs/benchjack-sandbox-hardening/`](./labs/benchjack-sandbox-hardening/) — end-to-end demo that 0.2.1+ blocks three [BenchJack](https://rdi.berkeley.edu/blog/trustworthy-benchmarks-cont/) exploits that flip 0.2.0's reward from 0.0 to 1.0.
89
+ - [`labs/reward-hack-matrix/`](./labs/reward-hack-matrix/) — full reward-hack sweep across real benchmarks comparing 0.2.0 vs 0.2.2.
90
+
91
+ ## Audience
92
+
93
+ - **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
94
+ - **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
95
+ - **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
96
+ - **Existing Harbor users migrating** → [Use cases — migration section](./docs/use-cases.md#migration-from-harbor) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
97
+
98
+ ## Contributing
99
+
100
+ PRs welcome. Open against `main`. CI runs ruff + tests on every PR; please run `ruff check .` and `pytest tests/` locally first.
101
+
102
+ For a release: bump `pyproject.toml` to the next stable version, tag `v<version>` on main, push the tag — CI publishes to PyPI. Then bump main to the next `.dev0`.
103
+
104
+ ## License
105
+
106
+ Apache-2.0.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchflow"
3
- version = "0.3.2"
3
+ version = "0.3.4"
4
4
  description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -42,6 +42,9 @@ dev = [
42
42
  "ruff>=0.7.0",
43
43
  "ty>=0.0.1a1",
44
44
  ]
45
+ bedrock = [
46
+ "boto3>=1.40",
47
+ ]
45
48
 
46
49
  [project.scripts]
47
50
  benchflow = "benchflow.cli.main:app"
@@ -71,14 +74,16 @@ only-include = [
71
74
 
72
75
  [tool.pytest.ini_options]
73
76
  asyncio_mode = "auto"
74
- addopts = "-m 'not live'"
77
+ addopts = "-m 'not live and not integration'"
75
78
  testpaths = ["tests"]
76
79
  markers = [
77
80
  "live: requires real Anthropic API and Docker daemon (run with -m live)",
81
+ "integration: full integration tests — requires GEMINI_API_KEY + DAYTONA_API_KEY (run with -m integration)",
78
82
  ]
79
83
 
80
84
  [tool.ruff]
81
85
  target-version = "py312"
86
+ extend-exclude = [".claude/skills/skill-creator"]
82
87
 
83
88
  [tool.ruff.lint]
84
89
  select = [
@@ -96,6 +101,16 @@ ignore = [
96
101
  "RUF022", # __all__ unsorted — grouped by section for agent-friendliness
97
102
  ]
98
103
 
104
+ [tool.ruff.lint.per-file-ignores]
105
+ # Standalone scripts — sys.path manipulation before imports is intentional
106
+ "experiments/*.py" = ["E402"]
107
+ "tests/conformance/*.py" = ["E402"]
108
+ # Notebooks: cell-local imports + short loop vars are notebook conventions
109
+ "docs/examples/*.ipynb" = ["E402", "E741", "SIM115"]
110
+ # Forward references resolved via __future__ annotations — ruff flags them
111
+ # but they work at runtime; explicit TYPE_CHECKING imports would force eager loads.
112
+ "src/benchflow/runtime.py" = ["F821"]
113
+
99
114
  [tool.ty.environment]
100
115
  python-version = "3.12"
101
116
 
@@ -3,8 +3,8 @@
3
3
  Re-exports environment APIs and adds:
4
4
  - ACP client for multi-turn agent communication
5
5
  - Trajectory capture (HTTP proxy, OTel collector, ACP native)
6
- - SDK for programmatic usage
7
- - Job orchestration with retries and concurrency
6
+ - Rollout lifecycle for single-task execution
7
+ - Evaluation orchestration with retries and concurrency
8
8
  - Metrics collection and aggregation
9
9
  """
10
10
 
@@ -19,15 +19,24 @@ from harbor import (
19
19
  ExecResult,
20
20
  Task,
21
21
  TaskConfig,
22
- Trial,
23
22
  Verifier,
24
23
  VerifierResult,
25
24
  )
26
25
 
27
26
  # benchflow's additions
28
27
  from benchflow._env_setup import stage_dockerfile_deps
28
+ from benchflow._scene import MailboxTransport, Message, MessageTransport, SceneRole
29
+ from benchflow._scene import Scene as SceneRuntime
30
+ from benchflow._snapshot import list_snapshots, restore, snapshot
31
+ from benchflow._types import Role, Scene, Turn
29
32
  from benchflow.acp.client import ACPClient
30
33
  from benchflow.acp.session import ACPSession
34
+ from benchflow.adapters import (
35
+ InspectAdapter,
36
+ ORSAdapter,
37
+ to_inspect_task,
38
+ to_ors_reward,
39
+ )
31
40
  from benchflow.agents.registry import (
32
41
  AGENTS,
33
42
  get_agent,
@@ -42,27 +51,61 @@ from benchflow.environments import (
42
51
  detect_services_from_dockerfile,
43
52
  register_service,
44
53
  )
45
- from benchflow.job import Job, JobConfig, JobResult, RetryConfig
54
+ from benchflow.evaluation import (
55
+ Evaluation,
56
+ EvaluationConfig,
57
+ EvaluationResult,
58
+ RetryConfig,
59
+ )
46
60
  from benchflow.metrics import BenchmarkMetrics, collect_metrics
47
- from benchflow.models import AgentInstallError, AgentTimeoutError, RunResult
61
+ from benchflow.models import AgentInstallError, AgentTimeoutError, RolloutResult
62
+
63
+ # Rewards protocol (v0.4 — composable Rubric + RewardFunc)
64
+ from benchflow.rewards import (
65
+ CodeExecRewardFunc,
66
+ Criterion,
67
+ JudgeConfig,
68
+ LLMJudgeRewardFunc,
69
+ RewardEvent,
70
+ RewardFunc,
71
+ Rubric,
72
+ RubricConfig,
73
+ ScoringConfig,
74
+ StringMatchRewardFunc,
75
+ TestRewardFunc,
76
+ VerifyResult,
77
+ load_rubric_toml,
78
+ )
79
+ from benchflow.rollout import Rollout, RolloutConfig
48
80
  from benchflow.runtime import (
49
81
  Agent,
50
82
  Environment,
51
83
  Runtime,
52
84
  RuntimeConfig,
53
85
  RuntimeResult,
54
- run, # bf.run(agent, env) — the primary 0.3 API
55
- )
56
- from benchflow._scene import MailboxTransport, Message, MessageTransport, Role, Scene
57
- from benchflow._snapshot import list_snapshots, restore, snapshot
86
+ run,
87
+ ) # bf.run() — supports Agent, RolloutConfig, and str calling conventions
88
+
89
+ # Sandbox protocol (v0.4 parallel types, Harbor not yet removed)
90
+ from benchflow.sandbox import ExecResult as SandboxExecResult
91
+ from benchflow.sandbox import ImageBuilder, ImageConfig, ImageRef, Sandbox
58
92
  from benchflow.sdk import SDK
59
- from benchflow.trial import Trial, TrialConfig
60
- from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn
61
- from benchflow.trial_yaml import trial_config_from_yaml
62
93
  from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
63
94
  from benchflow.trajectories.otel import OTelCollector
64
95
  from benchflow.trajectories.proxy import TrajectoryProxy
65
96
  from benchflow.trajectories.types import Trajectory
97
+ from benchflow.trial_yaml import trial_config_from_yaml
98
+ from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult
99
+
100
+ # Backward-compat aliases
101
+ Trial = Rollout
102
+ TrialConfig = RolloutConfig
103
+ TrialRole = Role
104
+ TrialScene = Scene
105
+ RunResult = RolloutResult
106
+ Job = Evaluation
107
+ JobConfig = EvaluationConfig
108
+ JobResult = EvaluationResult
66
109
 
67
110
  # Public API surface. Anything not in this list is implementation detail and
68
111
  # may change without notice. Names are grouped by source module to match the
@@ -70,6 +113,27 @@ from benchflow.trajectories.types import Trajectory
70
113
  # what.
71
114
  __all__ = [
72
115
  "__version__",
116
+ # Rewards protocol (v0.4)
117
+ "Rubric",
118
+ "RewardFunc",
119
+ "RewardEvent",
120
+ "VerifyResult",
121
+ "TestRewardFunc",
122
+ "LLMJudgeRewardFunc",
123
+ "StringMatchRewardFunc",
124
+ "CodeExecRewardFunc",
125
+ # Rubric config (ENG-55)
126
+ "Criterion",
127
+ "JudgeConfig",
128
+ "RubricConfig",
129
+ "ScoringConfig",
130
+ "load_rubric_toml",
131
+ # Sandbox protocol (v0.4)
132
+ "Sandbox",
133
+ "SandboxExecResult",
134
+ "ImageBuilder",
135
+ "ImageConfig",
136
+ "ImageRef",
73
137
  # Harbor re-exports
74
138
  "BaseAgent",
75
139
  "BaseEnvironment",
@@ -88,28 +152,38 @@ __all__ = [
88
152
  "is_vertex_model",
89
153
  "list_agents",
90
154
  "register_agent",
91
- # Job orchestration
155
+ # Evaluation orchestration (new names)
156
+ "Evaluation",
157
+ "EvaluationConfig",
158
+ "EvaluationResult",
159
+ "RetryConfig",
160
+ # Backward-compat aliases for Job
92
161
  "Job",
93
162
  "JobConfig",
94
163
  "JobResult",
95
- "RetryConfig",
96
164
  # Metrics
97
165
  "BenchmarkMetrics",
98
166
  "collect_metrics",
99
167
  # Models / errors
100
168
  "AgentInstallError",
101
169
  "AgentTimeoutError",
170
+ "RolloutResult",
102
171
  "RunResult",
103
- # Runtime (0.3 primary API)
172
+ # Runtime (0.3 compat)
104
173
  "Agent",
105
174
  "Environment",
106
175
  "Runtime",
107
176
  "RuntimeConfig",
108
177
  "RuntimeResult",
178
+ # Single entry point
109
179
  "run",
110
- # Multi-agent scene
111
- "Scene",
180
+ # Canonical declarative types (_types.py — ENG-47)
112
181
  "Role",
182
+ "Scene",
183
+ "Turn",
184
+ # Multi-agent scene runtime
185
+ "SceneRole",
186
+ "SceneRuntime",
113
187
  "Message",
114
188
  "MessageTransport",
115
189
  "MailboxTransport",
@@ -117,12 +191,20 @@ __all__ = [
117
191
  "snapshot",
118
192
  "restore",
119
193
  "list_snapshots",
120
- # Trial (decomposed lifecycle)
194
+ # Rollout (single execution path — ENG-46)
195
+ "Rollout",
196
+ "RolloutConfig",
197
+ # Backward-compat aliases for Trial
121
198
  "Trial",
122
199
  "TrialConfig",
123
200
  "TrialRole",
124
201
  "TrialScene",
125
- "Turn",
202
+ "trial_config_from_yaml",
203
+ # User abstraction (progressive disclosure)
204
+ "BaseUser",
205
+ "FunctionUser",
206
+ "PassthroughUser",
207
+ "RoundResult",
126
208
  # SDK (backwards compat)
127
209
  "SDK",
128
210
  # Environments / dep staging
@@ -140,11 +222,25 @@ __all__ = [
140
222
  "OTelCollector",
141
223
  "TrajectoryProxy",
142
224
  "Trajectory",
225
+ # External adapters (ENG-51)
226
+ "InspectAdapter",
227
+ "ORSAdapter",
228
+ "to_inspect_task",
229
+ "to_ors_reward",
143
230
  ]
144
231
 
145
232
 
146
233
  def __getattr__(name: str):
147
234
  """Fall through to harbor for names not explicitly re-exported."""
235
+ # Let Python's normal submodule resolution handle subpackages first.
236
+ import importlib
237
+
238
+ try:
239
+ return importlib.import_module(f"benchflow.{name}")
240
+ except ModuleNotFoundError as e:
241
+ if e.name != f"benchflow.{name}":
242
+ raise
243
+
148
244
  import harbor
149
245
 
150
246
  if hasattr(harbor, name):