benchflow 0.3.2__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.3.2 → benchflow-0.3.4}/.gitignore +2 -1
- {benchflow-0.3.2 → benchflow-0.3.4}/CHANGELOG.md +40 -1
- benchflow-0.3.4/PKG-INFO +143 -0
- benchflow-0.3.4/README.md +106 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/pyproject.toml +17 -2
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/__init__.py +115 -19
- benchflow-0.3.4/src/benchflow/_acp_run.py +358 -0
- benchflow-0.3.4/src/benchflow/_agent_env.py +368 -0
- benchflow-0.3.4/src/benchflow/_agent_setup.py +268 -0
- benchflow-0.3.4/src/benchflow/_daytona_patches.py +103 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_env_setup.py +241 -7
- benchflow-0.3.4/src/benchflow/_provider_runtime.py +172 -0
- benchflow-0.3.4/src/benchflow/_run.py +34 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_sandbox.py +247 -131
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_scene.py +31 -5
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_snapshot.py +19 -9
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_trajectory.py +32 -13
- benchflow-0.3.4/src/benchflow/_types.py +94 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/client.py +7 -2
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/container_transport.py +9 -10
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/session.py +45 -4
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/transport.py +30 -5
- benchflow-0.3.4/src/benchflow/adapters/__init__.py +25 -0
- benchflow-0.3.4/src/benchflow/adapters/inspect_ai.py +63 -0
- benchflow-0.3.4/src/benchflow/adapters/ors.py +68 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/__init__.py +0 -4
- benchflow-0.3.4/src/benchflow/agents/harvey_lab_acp_shim.py +606 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/openclaw_acp_shim.py +5 -4
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/pi_acp_launcher.py +17 -4
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/providers.py +27 -14
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/agents/registry.py +270 -86
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/cli/main.py +305 -219
- benchflow-0.3.2/src/benchflow/job.py → benchflow-0.3.4/src/benchflow/evaluation.py +150 -108
- benchflow-0.3.4/src/benchflow/job.py +29 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/mcp/reviewer_server.py +4 -6
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/metrics.py +12 -3
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/models.py +20 -7
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/process.py +168 -4
- benchflow-0.3.4/src/benchflow/providers/__init__.py +25 -0
- benchflow-0.3.4/src/benchflow/providers/bedrock_proxy.py +534 -0
- benchflow-0.3.4/src/benchflow/providers/bedrock_runtime.py +665 -0
- benchflow-0.3.4/src/benchflow/rewards/README.md +125 -0
- benchflow-0.3.4/src/benchflow/rewards/__init__.py +34 -0
- benchflow-0.3.4/src/benchflow/rewards/builtins.py +471 -0
- benchflow-0.3.4/src/benchflow/rewards/events.py +26 -0
- benchflow-0.3.4/src/benchflow/rewards/file_readers.py +142 -0
- benchflow-0.3.4/src/benchflow/rewards/llm.py +186 -0
- benchflow-0.3.4/src/benchflow/rewards/protocol.py +33 -0
- benchflow-0.3.4/src/benchflow/rewards/rubric.py +76 -0
- benchflow-0.3.4/src/benchflow/rewards/rubric_config.py +127 -0
- benchflow-0.3.4/src/benchflow/rollout.py +1766 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/runtime.py +54 -33
- benchflow-0.3.4/src/benchflow/sandbox/__init__.py +9 -0
- benchflow-0.3.4/src/benchflow/sandbox/daytona.py +74 -0
- benchflow-0.3.4/src/benchflow/sandbox/docker.py +74 -0
- benchflow-0.3.4/src/benchflow/sandbox/protocol.py +74 -0
- benchflow-0.3.4/src/benchflow/sdk.py +193 -0
- benchflow-0.3.4/src/benchflow/self_gen.py +151 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/skill_eval.py +35 -15
- benchflow-0.3.4/src/benchflow/task_download.py +161 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/tasks.py +1 -1
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trajectories/__init__.py +0 -5
- benchflow-0.3.4/src/benchflow/trial.py +39 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trial_yaml.py +20 -8
- benchflow-0.3.4/src/benchflow/user.py +101 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/viewer.py +21 -9
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/README.md +3 -3
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/proof_multi_agent.py +5 -5
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/run_conformance.py +36 -5
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conftest.py +6 -1
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/test_claude.sh +8 -8
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/test_codex.sh +68 -13
- benchflow-0.3.4/tests/examples/test_codex_custom_provider.sh +99 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/test_gemini.sh +7 -7
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/test_openclaw.sh +9 -9
- benchflow-0.3.4/tests/fixtures/mock_acp_agent_multi_turn.py +162 -0
- benchflow-0.3.4/tests/fixtures/mock_openai_responses_server.py +98 -0
- benchflow-0.3.4/tests/integration/check_results.py +179 -0
- benchflow-0.3.4/tests/integration/configs/claude-agent-acp.yaml +23 -0
- benchflow-0.3.4/tests/integration/configs/codex-acp.yaml +23 -0
- benchflow-0.3.4/tests/integration/configs/gemini.yaml +23 -0
- benchflow-0.3.4/tests/integration/configs/harvey-lab-harness.yaml +23 -0
- benchflow-0.3.4/tests/integration/configs/openclaw.yaml +23 -0
- benchflow-0.3.4/tests/integration/configs/opencode.yaml +23 -0
- benchflow-0.3.4/tests/integration/configs/openhands.yaml +23 -0
- benchflow-0.3.4/tests/integration/configs/pi-acp.yaml +23 -0
- benchflow-0.3.4/tests/integration/run.sh +170 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_acp.py +208 -6
- benchflow-0.3.4/tests/test_adapters.py +218 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_agent_registry.py +42 -0
- benchflow-0.3.4/tests/test_agent_setup.py +461 -0
- benchflow-0.3.4/tests/test_bedrock_proxy.py +375 -0
- benchflow-0.3.4/tests/test_bedrock_runtime.py +405 -0
- benchflow-0.3.4/tests/test_capture_trajectory.py +735 -0
- benchflow-0.3.4/tests/test_connect_as_env.py +142 -0
- benchflow-0.3.4/tests/test_eng50_capabilities.py +248 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_env_setup.py +142 -1
- benchflow-0.3.4/tests/test_internet_policy.py +417 -0
- benchflow-0.3.4/tests/test_llm_judge.py +502 -0
- benchflow-0.3.4/tests/test_mock_openai_responses_server.py +73 -0
- benchflow-0.3.4/tests/test_notification_order_real.py +126 -0
- benchflow-0.3.4/tests/test_oracle.py +126 -0
- benchflow-0.3.4/tests/test_oracle_chokepoint.py +224 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_pi_acp_launcher.py +30 -3
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_process.py +92 -0
- benchflow-0.3.4/tests/test_provider_runtime.py +224 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_providers.py +28 -1
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_reexport.py +1 -1
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_registry_invariants.py +85 -2
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_resolve_env_helpers.py +121 -2
- benchflow-0.3.4/tests/test_rewards.py +338 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_rewards_jsonl.py +1 -1
- benchflow-0.3.4/tests/test_rubric_config.py +175 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_runtime.py +8 -6
- benchflow-0.3.4/tests/test_sandbox.py +97 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_sandbox_hardening.py +285 -113
- benchflow-0.3.4/tests/test_sandbox_protocol.py +250 -0
- benchflow-0.3.4/tests/test_sandbox_setup.py +110 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_scene.py +4 -8
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_scene_outbox_trial.py +135 -12
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_sdk_internals.py +208 -11
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_sdk_lockdown.py +1 -1
- benchflow-0.3.4/tests/test_self_gen_cli.py +66 -0
- benchflow-0.3.4/tests/test_self_gen_orchestration.py +259 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_skill_eval.py +2 -2
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_skill_eval_dryrun.py +133 -41
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_skill_eval_integration.py +17 -7
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_smoke.py +2 -1
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_subscription_auth.py +5 -18
- benchflow-0.3.4/tests/test_task_download.py +185 -0
- benchflow-0.3.4/tests/test_trajectory_integration.py +261 -0
- benchflow-0.3.4/tests/test_trial_agent_timeout_verify.py +77 -0
- benchflow-0.3.4/tests/test_trial_bedrock_proxy.py +129 -0
- benchflow-0.3.4/tests/test_trial_install_agent_timeout.py +124 -0
- benchflow-0.3.4/tests/test_user.py +409 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_verify.py +16 -17
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_yaml_config.py +29 -0
- benchflow-0.3.2/PKG-INFO +0 -231
- benchflow-0.3.2/README.md +0 -196
- benchflow-0.3.2/src/benchflow/_acp_run.py +0 -152
- benchflow-0.3.2/src/benchflow/_agent_env.py +0 -205
- benchflow-0.3.2/src/benchflow/_agent_setup.py +0 -121
- benchflow-0.3.2/src/benchflow/agents/user_agent.py +0 -62
- benchflow-0.3.2/src/benchflow/cli/eval.py +0 -373
- benchflow-0.3.2/src/benchflow/sdk.py +0 -518
- benchflow-0.3.2/src/benchflow/task_download.py +0 -72
- benchflow-0.3.2/src/benchflow/trajectories/atif.py +0 -112
- benchflow-0.3.2/src/benchflow/trajectories/claude_code.py +0 -249
- benchflow-0.3.2/src/benchflow/trial.py +0 -788
- benchflow-0.3.2/tests/test_capture_trajectory.py +0 -135
- benchflow-0.3.2/tests/test_eval_cli.py +0 -118
- benchflow-0.3.2/tests/test_oracle.py +0 -63
- benchflow-0.3.2/tests/test_sandbox.py +0 -64
- {benchflow-0.3.2 → benchflow-0.3.4}/LICENSE +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_credentials.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/_scoring.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/environments.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/mcp/__init__.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/mcp/hooks.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/py.typed +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/skills.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/templates/test.sh.tmpl +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trajectories/proxy.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/__init__.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/task.toml +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/conformance/proof_snapshot.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_agent_spec.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_atif_trajectory.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_job.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_metrics.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_sandbox_verifier_workspace.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_scoring.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_skills.py +0 -0
- {benchflow-0.3.2 → benchflow-0.3.4}/tests/test_tasks.py +0 -0
|
@@ -2,6 +2,45 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
## 0.3.3 — 2026-05-15
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
|
|
9
|
+
- **Harvey LAB benchmark** — converter, agent shim, and parity validation for 1,251 legal AI tasks (#239).
|
|
10
|
+
- **Harvey LAB Claude Sonnet judge** — switched verifier from Gemini to `claude-sonnet-4-6`, matching the original benchmark default (#264).
|
|
11
|
+
- **ProgramBench integration** — new benchmark adapter; TB2 removed; `.ref/` migrated to `benchmarks/` (#237).
|
|
12
|
+
- **CLI progress output** — `bench eval create` / `bench run` now show progress messages by default (#264).
|
|
13
|
+
- **Skill nudge** — optional prompt injection for skill-enhanced agent runs (#207).
|
|
14
|
+
- **Self-generated skill mode** for Codex agent (#233).
|
|
15
|
+
- **Integration test suite** for ENG-6 + `OPENAI_BASE_URL` inheritance fix (#255).
|
|
16
|
+
- **Modal backend support** — Dockerfile compatibility for Modal environments.
|
|
17
|
+
- **CITATION.cff** (#246).
|
|
18
|
+
- **`AGENTS.md`** — canonical contributor guide; `CLAUDE.md` deprecated (#258).
|
|
19
|
+
|
|
20
|
+
### Changed
|
|
21
|
+
|
|
22
|
+
- **Two-field source pattern** for dataset sourcing (#252).
|
|
23
|
+
- **Docs overhaul** — synced from www.benchflow.ai; Mintlify config added then orphaned config removed (#259, #257, #226).
|
|
24
|
+
- **`uv sync`** for package management (#232).
|
|
25
|
+
|
|
26
|
+
### Fixed
|
|
27
|
+
|
|
28
|
+
- Prevent `TypeError` in `metrics.collect_metrics` when reward is `None` (#243).
|
|
29
|
+
- Copy eval `requirements.txt` into Docker build context (#245).
|
|
30
|
+
- Resolve agent aliases in `bench agent show` and display aliases in `bench agent list` (#251).
|
|
31
|
+
- Guard ACP transports against JSON scalar logs (#236).
|
|
32
|
+
- Agent timeout reward fallback for Codex (#234).
|
|
33
|
+
- Isolate JS agent runtime installs (#231).
|
|
34
|
+
- Route Codex ACP through responses API (#224).
|
|
35
|
+
- Deploy skills and forward `solution.env` for oracle runs (#223).
|
|
36
|
+
- Honor no-internet tasks for agent runs; disable web tools without prompt mutation (#215).
|
|
37
|
+
- Propagate `OPENAI_API_KEY` for vllm provider (#3).
|
|
38
|
+
- Preserve arrival order of thought/message within flush windows (#214).
|
|
39
|
+
- Record user messages and per-turn agent text in ACP trajectory (#745).
|
|
40
|
+
- Chown skill-link parent dirs so sandbox user can write into them.
|
|
41
|
+
- Dynamic `--rootdir` in `PYTEST_ADDOPTS` based on task workspace.
|
|
42
|
+
- Unique env-file path in `DaytonaPtyProcess` to avoid race conditions (#200).
|
|
43
|
+
|
|
5
44
|
## 0.2.3 — 2026-04-15
|
|
6
45
|
|
|
7
46
|
### Added
|
|
@@ -66,7 +105,7 @@
|
|
|
66
105
|
- **Vertex AI support** — ADC auth for `google-vertex/`, `anthropic-vertex/`, `vertex-zai/` prefixed models.
|
|
67
106
|
- **Provider registry** — add a new LLM endpoint via a dict entry in `providers.py`, no code changes.
|
|
68
107
|
- **`benchmarks/` directory** with reusable YAML configs and runner scripts for TB2 and SkillsBench.
|
|
69
|
-
- **Auto task download**
|
|
108
|
+
- **Auto task download** — YAML configs reference datasets as `org/repo/path` (e.g. `harbor-framework/terminal-bench-2`). Repos are cloned on first use and cached under `.cache/datasets/`.
|
|
70
109
|
- **`benchflow tasks init`** — scaffold new tasks.
|
|
71
110
|
- **`benchflow tasks check`** — validate task structure.
|
|
72
111
|
- **`benchflow cleanup`** — delete old sandboxes with `--max-age` filtering (default 24h).
|
benchflow-0.3.4/PKG-INFO
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: benchflow
|
|
3
|
+
Version: 0.3.4
|
|
4
|
+
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
|
+
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
|
+
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
7
|
+
Project-URL: Issues, https://github.com/benchflow-ai/benchflow/issues
|
|
8
|
+
Project-URL: Discord, https://discord.gg/mZ9Rc8q8W3
|
|
9
|
+
Project-URL: Changelog, https://github.com/benchflow-ai/benchflow/blob/main/CHANGELOG.md
|
|
10
|
+
Author-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
|
|
11
|
+
Maintainer-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
|
|
12
|
+
License: Apache-2.0
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench,terminal-bench
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Requires-Dist: anyio>=4.0
|
|
22
|
+
Requires-Dist: harbor==0.3.0
|
|
23
|
+
Requires-Dist: httpx>=0.27.0
|
|
24
|
+
Requires-Dist: pydantic>=2.0
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Requires-Dist: rich>=13.0
|
|
27
|
+
Requires-Dist: typer>=0.9
|
|
28
|
+
Provides-Extra: bedrock
|
|
29
|
+
Requires-Dist: boto3>=1.40; extra == 'bedrock'
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest>=9.0.3; extra == 'dev'
|
|
34
|
+
Requires-Dist: ruff>=0.7.0; extra == 'dev'
|
|
35
|
+
Requires-Dist: ty>=0.0.1a1; extra == 'dev'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
<div align="center">
|
|
39
|
+
<h1>BenchFlow</h1>
|
|
40
|
+
<p>Multi-turn agent benchmarking — Scene-based lifecycle for any ACP agent</p>
|
|
41
|
+
<a href="https://pypi.org/project/benchflow/" target="_blank">
|
|
42
|
+
<img src="https://img.shields.io/pypi/v/benchflow?style=for-the-badge&logo=pypi" alt="PyPI">
|
|
43
|
+
</a>
|
|
44
|
+
<a href="https://discord.gg/mZ9Rc8q8W3" target="_blank">
|
|
45
|
+
<img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
|
|
46
|
+
</a>
|
|
47
|
+
</div>
|
|
48
|
+
|
|
49
|
+
## What
|
|
50
|
+
|
|
51
|
+
BenchFlow runs AI agents against benchmark tasks in sandboxed environments. Single-agent, multi-agent, and multi-round patterns share one Scene-based lifecycle.
|
|
52
|
+
|
|
53
|
+
- **Any ACP agent** — Gemini CLI, Claude Code, Codex, OpenCode, OpenHands, OpenClaw, Pi, or your own
|
|
54
|
+
- **Single + multi + progressive** — single-agent / multi-agent (coder + reviewer, simulated user) / multi-round with a Python `BaseUser` callback
|
|
55
|
+
- **Sandboxes** — Docker locally, Daytona for parallel cloud runs, Modal for serverless/GPU-backed task environments
|
|
56
|
+
- **Hardened verifier** — defaults block BenchJack/Meerkat-style reward-hacking; tasks opt out per-feature
|
|
57
|
+
|
|
58
|
+
## Install
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
uv tool install benchflow
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth.
|
|
65
|
+
|
|
66
|
+
## Documentation
|
|
67
|
+
|
|
68
|
+
Start with [Getting started](./docs/getting-started.md), then [Concepts](./docs/concepts.md) for the mental model. Then by goal:
|
|
69
|
+
|
|
70
|
+
| If you want to… | Read |
|
|
71
|
+
|------------------|------|
|
|
72
|
+
| Run an eval on an existing task | [Getting started](./docs/getting-started.md) |
|
|
73
|
+
| Understand Rollout / Scene / Role / Verifier | [Concepts](./docs/concepts.md) |
|
|
74
|
+
| Author a new task | [Task authoring](./docs/task-authoring.md) |
|
|
75
|
+
| Multi-agent: coder + reviewer, simulated user, BYOS, stateful envs | [Use cases](./docs/use-cases.md) |
|
|
76
|
+
| Multi-round single-agent (progressive disclosure, oracle access) | [Progressive disclosure](./docs/progressive-disclosure.md) |
|
|
77
|
+
| Skill evaluation (when the artifact is a skill, not a workspace) | [Skill eval](./docs/skill-eval.md) |
|
|
78
|
+
| Understand the security model | [Sandbox hardening](./docs/sandbox-hardening.md) |
|
|
79
|
+
| CLI flags + commands | [CLI reference](./docs/reference/cli.md) |
|
|
80
|
+
| Python API surface | [Python API reference](./docs/reference/python-api.md) |
|
|
81
|
+
|
|
82
|
+
Notebooks and runnable example scripts live under [`docs/examples/`](./docs/examples/) so examples stay versioned with the docs that explain them.
|
|
83
|
+
|
|
84
|
+
## Benchmark task sources
|
|
85
|
+
|
|
86
|
+
Benchmark datasets live in external Git repos and are referenced with two fields:
|
|
87
|
+
|
|
88
|
+
```yaml
|
|
89
|
+
# benchmarks/skillsbench-claude-glm51.yaml
|
|
90
|
+
source:
|
|
91
|
+
repo: benchflow-ai/skillsbench # GitHub org/repo
|
|
92
|
+
path: tasks # optional subpath within repo
|
|
93
|
+
ref: main # optional branch/tag
|
|
94
|
+
agent: claude-agent-acp
|
|
95
|
+
model: claude-sonnet-4-6
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Run any benchmark via the CLI:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# From a YAML config
|
|
102
|
+
bench eval create --config benchmarks/skillsbench-claude-glm51.yaml
|
|
103
|
+
|
|
104
|
+
# Inline — mirrors the YAML source fields
|
|
105
|
+
bench eval create \
|
|
106
|
+
--source-repo benchflow-ai/skillsbench --source-path tasks \
|
|
107
|
+
--agent gemini --model gemini-3.1-flash-lite-preview --sandbox daytona --concurrency 64
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Repos are cloned and cached locally under `.cache/datasets/` on first use.
|
|
111
|
+
|
|
112
|
+
SkillsBench itself sources BenchFlow from GitHub `main` in its
|
|
113
|
+
[`pyproject.toml`](https://github.com/benchflow-ai/skillsbench/blob/main/pyproject.toml).
|
|
114
|
+
After a BenchFlow change lands, run `uv lock --upgrade-package benchflow` in
|
|
115
|
+
SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
|
|
116
|
+
|
|
117
|
+
## Featured
|
|
118
|
+
|
|
119
|
+
- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). Also benchflow's [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) parity answer for the no-second-LLM case. See [Progressive disclosure](./docs/progressive-disclosure.md).
|
|
120
|
+
|
|
121
|
+
## Research artifacts
|
|
122
|
+
|
|
123
|
+
Two runnable labs validate the security story:
|
|
124
|
+
|
|
125
|
+
- [`labs/benchjack-sandbox-hardening/`](./labs/benchjack-sandbox-hardening/) — end-to-end demo that 0.2.1+ blocks three [BenchJack](https://rdi.berkeley.edu/blog/trustworthy-benchmarks-cont/) exploits that flip 0.2.0's reward from 0.0 to 1.0.
|
|
126
|
+
- [`labs/reward-hack-matrix/`](./labs/reward-hack-matrix/) — full reward-hack sweep across real benchmarks comparing 0.2.0 vs 0.2.2.
|
|
127
|
+
|
|
128
|
+
## Audience
|
|
129
|
+
|
|
130
|
+
- **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
|
|
131
|
+
- **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
|
|
132
|
+
- **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
|
|
133
|
+
- **Existing Harbor users migrating** → [Use cases — migration section](./docs/use-cases.md#migration-from-harbor) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
|
|
134
|
+
|
|
135
|
+
## Contributing
|
|
136
|
+
|
|
137
|
+
PRs welcome. Open against `main`. CI runs ruff + tests on every PR; please run `ruff check .` and `pytest tests/` locally first.
|
|
138
|
+
|
|
139
|
+
For a release: bump `pyproject.toml` to the next stable version, tag `v<version>` on main, push the tag — CI publishes to PyPI. Then bump main to the next `.dev0`.
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
Apache-2.0.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<h1>BenchFlow</h1>
|
|
3
|
+
<p>Multi-turn agent benchmarking — Scene-based lifecycle for any ACP agent</p>
|
|
4
|
+
<a href="https://pypi.org/project/benchflow/" target="_blank">
|
|
5
|
+
<img src="https://img.shields.io/pypi/v/benchflow?style=for-the-badge&logo=pypi" alt="PyPI">
|
|
6
|
+
</a>
|
|
7
|
+
<a href="https://discord.gg/mZ9Rc8q8W3" target="_blank">
|
|
8
|
+
<img src="https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Discord">
|
|
9
|
+
</a>
|
|
10
|
+
</div>
|
|
11
|
+
|
|
12
|
+
## What
|
|
13
|
+
|
|
14
|
+
BenchFlow runs AI agents against benchmark tasks in sandboxed environments. Single-agent, multi-agent, and multi-round patterns share one Scene-based lifecycle.
|
|
15
|
+
|
|
16
|
+
- **Any ACP agent** — Gemini CLI, Claude Code, Codex, OpenCode, OpenHands, OpenClaw, Pi, or your own
|
|
17
|
+
- **Single + multi + progressive** — single-agent / multi-agent (coder + reviewer, simulated user) / multi-round with a Python `BaseUser` callback
|
|
18
|
+
- **Sandboxes** — Docker locally, Daytona for parallel cloud runs, Modal for serverless/GPU-backed task environments
|
|
19
|
+
- **Hardened verifier** — defaults block BenchJack/Meerkat-style reward-hacking; tasks opt out per-feature
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
uv tool install benchflow
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth.
|
|
28
|
+
|
|
29
|
+
## Documentation
|
|
30
|
+
|
|
31
|
+
Start with [Getting started](./docs/getting-started.md), then [Concepts](./docs/concepts.md) for the mental model. Then by goal:
|
|
32
|
+
|
|
33
|
+
| If you want to… | Read |
|
|
34
|
+
|------------------|------|
|
|
35
|
+
| Run an eval on an existing task | [Getting started](./docs/getting-started.md) |
|
|
36
|
+
| Understand Rollout / Scene / Role / Verifier | [Concepts](./docs/concepts.md) |
|
|
37
|
+
| Author a new task | [Task authoring](./docs/task-authoring.md) |
|
|
38
|
+
| Multi-agent: coder + reviewer, simulated user, BYOS, stateful envs | [Use cases](./docs/use-cases.md) |
|
|
39
|
+
| Multi-round single-agent (progressive disclosure, oracle access) | [Progressive disclosure](./docs/progressive-disclosure.md) |
|
|
40
|
+
| Skill evaluation (when the artifact is a skill, not a workspace) | [Skill eval](./docs/skill-eval.md) |
|
|
41
|
+
| Understand the security model | [Sandbox hardening](./docs/sandbox-hardening.md) |
|
|
42
|
+
| CLI flags + commands | [CLI reference](./docs/reference/cli.md) |
|
|
43
|
+
| Python API surface | [Python API reference](./docs/reference/python-api.md) |
|
|
44
|
+
|
|
45
|
+
Notebooks and runnable example scripts live under [`docs/examples/`](./docs/examples/) so examples stay versioned with the docs that explain them.
|
|
46
|
+
|
|
47
|
+
## Benchmark task sources
|
|
48
|
+
|
|
49
|
+
Benchmark datasets live in external Git repos and are referenced with two fields:
|
|
50
|
+
|
|
51
|
+
```yaml
|
|
52
|
+
# benchmarks/skillsbench-claude-glm51.yaml
|
|
53
|
+
source:
|
|
54
|
+
repo: benchflow-ai/skillsbench # GitHub org/repo
|
|
55
|
+
path: tasks # optional subpath within repo
|
|
56
|
+
ref: main # optional branch/tag
|
|
57
|
+
agent: claude-agent-acp
|
|
58
|
+
model: claude-sonnet-4-6
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Run any benchmark via the CLI:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# From a YAML config
|
|
65
|
+
bench eval create --config benchmarks/skillsbench-claude-glm51.yaml
|
|
66
|
+
|
|
67
|
+
# Inline — mirrors the YAML source fields
|
|
68
|
+
bench eval create \
|
|
69
|
+
--source-repo benchflow-ai/skillsbench --source-path tasks \
|
|
70
|
+
--agent gemini --model gemini-3.1-flash-lite-preview --sandbox daytona --concurrency 64
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Repos are cloned and cached locally under `.cache/datasets/` on first use.
|
|
74
|
+
|
|
75
|
+
SkillsBench itself sources BenchFlow from GitHub `main` in its
|
|
76
|
+
[`pyproject.toml`](https://github.com/benchflow-ai/skillsbench/blob/main/pyproject.toml).
|
|
77
|
+
After a BenchFlow change lands, run `uv lock --upgrade-package benchflow` in
|
|
78
|
+
SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
|
|
79
|
+
|
|
80
|
+
## Featured
|
|
81
|
+
|
|
82
|
+
- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). Also benchflow's [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) parity answer for the no-second-LLM case. See [Progressive disclosure](./docs/progressive-disclosure.md).
|
|
83
|
+
|
|
84
|
+
## Research artifacts
|
|
85
|
+
|
|
86
|
+
Two runnable labs validate the security story:
|
|
87
|
+
|
|
88
|
+
- [`labs/benchjack-sandbox-hardening/`](./labs/benchjack-sandbox-hardening/) — end-to-end demo that 0.2.1+ blocks three [BenchJack](https://rdi.berkeley.edu/blog/trustworthy-benchmarks-cont/) exploits that flip 0.2.0's reward from 0.0 to 1.0.
|
|
89
|
+
- [`labs/reward-hack-matrix/`](./labs/reward-hack-matrix/) — full reward-hack sweep across real benchmarks comparing 0.2.0 vs 0.2.2.
|
|
90
|
+
|
|
91
|
+
## Audience
|
|
92
|
+
|
|
93
|
+
- **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
|
|
94
|
+
- **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
|
|
95
|
+
- **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
|
|
96
|
+
- **Existing Harbor users migrating** → [Use cases — migration section](./docs/use-cases.md#migration-from-harbor) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
|
|
97
|
+
|
|
98
|
+
## Contributing
|
|
99
|
+
|
|
100
|
+
PRs welcome. Open against `main`. CI runs ruff + tests on every PR; please run `ruff check .` and `pytest tests/` locally first.
|
|
101
|
+
|
|
102
|
+
For a release: bump `pyproject.toml` to the next stable version, tag `v<version>` on main, push the tag — CI publishes to PyPI. Then bump main to the next `.dev0`.
|
|
103
|
+
|
|
104
|
+
## License
|
|
105
|
+
|
|
106
|
+
Apache-2.0.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "benchflow"
|
|
3
|
-
version = "0.3.2"
|
|
3
|
+
version = "0.3.4"
|
|
4
4
|
description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -42,6 +42,9 @@ dev = [
|
|
|
42
42
|
"ruff>=0.7.0",
|
|
43
43
|
"ty>=0.0.1a1",
|
|
44
44
|
]
|
|
45
|
+
bedrock = [
|
|
46
|
+
"boto3>=1.40",
|
|
47
|
+
]
|
|
45
48
|
|
|
46
49
|
[project.scripts]
|
|
47
50
|
benchflow = "benchflow.cli.main:app"
|
|
@@ -71,14 +74,16 @@ only-include = [
|
|
|
71
74
|
|
|
72
75
|
[tool.pytest.ini_options]
|
|
73
76
|
asyncio_mode = "auto"
|
|
74
|
-
addopts = "-m 'not live'"
|
|
77
|
+
addopts = "-m 'not live and not integration'"
|
|
75
78
|
testpaths = ["tests"]
|
|
76
79
|
markers = [
|
|
77
80
|
"live: requires real Anthropic API and Docker daemon (run with -m live)",
|
|
81
|
+
"integration: full integration tests — requires GEMINI_API_KEY + DAYTONA_API_KEY (run with -m integration)",
|
|
78
82
|
]
|
|
79
83
|
|
|
80
84
|
[tool.ruff]
|
|
81
85
|
target-version = "py312"
|
|
86
|
+
extend-exclude = [".claude/skills/skill-creator"]
|
|
82
87
|
|
|
83
88
|
[tool.ruff.lint]
|
|
84
89
|
select = [
|
|
@@ -96,6 +101,16 @@ ignore = [
|
|
|
96
101
|
"RUF022", # __all__ unsorted — grouped by section for agent-friendliness
|
|
97
102
|
]
|
|
98
103
|
|
|
104
|
+
[tool.ruff.lint.per-file-ignores]
|
|
105
|
+
# Standalone scripts — sys.path manipulation before imports is intentional
|
|
106
|
+
"experiments/*.py" = ["E402"]
|
|
107
|
+
"tests/conformance/*.py" = ["E402"]
|
|
108
|
+
# Notebooks: cell-local imports + short loop vars are notebook conventions
|
|
109
|
+
"docs/examples/*.ipynb" = ["E402", "E741", "SIM115"]
|
|
110
|
+
# Forward references resolved via __future__ annotations — ruff flags them
|
|
111
|
+
# but they work at runtime; explicit TYPE_CHECKING imports would force eager loads.
|
|
112
|
+
"src/benchflow/runtime.py" = ["F821"]
|
|
113
|
+
|
|
99
114
|
[tool.ty.environment]
|
|
100
115
|
python-version = "3.12"
|
|
101
116
|
|
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
Re-exports environment APIs and adds:
|
|
4
4
|
- ACP client for multi-turn agent communication
|
|
5
5
|
- Trajectory capture (HTTP proxy, OTel collector, ACP native)
|
|
6
|
-
-
|
|
7
|
-
-
|
|
6
|
+
- Rollout lifecycle for single-task execution
|
|
7
|
+
- Evaluation orchestration with retries and concurrency
|
|
8
8
|
- Metrics collection and aggregation
|
|
9
9
|
"""
|
|
10
10
|
|
|
@@ -19,15 +19,24 @@ from harbor import (
|
|
|
19
19
|
ExecResult,
|
|
20
20
|
Task,
|
|
21
21
|
TaskConfig,
|
|
22
|
-
Trial,
|
|
23
22
|
Verifier,
|
|
24
23
|
VerifierResult,
|
|
25
24
|
)
|
|
26
25
|
|
|
27
26
|
# benchflow's additions
|
|
28
27
|
from benchflow._env_setup import stage_dockerfile_deps
|
|
28
|
+
from benchflow._scene import MailboxTransport, Message, MessageTransport, SceneRole
|
|
29
|
+
from benchflow._scene import Scene as SceneRuntime
|
|
30
|
+
from benchflow._snapshot import list_snapshots, restore, snapshot
|
|
31
|
+
from benchflow._types import Role, Scene, Turn
|
|
29
32
|
from benchflow.acp.client import ACPClient
|
|
30
33
|
from benchflow.acp.session import ACPSession
|
|
34
|
+
from benchflow.adapters import (
|
|
35
|
+
InspectAdapter,
|
|
36
|
+
ORSAdapter,
|
|
37
|
+
to_inspect_task,
|
|
38
|
+
to_ors_reward,
|
|
39
|
+
)
|
|
31
40
|
from benchflow.agents.registry import (
|
|
32
41
|
AGENTS,
|
|
33
42
|
get_agent,
|
|
@@ -42,27 +51,61 @@ from benchflow.environments import (
|
|
|
42
51
|
detect_services_from_dockerfile,
|
|
43
52
|
register_service,
|
|
44
53
|
)
|
|
45
|
-
from benchflow.
|
|
54
|
+
from benchflow.evaluation import (
|
|
55
|
+
Evaluation,
|
|
56
|
+
EvaluationConfig,
|
|
57
|
+
EvaluationResult,
|
|
58
|
+
RetryConfig,
|
|
59
|
+
)
|
|
46
60
|
from benchflow.metrics import BenchmarkMetrics, collect_metrics
|
|
47
|
-
from benchflow.models import AgentInstallError, AgentTimeoutError,
|
|
61
|
+
from benchflow.models import AgentInstallError, AgentTimeoutError, RolloutResult
|
|
62
|
+
|
|
63
|
+
# Rewards protocol (v0.4 — composable Rubric + RewardFunc)
|
|
64
|
+
from benchflow.rewards import (
|
|
65
|
+
CodeExecRewardFunc,
|
|
66
|
+
Criterion,
|
|
67
|
+
JudgeConfig,
|
|
68
|
+
LLMJudgeRewardFunc,
|
|
69
|
+
RewardEvent,
|
|
70
|
+
RewardFunc,
|
|
71
|
+
Rubric,
|
|
72
|
+
RubricConfig,
|
|
73
|
+
ScoringConfig,
|
|
74
|
+
StringMatchRewardFunc,
|
|
75
|
+
TestRewardFunc,
|
|
76
|
+
VerifyResult,
|
|
77
|
+
load_rubric_toml,
|
|
78
|
+
)
|
|
79
|
+
from benchflow.rollout import Rollout, RolloutConfig
|
|
48
80
|
from benchflow.runtime import (
|
|
49
81
|
Agent,
|
|
50
82
|
Environment,
|
|
51
83
|
Runtime,
|
|
52
84
|
RuntimeConfig,
|
|
53
85
|
RuntimeResult,
|
|
54
|
-
run,
|
|
55
|
-
)
|
|
56
|
-
|
|
57
|
-
|
|
86
|
+
run,
|
|
87
|
+
) # bf.run() — supports Agent, RolloutConfig, and str calling conventions
|
|
88
|
+
|
|
89
|
+
# Sandbox protocol (v0.4 — parallel types, Harbor not yet removed)
|
|
90
|
+
from benchflow.sandbox import ExecResult as SandboxExecResult
|
|
91
|
+
from benchflow.sandbox import ImageBuilder, ImageConfig, ImageRef, Sandbox
|
|
58
92
|
from benchflow.sdk import SDK
|
|
59
|
-
from benchflow.trial import Trial, TrialConfig
|
|
60
|
-
from benchflow.trial import Role as TrialRole, Scene as TrialScene, Turn
|
|
61
|
-
from benchflow.trial_yaml import trial_config_from_yaml
|
|
62
93
|
from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
|
|
63
94
|
from benchflow.trajectories.otel import OTelCollector
|
|
64
95
|
from benchflow.trajectories.proxy import TrajectoryProxy
|
|
65
96
|
from benchflow.trajectories.types import Trajectory
|
|
97
|
+
from benchflow.trial_yaml import trial_config_from_yaml
|
|
98
|
+
from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult
|
|
99
|
+
|
|
100
|
+
# Backward-compat aliases
|
|
101
|
+
Trial = Rollout
|
|
102
|
+
TrialConfig = RolloutConfig
|
|
103
|
+
TrialRole = Role
|
|
104
|
+
TrialScene = Scene
|
|
105
|
+
RunResult = RolloutResult
|
|
106
|
+
Job = Evaluation
|
|
107
|
+
JobConfig = EvaluationConfig
|
|
108
|
+
JobResult = EvaluationResult
|
|
66
109
|
|
|
67
110
|
# Public API surface. Anything not in this list is implementation detail and
|
|
68
111
|
# may change without notice. Names are grouped by source module to match the
|
|
@@ -70,6 +113,27 @@ from benchflow.trajectories.types import Trajectory
|
|
|
70
113
|
# what.
|
|
71
114
|
__all__ = [
|
|
72
115
|
"__version__",
|
|
116
|
+
# Rewards protocol (v0.4)
|
|
117
|
+
"Rubric",
|
|
118
|
+
"RewardFunc",
|
|
119
|
+
"RewardEvent",
|
|
120
|
+
"VerifyResult",
|
|
121
|
+
"TestRewardFunc",
|
|
122
|
+
"LLMJudgeRewardFunc",
|
|
123
|
+
"StringMatchRewardFunc",
|
|
124
|
+
"CodeExecRewardFunc",
|
|
125
|
+
# Rubric config (ENG-55)
|
|
126
|
+
"Criterion",
|
|
127
|
+
"JudgeConfig",
|
|
128
|
+
"RubricConfig",
|
|
129
|
+
"ScoringConfig",
|
|
130
|
+
"load_rubric_toml",
|
|
131
|
+
# Sandbox protocol (v0.4)
|
|
132
|
+
"Sandbox",
|
|
133
|
+
"SandboxExecResult",
|
|
134
|
+
"ImageBuilder",
|
|
135
|
+
"ImageConfig",
|
|
136
|
+
"ImageRef",
|
|
73
137
|
# Harbor re-exports
|
|
74
138
|
"BaseAgent",
|
|
75
139
|
"BaseEnvironment",
|
|
@@ -88,28 +152,38 @@ __all__ = [
|
|
|
88
152
|
"is_vertex_model",
|
|
89
153
|
"list_agents",
|
|
90
154
|
"register_agent",
|
|
91
|
-
#
|
|
155
|
+
# Evaluation orchestration (new names)
|
|
156
|
+
"Evaluation",
|
|
157
|
+
"EvaluationConfig",
|
|
158
|
+
"EvaluationResult",
|
|
159
|
+
"RetryConfig",
|
|
160
|
+
# Backward-compat aliases for Job
|
|
92
161
|
"Job",
|
|
93
162
|
"JobConfig",
|
|
94
163
|
"JobResult",
|
|
95
|
-
"RetryConfig",
|
|
96
164
|
# Metrics
|
|
97
165
|
"BenchmarkMetrics",
|
|
98
166
|
"collect_metrics",
|
|
99
167
|
# Models / errors
|
|
100
168
|
"AgentInstallError",
|
|
101
169
|
"AgentTimeoutError",
|
|
170
|
+
"RolloutResult",
|
|
102
171
|
"RunResult",
|
|
103
|
-
# Runtime (0.3
|
|
172
|
+
# Runtime (0.3 compat)
|
|
104
173
|
"Agent",
|
|
105
174
|
"Environment",
|
|
106
175
|
"Runtime",
|
|
107
176
|
"RuntimeConfig",
|
|
108
177
|
"RuntimeResult",
|
|
178
|
+
# Single entry point
|
|
109
179
|
"run",
|
|
110
|
-
#
|
|
111
|
-
"Scene",
|
|
180
|
+
# Canonical declarative types (_types.py — ENG-47)
|
|
112
181
|
"Role",
|
|
182
|
+
"Scene",
|
|
183
|
+
"Turn",
|
|
184
|
+
# Multi-agent scene runtime
|
|
185
|
+
"SceneRole",
|
|
186
|
+
"SceneRuntime",
|
|
113
187
|
"Message",
|
|
114
188
|
"MessageTransport",
|
|
115
189
|
"MailboxTransport",
|
|
@@ -117,12 +191,20 @@ __all__ = [
|
|
|
117
191
|
"snapshot",
|
|
118
192
|
"restore",
|
|
119
193
|
"list_snapshots",
|
|
120
|
-
#
|
|
194
|
+
# Rollout (single execution path — ENG-46)
|
|
195
|
+
"Rollout",
|
|
196
|
+
"RolloutConfig",
|
|
197
|
+
# Backward-compat aliases for Trial
|
|
121
198
|
"Trial",
|
|
122
199
|
"TrialConfig",
|
|
123
200
|
"TrialRole",
|
|
124
201
|
"TrialScene",
|
|
125
|
-
"
|
|
202
|
+
"trial_config_from_yaml",
|
|
203
|
+
# User abstraction (progressive disclosure)
|
|
204
|
+
"BaseUser",
|
|
205
|
+
"FunctionUser",
|
|
206
|
+
"PassthroughUser",
|
|
207
|
+
"RoundResult",
|
|
126
208
|
# SDK (backwards compat)
|
|
127
209
|
"SDK",
|
|
128
210
|
# Environments / dep staging
|
|
@@ -140,11 +222,25 @@ __all__ = [
|
|
|
140
222
|
"OTelCollector",
|
|
141
223
|
"TrajectoryProxy",
|
|
142
224
|
"Trajectory",
|
|
225
|
+
# External adapters (ENG-51)
|
|
226
|
+
"InspectAdapter",
|
|
227
|
+
"ORSAdapter",
|
|
228
|
+
"to_inspect_task",
|
|
229
|
+
"to_ors_reward",
|
|
143
230
|
]
|
|
144
231
|
|
|
145
232
|
|
|
146
233
|
def __getattr__(name: str):
|
|
147
234
|
"""Fall through to harbor for names not explicitly re-exported."""
|
|
235
|
+
# Let Python's normal submodule resolution handle subpackages first.
|
|
236
|
+
import importlib
|
|
237
|
+
|
|
238
|
+
try:
|
|
239
|
+
return importlib.import_module(f"benchflow.{name}")
|
|
240
|
+
except ModuleNotFoundError as e:
|
|
241
|
+
if e.name != f"benchflow.{name}":
|
|
242
|
+
raise
|
|
243
|
+
|
|
148
244
|
import harbor
|
|
149
245
|
|
|
150
246
|
if hasattr(harbor, name):
|