benchflow 0.3.4__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.3.4 → benchflow-0.4.0}/.gitignore +1 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/PKG-INFO +10 -5
- {benchflow-0.3.4 → benchflow-0.4.0}/README.md +2 -2
- {benchflow-0.3.4 → benchflow-0.4.0}/pyproject.toml +29 -3
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/__init__.py +39 -77
- benchflow-0.4.0/src/benchflow/_dotenv.py +47 -0
- benchflow-0.4.0/src/benchflow/_utils/__init__.py +11 -0
- benchflow-0.3.4/src/benchflow/task_download.py → benchflow-0.4.0/src/benchflow/_utils/benchmark_repos.py +5 -1
- benchflow-0.4.0/src/benchflow/_utils/config.py +22 -0
- benchflow-0.3.4/src/benchflow/trial_yaml.py → benchflow-0.4.0/src/benchflow/_utils/yaml_loader.py +18 -18
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/container_transport.py +1 -1
- benchflow-0.3.4/src/benchflow/_acp_run.py → benchflow-0.4.0/src/benchflow/acp/runtime.py +9 -9
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/transport.py +1 -1
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/adapters/ors.py +28 -8
- benchflow-0.3.4/src/benchflow/_credentials.py → benchflow-0.4.0/src/benchflow/agents/credentials.py +31 -6
- benchflow-0.3.4/src/benchflow/_agent_env.py → benchflow-0.4.0/src/benchflow/agents/env.py +125 -42
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/harvey_lab_acp_shim.py +144 -2
- benchflow-0.3.4/src/benchflow/_agent_setup.py → benchflow-0.4.0/src/benchflow/agents/install.py +3 -3
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/registry.py +59 -23
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/cli/main.py +401 -102
- benchflow-0.4.0/src/benchflow/cli/trace_import.py +383 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/evaluation.py +18 -17
- benchflow-0.4.0/src/benchflow/experimental/__init__.py +1 -0
- {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/experimental}/mcp/hooks.py +1 -1
- {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/experimental}/mcp/reviewer_server.py +1 -1
- benchflow-0.4.0/src/benchflow/hosted_env.py +408 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/metrics.py +1 -1
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/models.py +3 -3
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/builtins.py +6 -10
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/file_readers.py +1 -3
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/llm.py +2 -6
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/rubric_config.py +3 -3
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rollout.py +192 -113
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/runtime.py +39 -32
- benchflow-0.4.0/src/benchflow/sandbox/__init__.py +37 -0
- benchflow-0.4.0/src/benchflow/sandbox/_base.py +189 -0
- benchflow-0.4.0/src/benchflow/sandbox/_compose.py +9 -0
- benchflow-0.4.0/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +11 -0
- benchflow-0.4.0/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +6 -0
- benchflow-0.4.0/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +3 -0
- benchflow-0.4.0/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +4 -0
- benchflow-0.4.0/src/benchflow/sandbox/daytona.py +1077 -0
- benchflow-0.4.0/src/benchflow/sandbox/docker.py +465 -0
- benchflow-0.3.4/src/benchflow/_sandbox.py → benchflow-0.4.0/src/benchflow/sandbox/lockdown.py +1 -1
- benchflow-0.4.0/src/benchflow/sandbox/modal_impl.py +368 -0
- {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/sandbox}/process.py +5 -5
- benchflow-0.3.4/src/benchflow/_env_setup.py → benchflow-0.4.0/src/benchflow/sandbox/setup.py +72 -50
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/sdk.py +17 -14
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/self_gen.py +11 -11
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/skill_eval.py +97 -33
- benchflow-0.4.0/src/benchflow/task/__init__.py +69 -0
- benchflow-0.4.0/src/benchflow/task/config.py +235 -0
- benchflow-0.4.0/src/benchflow/task/env.py +41 -0
- benchflow-0.4.0/src/benchflow/task/paths.py +171 -0
- benchflow-0.4.0/src/benchflow/task/task.py +49 -0
- benchflow-0.4.0/src/benchflow/task/verifier.py +166 -0
- benchflow-0.4.0/src/benchflow/traces/__init__.py +28 -0
- benchflow-0.4.0/src/benchflow/traces/huggingface.py +540 -0
- benchflow-0.4.0/src/benchflow/traces/local.py +117 -0
- benchflow-0.4.0/src/benchflow/traces/models.py +108 -0
- benchflow-0.4.0/src/benchflow/traces/parsers.py +485 -0
- benchflow-0.4.0/src/benchflow/traces/task_gen.py +562 -0
- {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/trajectories}/viewer.py +18 -16
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/proof_multi_agent.py +22 -20
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/proof_snapshot.py +6 -7
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/run_conformance.py +6 -6
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conftest.py +18 -12
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_codex.sh +16 -7
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_codex_custom_provider.sh +1 -1
- benchflow-0.4.0/tests/integration/check_adapter_evidence.py +373 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/check_results.py +26 -6
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/run.sh +15 -8
- benchflow-0.4.0/tests/integration/run_suite.py +439 -0
- benchflow-0.4.0/tests/integration/suites/release.yaml +262 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_acp.py +21 -21
- benchflow-0.4.0/tests/test_adapter_scripts.py +33 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_adapters.py +19 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_agent_registry.py +1 -1
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_agent_setup.py +1 -1
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_agent_spec.py +9 -8
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_capture_trajectory.py +1 -1
- benchflow-0.4.0/tests/test_cli_daytona.py +89 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_connect_as_env.py +51 -8
- benchflow-0.4.0/tests/test_docs_examples.py +58 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_eng50_capabilities.py +50 -48
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_env_setup.py +39 -9
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_exclude_tasks.py +9 -9
- benchflow-0.4.0/tests/test_harvey_lab_shim.py +46 -0
- benchflow-0.4.0/tests/test_hosted_env.py +257 -0
- benchflow-0.4.0/tests/test_integration_check_results.py +110 -0
- benchflow-0.4.0/tests/test_integration_run_suite.py +261 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_internet_policy.py +19 -19
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_job.py +17 -15
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_llm_judge.py +7 -21
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_notification_order_real.py +1 -1
- benchflow-0.4.0/tests/test_oracle_chokepoint.py +469 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_process.py +3 -3
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_provider_runtime.py +2 -2
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_reexport.py +12 -14
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_resolve_env_helpers.py +67 -3
- benchflow-0.4.0/tests/test_rollout_upload.py +70 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_runtime.py +23 -4
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sandbox_hardening.py +92 -68
- benchflow-0.4.0/tests/test_sandbox_protocol.py +225 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sandbox_setup.py +1 -1
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sandbox_verifier_workspace.py +10 -10
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_scene.py +15 -15
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_scene_outbox_trial.py +52 -15
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_scoring.py +2 -2
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sdk_internals.py +120 -22
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sdk_lockdown.py +10 -10
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_self_gen_orchestration.py +15 -15
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_skill_eval.py +155 -1
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_skill_eval_dryrun.py +37 -6
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_skill_eval_integration.py +3 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_smoke.py +5 -5
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_subscription_auth.py +164 -2
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_task_download.py +10 -9
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_tasks.py +1 -1
- benchflow-0.4.0/tests/test_trace_import_cli.py +61 -0
- benchflow-0.4.0/tests/test_traces_parsers.py +474 -0
- benchflow-0.4.0/tests/test_traces_task_gen.py +514 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_trajectory_integration.py +4 -4
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_trial_agent_timeout_verify.py +7 -7
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_trial_bedrock_proxy.py +8 -8
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_trial_install_agent_timeout.py +15 -10
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_user.py +23 -20
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_verify.py +36 -14
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_yaml_config.py +60 -24
- benchflow-0.3.4/src/benchflow/job.py +0 -29
- benchflow-0.3.4/src/benchflow/sandbox/__init__.py +0 -9
- benchflow-0.3.4/src/benchflow/sandbox/daytona.py +0 -74
- benchflow-0.3.4/src/benchflow/sandbox/docker.py +0 -74
- benchflow-0.3.4/src/benchflow/trial.py +0 -39
- benchflow-0.3.4/tests/test_oracle_chokepoint.py +0 -224
- benchflow-0.3.4/tests/test_sandbox_protocol.py +0 -250
- {benchflow-0.3.4 → benchflow-0.4.0}/CHANGELOG.md +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/LICENSE +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/_run.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/_types.py +0 -0
- /benchflow-0.3.4/src/benchflow/_scoring.py → /benchflow-0.4.0/src/benchflow/_utils/scoring.py +0 -0
- /benchflow-0.3.4/src/benchflow/tasks.py → /benchflow-0.4.0/src/benchflow/_utils/task_authoring.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/client.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/adapters/__init__.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/adapters/inspect_ai.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/pi_acp_launcher.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/providers.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/experimental}/mcp/__init__.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/providers/__init__.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/providers/bedrock_proxy.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/providers/bedrock_runtime.py +0 -0
- /benchflow-0.3.4/src/benchflow/_provider_runtime.py → /benchflow-0.4.0/src/benchflow/providers/runtime.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/py.typed +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/README.md +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/__init__.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/events.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/protocol.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/rubric.py +0 -0
- /benchflow-0.3.4/src/benchflow/_daytona_patches.py → /benchflow-0.4.0/src/benchflow/sandbox/_sdk_ops.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/sandbox/protocol.py +0 -0
- /benchflow-0.3.4/src/benchflow/environments.py → /benchflow-0.4.0/src/benchflow/sandbox/services.py +0 -0
- /benchflow-0.3.4/src/benchflow/_snapshot.py → /benchflow-0.4.0/src/benchflow/sandbox/snapshot.py +0 -0
- {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/sandbox}/user.py +0 -0
- /benchflow-0.3.4/src/benchflow/_scene.py → /benchflow-0.4.0/src/benchflow/scenes.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/skills.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/templates/test.sh.tmpl +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/trajectories/__init__.py +0 -0
- /benchflow-0.3.4/src/benchflow/_trajectory.py → /benchflow-0.4.0/src/benchflow/trajectories/_capture.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/trajectories/proxy.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/__init__.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/README.md +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/task.toml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/fixtures/mock_openai_responses_server.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/claude-agent-acp.yaml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/codex-acp.yaml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/gemini.yaml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/openclaw.yaml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/opencode.yaml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/openhands.yaml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/pi-acp.yaml +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_atif_trajectory.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_bedrock_proxy.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_bedrock_runtime.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_metrics.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_mock_openai_responses_server.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_oracle.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_pi_acp_launcher.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_providers.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_registry_invariants.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_rewards.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_rewards_jsonl.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_rubric_config.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sandbox.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_self_gen_cli.py +0 -0
- {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_skills.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.3.4
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -11,7 +11,7 @@ Author-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@g
|
|
|
11
11
|
Maintainer-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
|
|
12
12
|
License: Apache-2.0
|
|
13
13
|
License-File: LICENSE
|
|
14
|
-
Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench
|
|
14
|
+
Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench
|
|
15
15
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
16
|
Classifier: Operating System :: OS Independent
|
|
17
17
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -19,7 +19,6 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.13
|
|
20
20
|
Requires-Python: >=3.12
|
|
21
21
|
Requires-Dist: anyio>=4.0
|
|
22
|
-
Requires-Dist: harbor==0.3.0
|
|
23
22
|
Requires-Dist: httpx>=0.27.0
|
|
24
23
|
Requires-Dist: pydantic>=2.0
|
|
25
24
|
Requires-Dist: pyyaml>=6.0
|
|
@@ -33,6 +32,12 @@ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
|
33
32
|
Requires-Dist: pytest>=9.0.3; extra == 'dev'
|
|
34
33
|
Requires-Dist: ruff>=0.7.0; extra == 'dev'
|
|
35
34
|
Requires-Dist: ty>=0.0.1a1; extra == 'dev'
|
|
35
|
+
Provides-Extra: sandbox-daytona
|
|
36
|
+
Requires-Dist: daytona>=0.153.0; extra == 'sandbox-daytona'
|
|
37
|
+
Requires-Dist: tenacity>=8.0; extra == 'sandbox-daytona'
|
|
38
|
+
Provides-Extra: sandbox-modal
|
|
39
|
+
Requires-Dist: modal>=0.73; extra == 'sandbox-modal'
|
|
40
|
+
Requires-Dist: tenacity>=8.0; extra == 'sandbox-modal'
|
|
36
41
|
Description-Content-Type: text/markdown
|
|
37
42
|
|
|
38
43
|
<div align="center">
|
|
@@ -116,7 +121,7 @@ SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
|
|
|
116
121
|
|
|
117
122
|
## Featured
|
|
118
123
|
|
|
119
|
-
- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb).
|
|
124
|
+
- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). See [Progressive disclosure](./docs/progressive-disclosure.md).
|
|
120
125
|
|
|
121
126
|
## Research artifacts
|
|
122
127
|
|
|
@@ -130,7 +135,7 @@ Two runnable labs validate the security story:
|
|
|
130
135
|
- **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
|
|
131
136
|
- **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
|
|
132
137
|
- **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
|
|
133
|
-
- **
|
|
138
|
+
- **External benchmark adapters** → [Task authoring](./docs/task-authoring.md) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
|
|
134
139
|
|
|
135
140
|
## Contributing
|
|
136
141
|
|
|
@@ -79,7 +79,7 @@ SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
|
|
|
79
79
|
|
|
80
80
|
## Featured
|
|
81
81
|
|
|
82
|
-
- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb).
|
|
82
|
+
- **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). See [Progressive disclosure](./docs/progressive-disclosure.md).
|
|
83
83
|
|
|
84
84
|
## Research artifacts
|
|
85
85
|
|
|
@@ -93,7 +93,7 @@ Two runnable labs validate the security story:
|
|
|
93
93
|
- **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
|
|
94
94
|
- **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
|
|
95
95
|
- **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
|
|
96
|
-
- **
|
|
96
|
+
- **External benchmark adapters** → [Task authoring](./docs/task-authoring.md) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
|
|
97
97
|
|
|
98
98
|
## Contributing
|
|
99
99
|
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "benchflow"
|
|
3
|
-
version = "0.3.4"
|
|
3
|
+
version = "0.4.0"
|
|
4
4
|
description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
7
|
-
keywords = ["benchmark", "llm-agents", "acp", "agent-evaluation", "multi-turn", "
|
|
7
|
+
keywords = ["benchmark", "llm-agents", "acp", "agent-evaluation", "multi-turn", "skillsbench"]
|
|
8
8
|
dependencies = [
|
|
9
|
-
"harbor==0.3.0",
|
|
10
9
|
"httpx>=0.27.0",
|
|
11
10
|
"anyio>=4.0",
|
|
12
11
|
"pydantic>=2.0",
|
|
@@ -42,6 +41,14 @@ dev = [
|
|
|
42
41
|
"ruff>=0.7.0",
|
|
43
42
|
"ty>=0.0.1a1",
|
|
44
43
|
]
|
|
44
|
+
sandbox-daytona = [
|
|
45
|
+
"daytona>=0.153.0",
|
|
46
|
+
"tenacity>=8.0",
|
|
47
|
+
]
|
|
48
|
+
sandbox-modal = [
|
|
49
|
+
"modal>=0.73",
|
|
50
|
+
"tenacity>=8.0",
|
|
51
|
+
]
|
|
45
52
|
bedrock = [
|
|
46
53
|
"boto3>=1.40",
|
|
47
54
|
]
|
|
@@ -114,5 +121,24 @@ ignore = [
|
|
|
114
121
|
[tool.ty.environment]
|
|
115
122
|
python-version = "3.12"
|
|
116
123
|
|
|
124
|
+
[tool.ty.rules]
|
|
125
|
+
# Many modules lazily import optional deps (daytona, modal, openai, toml, …).
|
|
126
|
+
# These are guarded by try/except at runtime but ty can't resolve them in CI.
|
|
127
|
+
unresolved-import = "ignore"
|
|
128
|
+
|
|
117
129
|
[tool.ty.src]
|
|
118
130
|
include = ["src"]
|
|
131
|
+
# Modules that heavily use optional-dep types (daytona, modal, openai, boto3, …)
|
|
132
|
+
# produce cascading type errors when those packages aren't installed.
|
|
133
|
+
exclude = [
|
|
134
|
+
"src/benchflow/sandbox/daytona.py",
|
|
135
|
+
"src/benchflow/sandbox/modal_impl.py",
|
|
136
|
+
"src/benchflow/sandbox/docker.py",
|
|
137
|
+
"src/benchflow/sandbox/_base.py",
|
|
138
|
+
"src/benchflow/_env_setup.py",
|
|
139
|
+
"src/benchflow/rewards/llm.py",
|
|
140
|
+
"src/benchflow/rewards/file_readers.py",
|
|
141
|
+
"src/benchflow/rewards/rubric_config.py",
|
|
142
|
+
"src/benchflow/providers/bedrock_runtime.py",
|
|
143
|
+
"src/benchflow/experimental/mcp/reviewer_server.py",
|
|
144
|
+
]
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
"""benchflow — ACP-native agent benchmarking framework.
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Public API surface:
|
|
4
|
+
- Sandbox protocol for isolated execution environments
|
|
4
5
|
- ACP client for multi-turn agent communication
|
|
5
6
|
- Trajectory capture (HTTP proxy, OTel collector, ACP native)
|
|
6
7
|
- Rollout lifecycle for single-task execution
|
|
7
8
|
- Evaluation orchestration with retries and concurrency
|
|
9
|
+
- Rewards protocol (composable Rubric + RewardFunc)
|
|
8
10
|
- Metrics collection and aggregation
|
|
9
11
|
"""
|
|
10
12
|
|
|
@@ -12,23 +14,9 @@ from importlib.metadata import version as _version
|
|
|
12
14
|
|
|
13
15
|
__version__ = _version("benchflow")
|
|
14
16
|
|
|
15
|
-
#
|
|
16
|
-
from harbor import (
|
|
17
|
-
BaseAgent,
|
|
18
|
-
BaseEnvironment,
|
|
19
|
-
ExecResult,
|
|
20
|
-
Task,
|
|
21
|
-
TaskConfig,
|
|
22
|
-
Verifier,
|
|
23
|
-
VerifierResult,
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
# benchflow's additions
|
|
27
|
-
from benchflow._env_setup import stage_dockerfile_deps
|
|
28
|
-
from benchflow._scene import MailboxTransport, Message, MessageTransport, SceneRole
|
|
29
|
-
from benchflow._scene import Scene as SceneRuntime
|
|
30
|
-
from benchflow._snapshot import list_snapshots, restore, snapshot
|
|
17
|
+
# Core types
|
|
31
18
|
from benchflow._types import Role, Scene, Turn
|
|
19
|
+
from benchflow._utils.yaml_loader import rollout_config_from_yaml
|
|
32
20
|
from benchflow.acp.client import ACPClient
|
|
33
21
|
from benchflow.acp.session import ACPSession
|
|
34
22
|
from benchflow.adapters import (
|
|
@@ -45,12 +33,6 @@ from benchflow.agents.registry import (
|
|
|
45
33
|
list_agents,
|
|
46
34
|
register_agent,
|
|
47
35
|
)
|
|
48
|
-
from benchflow.environments import (
|
|
49
|
-
SERVICES,
|
|
50
|
-
build_service_hooks,
|
|
51
|
-
detect_services_from_dockerfile,
|
|
52
|
-
register_service,
|
|
53
|
-
)
|
|
54
36
|
from benchflow.evaluation import (
|
|
55
37
|
Evaluation,
|
|
56
38
|
EvaluationConfig,
|
|
@@ -85,32 +67,39 @@ from benchflow.runtime import (
|
|
|
85
67
|
RuntimeResult,
|
|
86
68
|
run,
|
|
87
69
|
) # bf.run() — supports Agent, RolloutConfig, and str calling conventions
|
|
70
|
+
from benchflow.sandbox import (
|
|
71
|
+
SERVICES,
|
|
72
|
+
ImageBuilder,
|
|
73
|
+
ImageConfig,
|
|
74
|
+
ImageRef,
|
|
75
|
+
Sandbox,
|
|
76
|
+
build_service_hooks,
|
|
77
|
+
detect_services_from_dockerfile,
|
|
78
|
+
register_service,
|
|
79
|
+
)
|
|
88
80
|
|
|
89
|
-
# Sandbox protocol (v0.4
|
|
81
|
+
# Sandbox protocol (v0.4)
|
|
90
82
|
from benchflow.sandbox import ExecResult as SandboxExecResult
|
|
91
|
-
from benchflow.sandbox import
|
|
83
|
+
from benchflow.sandbox.protocol import ExecResult
|
|
84
|
+
from benchflow.sandbox.setup import stage_dockerfile_deps
|
|
85
|
+
from benchflow.sandbox.snapshot import list_snapshots, restore, snapshot
|
|
86
|
+
from benchflow.sandbox.user import BaseUser, FunctionUser, PassthroughUser, RoundResult
|
|
87
|
+
from benchflow.scenes import MailboxTransport, Message, MessageTransport, SceneRole
|
|
88
|
+
from benchflow.scenes import Scene as SceneRuntime
|
|
92
89
|
from benchflow.sdk import SDK
|
|
93
90
|
from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
|
|
91
|
+
from benchflow.task import (
|
|
92
|
+
Task,
|
|
93
|
+
TaskConfig,
|
|
94
|
+
Verifier,
|
|
95
|
+
VerifierResult,
|
|
96
|
+
)
|
|
94
97
|
from benchflow.trajectories.otel import OTelCollector
|
|
95
98
|
from benchflow.trajectories.proxy import TrajectoryProxy
|
|
96
99
|
from benchflow.trajectories.types import Trajectory
|
|
97
|
-
from benchflow.trial_yaml import trial_config_from_yaml
|
|
98
|
-
from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult
|
|
99
|
-
|
|
100
|
-
# Backward-compat aliases
|
|
101
|
-
Trial = Rollout
|
|
102
|
-
TrialConfig = RolloutConfig
|
|
103
|
-
TrialRole = Role
|
|
104
|
-
TrialScene = Scene
|
|
105
|
-
RunResult = RolloutResult
|
|
106
|
-
Job = Evaluation
|
|
107
|
-
JobConfig = EvaluationConfig
|
|
108
|
-
JobResult = EvaluationResult
|
|
109
100
|
|
|
110
101
|
# Public API surface. Anything not in this list is implementation detail and
|
|
111
|
-
# may change without notice.
|
|
112
|
-
# imports above and to make it obvious to a future agent which module owns
|
|
113
|
-
# what.
|
|
102
|
+
# may change without notice.
|
|
114
103
|
__all__ = [
|
|
115
104
|
"__version__",
|
|
116
105
|
# Rewards protocol (v0.4)
|
|
@@ -122,21 +111,17 @@ __all__ = [
|
|
|
122
111
|
"LLMJudgeRewardFunc",
|
|
123
112
|
"StringMatchRewardFunc",
|
|
124
113
|
"CodeExecRewardFunc",
|
|
125
|
-
# Rubric config (ENG-55)
|
|
126
114
|
"Criterion",
|
|
127
115
|
"JudgeConfig",
|
|
128
116
|
"RubricConfig",
|
|
129
117
|
"ScoringConfig",
|
|
130
118
|
"load_rubric_toml",
|
|
131
|
-
# Sandbox protocol
|
|
119
|
+
# Sandbox protocol
|
|
132
120
|
"Sandbox",
|
|
133
121
|
"SandboxExecResult",
|
|
134
122
|
"ImageBuilder",
|
|
135
123
|
"ImageConfig",
|
|
136
124
|
"ImageRef",
|
|
137
|
-
# Harbor re-exports
|
|
138
|
-
"BaseAgent",
|
|
139
|
-
"BaseEnvironment",
|
|
140
125
|
"ExecResult",
|
|
141
126
|
"Task",
|
|
142
127
|
"TaskConfig",
|
|
@@ -152,15 +137,11 @@ __all__ = [
|
|
|
152
137
|
"is_vertex_model",
|
|
153
138
|
"list_agents",
|
|
154
139
|
"register_agent",
|
|
155
|
-
# Evaluation orchestration
|
|
140
|
+
# Evaluation orchestration
|
|
156
141
|
"Evaluation",
|
|
157
142
|
"EvaluationConfig",
|
|
158
143
|
"EvaluationResult",
|
|
159
144
|
"RetryConfig",
|
|
160
|
-
# Backward-compat aliases for Job
|
|
161
|
-
"Job",
|
|
162
|
-
"JobConfig",
|
|
163
|
-
"JobResult",
|
|
164
145
|
# Metrics
|
|
165
146
|
"BenchmarkMetrics",
|
|
166
147
|
"collect_metrics",
|
|
@@ -168,8 +149,7 @@ __all__ = [
|
|
|
168
149
|
"AgentInstallError",
|
|
169
150
|
"AgentTimeoutError",
|
|
170
151
|
"RolloutResult",
|
|
171
|
-
|
|
172
|
-
# Runtime (0.3 compat)
|
|
152
|
+
# Runtime
|
|
173
153
|
"Agent",
|
|
174
154
|
"Environment",
|
|
175
155
|
"Runtime",
|
|
@@ -177,7 +157,7 @@ __all__ = [
|
|
|
177
157
|
"RuntimeResult",
|
|
178
158
|
# Single entry point
|
|
179
159
|
"run",
|
|
180
|
-
#
|
|
160
|
+
# Declarative types
|
|
181
161
|
"Role",
|
|
182
162
|
"Scene",
|
|
183
163
|
"Turn",
|
|
@@ -191,23 +171,18 @@ __all__ = [
|
|
|
191
171
|
"snapshot",
|
|
192
172
|
"restore",
|
|
193
173
|
"list_snapshots",
|
|
194
|
-
# Rollout
|
|
174
|
+
# Rollout
|
|
195
175
|
"Rollout",
|
|
196
176
|
"RolloutConfig",
|
|
197
|
-
|
|
198
|
-
"Trial",
|
|
199
|
-
"TrialConfig",
|
|
200
|
-
"TrialRole",
|
|
201
|
-
"TrialScene",
|
|
202
|
-
"trial_config_from_yaml",
|
|
177
|
+
"rollout_config_from_yaml",
|
|
203
178
|
# User abstraction (progressive disclosure)
|
|
204
179
|
"BaseUser",
|
|
205
180
|
"FunctionUser",
|
|
206
181
|
"PassthroughUser",
|
|
207
182
|
"RoundResult",
|
|
208
|
-
# SDK
|
|
183
|
+
# SDK
|
|
209
184
|
"SDK",
|
|
210
|
-
#
|
|
185
|
+
# Sandbox services
|
|
211
186
|
"SERVICES",
|
|
212
187
|
"build_service_hooks",
|
|
213
188
|
"detect_services_from_dockerfile",
|
|
@@ -222,7 +197,7 @@ __all__ = [
|
|
|
222
197
|
"OTelCollector",
|
|
223
198
|
"TrajectoryProxy",
|
|
224
199
|
"Trajectory",
|
|
225
|
-
# External adapters
|
|
200
|
+
# External adapters
|
|
226
201
|
"InspectAdapter",
|
|
227
202
|
"ORSAdapter",
|
|
228
203
|
"to_inspect_task",
|
|
@@ -231,8 +206,7 @@ __all__ = [
|
|
|
231
206
|
|
|
232
207
|
|
|
233
208
|
def __getattr__(name: str):
|
|
234
|
-
"""
|
|
235
|
-
# Let Python's normal submodule resolution handle subpackages first.
|
|
209
|
+
"""Lazy submodule resolution."""
|
|
236
210
|
import importlib
|
|
237
211
|
|
|
238
212
|
try:
|
|
@@ -240,16 +214,4 @@ def __getattr__(name: str):
|
|
|
240
214
|
except ModuleNotFoundError as e:
|
|
241
215
|
if e.name != f"benchflow.{name}":
|
|
242
216
|
raise
|
|
243
|
-
|
|
244
|
-
import harbor
|
|
245
|
-
|
|
246
|
-
if hasattr(harbor, name):
|
|
247
|
-
import warnings
|
|
248
|
-
|
|
249
|
-
warnings.warn(
|
|
250
|
-
f"'{name}' is not directly re-exported by benchflow. Use 'from harbor import {name}' instead.",
|
|
251
|
-
ImportWarning,
|
|
252
|
-
stacklevel=2,
|
|
253
|
-
)
|
|
254
|
-
return getattr(harbor, name)
|
|
255
217
|
raise AttributeError(f"module 'benchflow' has no attribute {name!r}")
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Small `.env` reader shared by CLI/runtime env resolution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
_DEFAULT_DOTENV_PATH = Path(".env")
|
|
9
|
+
_DOTENV_PATH_ENV = "BENCHFLOW_DOTENV_PATH"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load_dotenv_env(path: str | Path | None = None) -> dict[str, str]:
    """Read a local `.env` file into a plain dict.

    Missing files (and paths that are not regular files) are treated as empty
    input. `BENCHFLOW_DOTENV_PATH` lets tests or callers override the implicit
    `.env` lookup without changing cwd.

    Args:
        path: Explicit file to read. When ``None``, the path is taken from the
            ``BENCHFLOW_DOTENV_PATH`` environment variable, falling back to
            the module default (``.env``).

    Returns:
        Mapping of variable name to value. Blank lines, ``#`` comment lines,
        lines without ``=`` and lines with an empty key are skipped; a leading
        ``export `` is dropped; a later assignment of the same key wins.
    """
    if path is not None:
        dotenv_path = Path(path)
    else:
        dotenv_path = Path(os.environ.get(_DOTENV_PATH_ENV, _DEFAULT_DOTENV_PATH))
    # is_file() is already False for a missing path, so one check covers both.
    if not dotenv_path.is_file():
        return {}

    parsed: dict[str, str] = {}
    # Decode explicitly instead of relying on the locale default; "utf-8-sig"
    # also drops a leading BOM so it cannot end up glued to the first key.
    for raw_line in dotenv_path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("export "):
            line = line[len("export ") :].lstrip()
        if "=" not in line:
            continue

        key, _, value = line.partition("=")
        key = key.strip()
        if not key:
            continue

        parsed[key] = _clean_dotenv_value(value.strip())
    return parsed


def _clean_dotenv_value(value: str) -> str:
    """Strip surrounding quotes or a trailing inline comment from a raw value."""
    quote = value[:1]
    if quote in {"'", '"'}:
        # Search from index 1 so a lone quote character is never treated as
        # its own closing quote.
        closing = value.find(quote, 1)
        if closing != -1:
            remainder = value[closing + 1 :].lstrip()
            # `"text"` or `"text" # comment`: keep only the quoted part, so a
            # `#` inside the quotes survives and the comment is dropped.
            if not remainder or remainder.startswith("#"):
                return value[1:closing]
            # Inner quote characters (`"a"b"`): strip just the outer pair.
            if value.endswith(quote):
                return value[1:-1]
    # Unquoted (or unterminated) value: " #" starts an inline comment.
    if " #" in value:
        return value.split(" #", 1)[0].rstrip()
    return value
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""benchflow._utils — small periphery I/O glue, private.
|
|
2
|
+
|
|
3
|
+
Holds small (<200 LOC) periphery modules that translate between external
|
|
4
|
+
artifacts (YAML files, git repos, scaffolded task dirs) and benchflow
|
|
5
|
+
shapes.
|
|
6
|
+
|
|
7
|
+
Members:
|
|
8
|
+
yaml_loader — YAML → RolloutConfig/EvaluationConfig
|
|
9
|
+
benchmark_repos — clone benchmark repos
|
|
10
|
+
task_authoring — init_task / check_task scaffolding
|
|
11
|
+
"""
|
|
@@ -138,7 +138,11 @@ def resolve_source(repo: str, path: str | None = None, ref: str | None = None) -
|
|
|
138
138
|
# Format: (org/repo, ref, subpath)
|
|
139
139
|
TASK_ALIASES: dict[str, tuple[str, str | None, str | None]] = {
|
|
140
140
|
"skillsbench": ("benchflow-ai/skillsbench", "main", "tasks"),
|
|
141
|
-
"programbench": (
|
|
141
|
+
"programbench": (
|
|
142
|
+
"facebookresearch/programbench",
|
|
143
|
+
"main",
|
|
144
|
+
"src/programbench/data/tasks",
|
|
145
|
+
),
|
|
142
146
|
"harvey-lab": ("benchflow-ai/benchmarks", "main", "datasets/harvey-lab/tasks"),
|
|
143
147
|
}
|
|
144
148
|
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Shared configuration normalization helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from benchflow.agents.registry import parse_agent_spec
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def normalize_agent_name(agent: str) -> str:
    """Resolve an agent spec to its canonical name when it is an ACP agent.

    The spec is handed to ``parse_agent_spec``; if the reported protocol is
    ``"acp"`` the canonical name it returned is used, otherwise the input
    string is given back unchanged.
    """
    protocol, canonical = parse_agent_spec(agent)
    return canonical if protocol == "acp" else agent
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def normalize_sandbox_user(sandbox_user: str | None) -> str | None:
    """Collapse textual root-user sentinels into ``None``.

    ``None`` itself and the strings ``"none"`` / ``"null"`` (compared
    case-insensitively) all yield ``None``; any other string is returned
    exactly as it was passed in.
    """
    if sandbox_user is not None and sandbox_user.lower() not in {"none", "null"}:
        return sandbox_user
    return None
|
benchflow-0.3.4/src/benchflow/trial_yaml.py → benchflow-0.4.0/src/benchflow/_utils/yaml_loader.py
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
"""YAML
|
|
1
|
+
"""YAML rollout config loader.
|
|
2
2
|
|
|
3
|
-
Parses
|
|
3
|
+
Parses rollout YAML files into RolloutConfig with Scene support.
|
|
4
4
|
Handles both new scene-based format and legacy flat format.
|
|
5
5
|
|
|
6
6
|
New format::
|
|
@@ -44,13 +44,13 @@ from typing import Any
|
|
|
44
44
|
import yaml
|
|
45
45
|
|
|
46
46
|
from benchflow._types import Role, Scene, Turn
|
|
47
|
-
from benchflow.
|
|
47
|
+
from benchflow.rollout import RolloutConfig
|
|
48
48
|
|
|
49
49
|
logger = logging.getLogger(__name__)
|
|
50
50
|
|
|
51
51
|
|
|
52
|
-
def
|
|
53
|
-
"""Load and normalize a
|
|
52
|
+
def load_rollout_yaml(path: str | Path) -> dict:
|
|
53
|
+
"""Load and normalize a rollout YAML file."""
|
|
54
54
|
with open(path) as f:
|
|
55
55
|
raw = yaml.safe_load(f)
|
|
56
56
|
if not isinstance(raw, dict):
|
|
@@ -58,23 +58,23 @@ def load_trial_yaml(path: str | Path) -> dict:
|
|
|
58
58
|
return raw
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def
|
|
61
|
+
def rollout_config_from_yaml(
|
|
62
62
|
path: str | Path,
|
|
63
63
|
task_path: Path | None = None,
|
|
64
|
-
) ->
|
|
65
|
-
"""Parse a YAML file into a
|
|
64
|
+
) -> RolloutConfig:
|
|
65
|
+
"""Parse a YAML file into a RolloutConfig.
|
|
66
66
|
|
|
67
67
|
If task_path is provided, it overrides task_dir from the YAML.
|
|
68
68
|
"""
|
|
69
|
-
raw =
|
|
70
|
-
return
|
|
69
|
+
raw = load_rollout_yaml(path)
|
|
70
|
+
return rollout_config_from_dict(raw, task_path=task_path)
|
|
71
71
|
|
|
72
72
|
|
|
73
|
-
def
|
|
73
|
+
def rollout_config_from_dict(
|
|
74
74
|
raw: dict[str, Any],
|
|
75
75
|
task_path: Path | None = None,
|
|
76
|
-
) ->
|
|
77
|
-
"""Convert a raw dict (from YAML or programmatic) into a
|
|
76
|
+
) -> RolloutConfig:
|
|
77
|
+
"""Convert a raw dict (from YAML or programmatic) into a RolloutConfig."""
|
|
78
78
|
tp = task_path or Path(raw.get("task_dir", raw.get("task_path", ".")))
|
|
79
79
|
|
|
80
80
|
# Scene-based format
|
|
@@ -105,7 +105,7 @@ def trial_config_from_dict(
|
|
|
105
105
|
else:
|
|
106
106
|
raise ValueError("YAML must have either 'scenes' or 'agent' at top level")
|
|
107
107
|
|
|
108
|
-
return
|
|
108
|
+
return RolloutConfig(
|
|
109
109
|
task_path=tp,
|
|
110
110
|
scenes=scenes,
|
|
111
111
|
environment=raw.get("environment", "docker"),
|
|
@@ -113,7 +113,7 @@ def trial_config_from_dict(
|
|
|
113
113
|
sandbox_locked_paths=raw.get("sandbox_locked_paths"),
|
|
114
114
|
sandbox_setup_timeout=raw.get("sandbox_setup_timeout", 120),
|
|
115
115
|
job_name=raw.get("job_name"),
|
|
116
|
-
|
|
116
|
+
rollout_name=raw.get("rollout_name"),
|
|
117
117
|
jobs_dir=raw.get("jobs_dir", "jobs"),
|
|
118
118
|
context_root=raw.get("context_root"),
|
|
119
119
|
agent=raw.get("agent", "claude-agent-acp"),
|
|
@@ -165,9 +165,9 @@ def job_config_from_yaml(path: str | Path) -> dict:
|
|
|
165
165
|
"""Parse a YAML file and return both job-level and trial-level config.
|
|
166
166
|
|
|
167
167
|
Returns a dict with keys: task_dir, concurrency, max_retries,
|
|
168
|
-
trial_config (
|
|
168
|
+
trial_config (RolloutConfig), and any other job-level fields.
|
|
169
169
|
"""
|
|
170
|
-
raw =
|
|
170
|
+
raw = load_rollout_yaml(path)
|
|
171
171
|
task_dir = Path(raw.get("task_dir", raw.get("tasks_dir", ".")))
|
|
172
172
|
concurrency = raw.get("concurrency", 4)
|
|
173
173
|
max_retries = raw.get("max_retries", 2)
|
|
@@ -176,6 +176,6 @@ def job_config_from_yaml(path: str | Path) -> dict:
|
|
|
176
176
|
"task_dir": task_dir,
|
|
177
177
|
"concurrency": concurrency,
|
|
178
178
|
"max_retries": max_retries,
|
|
179
|
-
"trial_config":
|
|
179
|
+
"trial_config": rollout_config_from_dict(raw, task_path=task_dir),
|
|
180
180
|
"raw": raw,
|
|
181
181
|
}
|
|
@@ -7,7 +7,7 @@ Owns the live agent-side of a run:
|
|
|
7
7
|
ACP-native trajectory, and report tool-call counts
|
|
8
8
|
|
|
9
9
|
The one allowed horizontal phase import in this refactor lives here:
|
|
10
|
-
``from benchflow.
|
|
10
|
+
``from benchflow.sandbox.lockdown import build_priv_drop_cmd``. connect_acp wraps
|
|
11
11
|
the agent launch command in the sandbox user's privilege-drop prefix
|
|
12
12
|
before handing it to the transport. It is a single pure-function call
|
|
13
13
|
with no shared state — not a coupling of concerns.
|
|
@@ -22,13 +22,13 @@ import contextlib
|
|
|
22
22
|
import logging
|
|
23
23
|
from pathlib import Path
|
|
24
24
|
|
|
25
|
-
from benchflow._sandbox import build_priv_drop_cmd
|
|
26
|
-
from benchflow._trajectory import _capture_session_trajectory
|
|
27
25
|
from benchflow.acp.client import ACPClient
|
|
28
26
|
from benchflow.acp.container_transport import ContainerTransport
|
|
29
27
|
from benchflow.agents.providers import find_provider, strip_provider_prefix
|
|
30
28
|
from benchflow.agents.registry import AGENTS
|
|
31
|
-
from benchflow.
|
|
29
|
+
from benchflow.sandbox.lockdown import build_priv_drop_cmd
|
|
30
|
+
from benchflow.sandbox.process import DaytonaProcess, DaytonaPtyProcess, DockerProcess
|
|
31
|
+
from benchflow.trajectories._capture import _capture_session_trajectory
|
|
32
32
|
|
|
33
33
|
logger = logging.getLogger(__name__)
|
|
34
34
|
|
|
@@ -144,7 +144,7 @@ async def connect_acp(
|
|
|
144
144
|
agent_env: dict,
|
|
145
145
|
sandbox_user: str | None,
|
|
146
146
|
model: str | None,
|
|
147
|
-
|
|
147
|
+
rollout_dir: Path,
|
|
148
148
|
environment: str,
|
|
149
149
|
agent_cwd: str,
|
|
150
150
|
) -> tuple[ACPClient, object, str]:
|
|
@@ -181,18 +181,18 @@ async def connect_acp(
|
|
|
181
181
|
|
|
182
182
|
try:
|
|
183
183
|
if environment == "docker":
|
|
184
|
-
live_proc = DockerProcess.
|
|
184
|
+
live_proc = DockerProcess.from_sandbox_env(env)
|
|
185
185
|
else:
|
|
186
186
|
is_dind = hasattr(env, "_strategy") and hasattr(
|
|
187
187
|
env._strategy, "_compose_cmd"
|
|
188
188
|
)
|
|
189
189
|
if is_dind:
|
|
190
|
-
live_proc = await DaytonaPtyProcess.
|
|
190
|
+
live_proc = await DaytonaPtyProcess.from_sandbox_env(env)
|
|
191
191
|
logger.info("Using PTY transport for DinD compose task")
|
|
192
192
|
else:
|
|
193
|
-
live_proc = await DaytonaProcess.
|
|
193
|
+
live_proc = await DaytonaProcess.from_sandbox_env(env)
|
|
194
194
|
|
|
195
|
-
agent_log =
|
|
195
|
+
agent_log = rollout_dir / "agent" / f"{agent.replace('-', '_')}.txt"
|
|
196
196
|
transport = ContainerTransport(
|
|
197
197
|
container_process=live_proc,
|
|
198
198
|
command=agent_launch,
|