benchflow 0.4.0__tar.gz → 0.5.1.dev869__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/.gitignore +1 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/CHANGELOG.md +10 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/PKG-INFO +26 -16
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/README.md +15 -13
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/pyproject.toml +27 -6
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/__init__.py +55 -17
- benchflow-0.5.1.dev869/src/benchflow/_paths.py +218 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_run.py +2 -2
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_types.py +13 -5
- benchflow-0.5.1.dev869/src/benchflow/_utils/benchmark_repos.py +516 -0
- benchflow-0.5.1.dev869/src/benchflow/_utils/config.py +66 -0
- benchflow-0.5.1.dev869/src/benchflow/_utils/evaluation_results.py +228 -0
- benchflow-0.5.1.dev869/src/benchflow/_utils/json_safe.py +44 -0
- benchflow-0.5.1.dev869/src/benchflow/_utils/learner_memory.py +162 -0
- benchflow-0.5.1.dev869/src/benchflow/_utils/result_metadata.py +75 -0
- benchflow-0.5.1.dev869/src/benchflow/_utils/reward_events.py +95 -0
- benchflow-0.5.1.dev869/src/benchflow/_utils/scoring.py +264 -0
- benchflow-0.5.1.dev869/src/benchflow/_utils/source_provenance.py +129 -0
- benchflow-0.5.1.dev869/src/benchflow/_utils/task_authoring.py +236 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/yaml_loader.py +11 -2
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/client.py +164 -26
- benchflow-0.5.1.dev869/src/benchflow/acp/runtime.py +646 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/session.py +171 -3
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/transport.py +13 -1
- benchflow-0.5.1.dev869/src/benchflow/acp/types.py +328 -0
- benchflow-0.5.1.dev869/src/benchflow/adapters/__init__.py +51 -0
- benchflow-0.5.1.dev869/src/benchflow/adapters/harbor.py +137 -0
- benchflow-0.5.1.dev869/src/benchflow/adapters/inbound.py +219 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/ors.py +11 -0
- benchflow-0.5.1.dev869/src/benchflow/adapters/terminal_bench.py +353 -0
- benchflow-0.5.1.dev869/src/benchflow/agents/codex_config.py +68 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/credentials.py +10 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/env.py +342 -19
- benchflow-0.5.1.dev869/src/benchflow/agents/errors.py +8 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/install.py +56 -15
- benchflow-0.5.1.dev869/src/benchflow/agents/protocol.py +242 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/providers.py +145 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/registry.py +194 -20
- benchflow-0.5.1.dev869/src/benchflow/branch.py +74 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/main.py +623 -359
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/trace_import.py +3 -5
- benchflow-0.5.1.dev869/src/benchflow/compat/__init__.py +19 -0
- benchflow-0.5.1.dev869/src/benchflow/compat/harbor_registry.py +289 -0
- benchflow-0.5.1.dev869/src/benchflow/contracts/__init__.py +68 -0
- benchflow-0.5.1.dev869/src/benchflow/contracts/planes.py +96 -0
- benchflow-0.5.1.dev869/src/benchflow/contracts/user.py +74 -0
- benchflow-0.5.1.dev869/src/benchflow/diagnostics.py +393 -0
- benchflow-0.5.1.dev869/src/benchflow/environment/__init__.py +42 -0
- benchflow-0.5.1.dev869/src/benchflow/environment/manifest.py +230 -0
- benchflow-0.5.1.dev869/src/benchflow/environment/manifest_env.py +308 -0
- benchflow-0.5.1.dev869/src/benchflow/environment/protocol.py +77 -0
- benchflow-0.5.1.dev869/src/benchflow/environment/readiness.py +74 -0
- benchflow-0.5.1.dev869/src/benchflow/eval_sharding.py +349 -0
- benchflow-0.5.1.dev869/src/benchflow/eval_worker.py +116 -0
- benchflow-0.5.1.dev869/src/benchflow/evaluation.py +1458 -0
- benchflow-0.5.1.dev869/src/benchflow/hosted_env.py +849 -0
- benchflow-0.5.1.dev869/src/benchflow/learner_skills.py +142 -0
- benchflow-0.5.1.dev869/src/benchflow/learner_store.py +277 -0
- benchflow-0.5.1.dev869/src/benchflow/metrics.py +419 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/models.py +84 -9
- benchflow-0.5.1.dev869/src/benchflow/monitor.py +209 -0
- benchflow-0.5.1.dev869/src/benchflow/providers/__init__.py +18 -0
- benchflow-0.5.1.dev869/src/benchflow/providers/litellm_bedrock_patch.py +93 -0
- benchflow-0.5.1.dev869/src/benchflow/providers/litellm_config.py +335 -0
- benchflow-0.5.1.dev869/src/benchflow/providers/litellm_logging.py +385 -0
- benchflow-0.5.1.dev869/src/benchflow/providers/litellm_runtime.py +1109 -0
- benchflow-0.5.1.dev869/src/benchflow/providers/runtime.py +70 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/__init__.py +16 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/builtins.py +79 -9
- benchflow-0.5.1.dev869/src/benchflow/rewards/events.py +44 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/file_readers.py +7 -3
- benchflow-0.5.1.dev869/src/benchflow/rewards/llm.py +297 -0
- benchflow-0.5.1.dev869/src/benchflow/rewards/memory_scorer.py +146 -0
- benchflow-0.5.1.dev869/src/benchflow/rewards/node.py +164 -0
- benchflow-0.5.1.dev869/src/benchflow/rewards/protocol.py +89 -0
- benchflow-0.5.1.dev869/src/benchflow/rewards/rubric_config.py +220 -0
- benchflow-0.5.1.dev869/src/benchflow/rewards/validation.py +67 -0
- benchflow-0.5.1.dev869/src/benchflow/rollout.py +2920 -0
- benchflow-0.5.1.dev869/src/benchflow/rollout_branch.py +222 -0
- benchflow-0.5.1.dev869/src/benchflow/rollout_planes.py +195 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/runtime.py +97 -13
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/__init__.py +4 -0
- benchflow-0.5.1.dev869/src/benchflow/sandbox/_base.py +362 -0
- benchflow-0.5.1.dev869/src/benchflow/sandbox/_compose.py +28 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +6 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/daytona.py +596 -67
- benchflow-0.5.1.dev869/src/benchflow/sandbox/docker.py +853 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/lockdown.py +145 -15
- benchflow-0.5.1.dev869/src/benchflow/sandbox/metadata.py +34 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/modal_impl.py +57 -12
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/process.py +393 -116
- benchflow-0.5.1.dev869/src/benchflow/sandbox/protocol.py +181 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/setup.py +234 -42
- benchflow-0.5.1.dev869/src/benchflow/sandbox/snapshot.py +107 -0
- benchflow-0.5.1.dev869/src/benchflow/sandbox/user.py +10 -0
- benchflow-0.5.1.dev869/src/benchflow/scenes.py +99 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sdk.py +24 -3
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/self_gen.py +34 -15
- benchflow-0.5.1.dev869/src/benchflow/skill_eval/__init__.py +45 -0
- benchflow-0.4.0/src/benchflow/skill_eval.py → benchflow-0.5.1.dev869/src/benchflow/skill_eval/_core.py +145 -164
- benchflow-0.5.1.dev869/src/benchflow/skill_eval/gepa_export.py +195 -0
- benchflow-0.5.1.dev869/src/benchflow/skill_eval/schema.py +153 -0
- benchflow-0.5.1.dev869/src/benchflow/skill_policy.py +191 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/__init__.py +4 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/config.py +114 -5
- benchflow-0.5.1.dev869/src/benchflow/task/verifier.py +457 -0
- benchflow-0.5.1.dev869/src/benchflow/templates/test.sh.tmpl +20 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/huggingface.py +89 -20
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/parsers.py +8 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/task_gen.py +353 -67
- benchflow-0.5.1.dev869/src/benchflow/trajectories/__init__.py +36 -0
- benchflow-0.5.1.dev869/src/benchflow/trajectories/_capture.py +252 -0
- benchflow-0.5.1.dev869/src/benchflow/trajectories/export.py +304 -0
- benchflow-0.5.1.dev869/src/benchflow/trajectories/metrics.py +161 -0
- benchflow-0.5.1.dev869/src/benchflow/trajectories/tree.py +144 -0
- benchflow-0.5.1.dev869/src/benchflow/trajectories/types.py +388 -0
- benchflow-0.5.1.dev869/src/benchflow/usage_tracking.py +163 -0
- benchflow-0.5.1.dev869/tests/agents/__init__.py +1 -0
- benchflow-0.5.1.dev869/tests/agents/test_protocol.py +191 -0
- benchflow-0.5.1.dev869/tests/conformance/acp_smoke/environment/docker-compose.yaml +3 -0
- benchflow-0.5.1.dev869/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +16 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/task.toml +2 -1
- benchflow-0.5.1.dev869/tests/conformance/proof_multi_agent.py +87 -0
- benchflow-0.5.1.dev869/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +22 -0
- benchflow-0.5.1.dev869/tests/environment/__init__.py +0 -0
- benchflow-0.5.1.dev869/tests/environment/test_chibench_manifest.py +65 -0
- benchflow-0.5.1.dev869/tests/environment/test_clawsbench_manifest.py +41 -0
- benchflow-0.5.1.dev869/tests/environment/test_manifest.py +171 -0
- benchflow-0.5.1.dev869/tests/environment/test_manifest_env.py +410 -0
- benchflow-0.5.1.dev869/tests/environment/test_protocol.py +57 -0
- benchflow-0.5.1.dev869/tests/environment/test_readiness.py +76 -0
- benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +5 -0
- benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/instruction.md +13 -0
- benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/solution/solve.sh +11 -0
- benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/task.toml +18 -0
- benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/tests/test.sh +8 -0
- benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/tests/test_state.py +27 -0
- benchflow-0.5.1.dev869/tests/examples/traces/minimal-claude.jsonl +3 -0
- benchflow-0.5.1.dev869/tests/examples/traces/minimal-opentraces.jsonl +1 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent.py +21 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent_interleaved.py +2 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent_multi_turn.py +2 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/check_adapter_evidence.py +73 -15
- benchflow-0.5.1.dev869/tests/integration/check_hosted_env_evidence.py +209 -0
- benchflow-0.5.1.dev869/tests/integration/check_results.py +1079 -0
- benchflow-0.5.1.dev869/tests/integration/check_skillsbench_harbor_parity.py +509 -0
- benchflow-0.5.1.dev869/tests/integration/check_trace_to_task_evidence.py +353 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/claude-agent-acp.yaml +1 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/codex-acp.yaml +1 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/gemini.yaml +1 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/harvey-lab-harness.yaml +1 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/openclaw.yaml +1 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/opencode.yaml +1 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/openhands.yaml +1 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/pi-acp.yaml +1 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/run.sh +69 -44
- benchflow-0.5.1.dev869/tests/integration/run_suite.py +879 -0
- benchflow-0.5.1.dev869/tests/integration/suites/release.yaml +526 -0
- benchflow-0.5.1.dev869/tests/test_acp.py +1659 -0
- benchflow-0.5.1.dev869/tests/test_acp_capability_advertising.py +184 -0
- benchflow-0.5.1.dev869/tests/test_acp_model_config_dispatch.py +134 -0
- benchflow-0.5.1.dev869/tests/test_acp_pinned_protocol_guard.py +93 -0
- benchflow-0.5.1.dev869/tests/test_acp_setup_failure_propagation.py +208 -0
- benchflow-0.5.1.dev869/tests/test_adapter_scripts.py +227 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_adapters.py +71 -0
- benchflow-0.5.1.dev869/tests/test_agent_cli.py +29 -0
- benchflow-0.5.1.dev869/tests/test_agent_env_resolution.py +132 -0
- benchflow-0.5.1.dev869/tests/test_agent_gemini_defaults.py +82 -0
- benchflow-0.5.1.dev869/tests/test_agent_idle_timeout_cli.py +188 -0
- benchflow-0.5.1.dev869/tests/test_agent_registry.py +246 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_agent_setup.py +277 -22
- benchflow-0.5.1.dev869/tests/test_agent_spec.py +213 -0
- benchflow-0.5.1.dev869/tests/test_base_install_imports.py +106 -0
- benchflow-0.5.1.dev869/tests/test_bedrock_thinking.py +62 -0
- benchflow-0.5.1.dev869/tests/test_branch.py +89 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_capture_trajectory.py +5 -1
- benchflow-0.5.1.dev869/tests/test_clawsbench_slice.py +52 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_cli_daytona.py +11 -3
- benchflow-0.5.1.dev869/tests/test_cli_docs_drift.py +115 -0
- benchflow-0.5.1.dev869/tests/test_cli_misc.py +176 -0
- benchflow-0.5.1.dev869/tests/test_compat_harbor_registry.py +328 -0
- benchflow-0.5.1.dev869/tests/test_config_redaction.py +160 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_connect_as_env.py +41 -58
- benchflow-0.5.1.dev869/tests/test_continuallearningbench_adapter.py +90 -0
- benchflow-0.5.1.dev869/tests/test_dashboard_credential_env_scrub.py +115 -0
- benchflow-0.5.1.dev869/tests/test_dashboard_daytona_key.py +129 -0
- benchflow-0.5.1.dev869/tests/test_dashboard_no_host_paths.py +169 -0
- benchflow-0.5.1.dev869/tests/test_dashboard_release_evidence.py +297 -0
- benchflow-0.5.1.dev869/tests/test_dashboard_roadmap.py +818 -0
- benchflow-0.5.1.dev869/tests/test_dashboard_symlink_ingestion.py +170 -0
- benchflow-0.5.1.dev869/tests/test_dashboard_sync.py +1699 -0
- benchflow-0.5.1.dev869/tests/test_daytona_command_polling.py +51 -0
- benchflow-0.5.1.dev869/tests/test_daytona_litellm_runtime.py +121 -0
- benchflow-0.5.1.dev869/tests/test_daytona_status.py +90 -0
- benchflow-0.5.1.dev869/tests/test_docker_prune_scoping.py +153 -0
- benchflow-0.5.1.dev869/tests/test_docker_uploads.py +41 -0
- benchflow-0.5.1.dev869/tests/test_docs_examples.py +121 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_eng50_capabilities.py +29 -111
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_env_setup.py +44 -0
- benchflow-0.5.1.dev869/tests/test_environment_manifest_controls.py +299 -0
- benchflow-0.5.1.dev869/tests/test_eval_filters_applied.py +158 -0
- benchflow-0.5.1.dev869/tests/test_eval_sharding.py +38 -0
- benchflow-0.5.1.dev869/tests/test_eval_single_task_summary.py +159 -0
- benchflow-0.5.1.dev869/tests/test_eval_source_provenance.py +681 -0
- benchflow-0.5.1.dev869/tests/test_eval_worker_retry.py +57 -0
- benchflow-0.5.1.dev869/tests/test_eval_zero_task_guard.py +116 -0
- benchflow-0.5.1.dev869/tests/test_evaluation_environment_manifest.py +213 -0
- benchflow-0.5.1.dev869/tests/test_experiments_status.py +181 -0
- benchflow-0.5.1.dev869/tests/test_hilbench_adapter.py +104 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_hosted_env.py +14 -1
- benchflow-0.5.1.dev869/tests/test_hosted_env_rollout_contract.py +207 -0
- benchflow-0.5.1.dev869/tests/test_inbound_adapter_manifest.py +265 -0
- benchflow-0.5.1.dev869/tests/test_inbound_adapters.py +440 -0
- benchflow-0.5.1.dev869/tests/test_integration_check_results.py +2345 -0
- benchflow-0.5.1.dev869/tests/test_integration_run_suite.py +894 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_internet_policy.py +68 -30
- benchflow-0.5.1.dev869/tests/test_job.py +781 -0
- benchflow-0.5.1.dev869/tests/test_job_sequential_shared.py +595 -0
- benchflow-0.5.1.dev869/tests/test_job_sequential_shared_resume.py +314 -0
- benchflow-0.5.1.dev869/tests/test_judge_symlink_ingestion.py +64 -0
- benchflow-0.5.1.dev869/tests/test_learner_skills.py +137 -0
- benchflow-0.5.1.dev869/tests/test_learner_skills_traversal.py +95 -0
- benchflow-0.5.1.dev869/tests/test_learner_store.py +257 -0
- benchflow-0.5.1.dev869/tests/test_learner_store_persistence.py +138 -0
- benchflow-0.5.1.dev869/tests/test_litellm_config.py +99 -0
- benchflow-0.5.1.dev869/tests/test_litellm_hardening.py +659 -0
- benchflow-0.5.1.dev869/tests/test_litellm_logging.py +121 -0
- benchflow-0.5.1.dev869/tests/test_litellm_runtime.py +429 -0
- benchflow-0.5.1.dev869/tests/test_litellm_smoke.py +160 -0
- benchflow-0.5.1.dev869/tests/test_llm_judge.py +852 -0
- benchflow-0.5.1.dev869/tests/test_llm_judge_event_tags.py +217 -0
- benchflow-0.5.1.dev869/tests/test_llm_judge_verifier.py +734 -0
- benchflow-0.5.1.dev869/tests/test_memory_scorer.py +394 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_metrics.py +155 -0
- benchflow-0.5.1.dev869/tests/test_monitor_scaffold.py +133 -0
- benchflow-0.5.1.dev869/tests/test_native_acp_usage.py +243 -0
- benchflow-0.5.1.dev869/tests/test_no_cross_provider_fallback.py +84 -0
- benchflow-0.5.1.dev869/tests/test_oracle_chokepoint.py +1011 -0
- benchflow-0.5.1.dev869/tests/test_paths_safe.py +106 -0
- benchflow-0.5.1.dev869/tests/test_paths_symlink_helpers.py +95 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_pi_acp_launcher.py +52 -0
- benchflow-0.5.1.dev869/tests/test_process.py +760 -0
- benchflow-0.5.1.dev869/tests/test_provider_auth_detection.py +308 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_providers.py +136 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_reexport.py +6 -2
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_registry_invariants.py +117 -6
- benchflow-0.5.1.dev869/tests/test_release_version.py +132 -0
- benchflow-0.5.1.dev869/tests/test_resolve_env_helpers.py +1089 -0
- benchflow-0.5.1.dev869/tests/test_reward_node.py +145 -0
- benchflow-0.5.1.dev869/tests/test_reward_unified_contract.py +175 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_rewards.py +40 -41
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_rewards_jsonl.py +66 -0
- benchflow-0.5.1.dev869/tests/test_rollout_architecture.py +54 -0
- benchflow-0.5.1.dev869/tests/test_rollout_branch.py +451 -0
- benchflow-0.5.1.dev869/tests/test_rollout_config_path_coercion.py +96 -0
- benchflow-0.5.1.dev869/tests/test_rollout_environment.py +24 -0
- benchflow-0.5.1.dev869/tests/test_rollout_import_no_side_effects.py +137 -0
- benchflow-0.5.1.dev869/tests/test_rollout_on_ask_user_wiring.py +299 -0
- benchflow-0.5.1.dev869/tests/test_rollout_probe_sandbox_health.py +101 -0
- benchflow-0.5.1.dev869/tests/test_rollout_upload.py +633 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_rubric_config.py +106 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_runtime.py +15 -0
- benchflow-0.5.1.dev869/tests/test_runtime_config_wired.py +199 -0
- benchflow-0.5.1.dev869/tests/test_runtime_live_sandbox.py +257 -0
- benchflow-0.5.1.dev869/tests/test_sandbox.py +320 -0
- benchflow-0.5.1.dev869/tests/test_sandbox_exec_secret_handling.py +231 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_hardening.py +156 -14
- benchflow-0.5.1.dev869/tests/test_sandbox_isolation_copy_traversal.py +102 -0
- benchflow-0.5.1.dev869/tests/test_sandbox_multi_service.py +792 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_setup.py +89 -0
- benchflow-0.5.1.dev869/tests/test_sandbox_snapshot_contract.py +259 -0
- benchflow-0.5.1.dev869/tests/test_sandbox_upload_symlink.py +227 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_verifier_workspace.py +7 -7
- benchflow-0.5.1.dev869/tests/test_scene.py +102 -0
- benchflow-0.5.1.dev869/tests/test_scene_outbox_trial.py +397 -0
- benchflow-0.5.1.dev869/tests/test_scene_parallel_group.py +40 -0
- benchflow-0.5.1.dev869/tests/test_scene_result_aggregation.py +189 -0
- benchflow-0.5.1.dev869/tests/test_scoring.py +231 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sdk_internals.py +245 -6
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sdk_lockdown.py +17 -12
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_cli.py +17 -24
- benchflow-0.5.1.dev869/tests/test_self_gen_export_error_channel.py +202 -0
- benchflow-0.5.1.dev869/tests/test_self_gen_export_failures.py +175 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_orchestration.py +85 -7
- benchflow-0.5.1.dev869/tests/test_session_request_permission_dispatch.py +289 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval.py +74 -2
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_dryrun.py +7 -10
- benchflow-0.5.1.dev869/tests/test_skill_eval_sweep.py +472 -0
- benchflow-0.5.1.dev869/tests/test_skill_eval_traversal.py +141 -0
- benchflow-0.5.1.dev869/tests/test_skill_invocation_artifacts.py +188 -0
- benchflow-0.5.1.dev869/tests/test_skill_policy.py +209 -0
- benchflow-0.5.1.dev869/tests/test_skills_dir_agent_home_link.py +146 -0
- benchflow-0.5.1.dev869/tests/test_skillsbench_harbor_parity.py +180 -0
- benchflow-0.5.1.dev869/tests/test_skillsbench_harbor_run_suite.py +87 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_smoke.py +1 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_subscription_auth.py +162 -0
- benchflow-0.5.1.dev869/tests/test_task_check_eval_consistency.py +126 -0
- benchflow-0.5.1.dev869/tests/test_task_config.py +49 -0
- benchflow-0.5.1.dev869/tests/test_task_download.py +660 -0
- benchflow-0.5.1.dev869/tests/test_tasks.py +298 -0
- benchflow-0.5.1.dev869/tests/test_token_usage_normalization.py +158 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_trace_import_cli.py +49 -0
- benchflow-0.5.1.dev869/tests/test_trace_task_gen_traversal.py +123 -0
- benchflow-0.5.1.dev869/tests/test_trace_to_task_evidence.py +164 -0
- benchflow-0.5.1.dev869/tests/test_traces_huggingface.py +130 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_traces_parsers.py +5 -1
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_traces_task_gen.py +226 -7
- benchflow-0.5.1.dev869/tests/test_train_mode_artifact_emission.py +328 -0
- benchflow-0.5.1.dev869/tests/test_trajectory_streaming.py +450 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_trial_agent_timeout_verify.py +25 -10
- benchflow-0.5.1.dev869/tests/test_trial_install_agent_timeout.py +182 -0
- benchflow-0.5.1.dev869/tests/test_trial_litellm_runtime.py +115 -0
- benchflow-0.5.1.dev869/tests/test_usage_litellm.py +315 -0
- benchflow-0.5.1.dev869/tests/test_usage_required.py +70 -0
- benchflow-0.5.1.dev869/tests/test_usage_tracking.py +231 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_user.py +56 -3
- benchflow-0.5.1.dev869/tests/test_verifier_multi_container.py +313 -0
- benchflow-0.5.1.dev869/tests/test_verifier_output.py +214 -0
- benchflow-0.5.1.dev869/tests/test_verifier_output_freshness.py +114 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_verify.py +374 -45
- benchflow-0.5.1.dev869/tests/test_workflow_action_pinning.py +121 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_yaml_config.py +289 -2
- benchflow-0.5.1.dev869/tests/trajectories/__init__.py +0 -0
- benchflow-0.5.1.dev869/tests/trajectories/test_export.py +157 -0
- benchflow-0.5.1.dev869/tests/trajectories/test_export_nan_handling.py +101 -0
- benchflow-0.5.1.dev869/tests/trajectories/test_redaction.py +357 -0
- benchflow-0.5.1.dev869/tests/trajectories/test_step_granularity.py +201 -0
- benchflow-0.5.1.dev869/tests/trajectories/test_tree.py +159 -0
- benchflow-0.4.0/src/benchflow/_utils/benchmark_repos.py +0 -165
- benchflow-0.4.0/src/benchflow/_utils/config.py +0 -22
- benchflow-0.4.0/src/benchflow/_utils/scoring.py +0 -56
- benchflow-0.4.0/src/benchflow/_utils/task_authoring.py +0 -154
- benchflow-0.4.0/src/benchflow/acp/runtime.py +0 -358
- benchflow-0.4.0/src/benchflow/acp/types.py +0 -321
- benchflow-0.4.0/src/benchflow/adapters/__init__.py +0 -25
- benchflow-0.4.0/src/benchflow/evaluation.py +0 -680
- benchflow-0.4.0/src/benchflow/hosted_env.py +0 -408
- benchflow-0.4.0/src/benchflow/metrics.py +0 -240
- benchflow-0.4.0/src/benchflow/providers/__init__.py +0 -25
- benchflow-0.4.0/src/benchflow/providers/bedrock_proxy.py +0 -534
- benchflow-0.4.0/src/benchflow/providers/bedrock_runtime.py +0 -665
- benchflow-0.4.0/src/benchflow/providers/runtime.py +0 -172
- benchflow-0.4.0/src/benchflow/rewards/events.py +0 -26
- benchflow-0.4.0/src/benchflow/rewards/llm.py +0 -182
- benchflow-0.4.0/src/benchflow/rewards/protocol.py +0 -33
- benchflow-0.4.0/src/benchflow/rewards/rubric_config.py +0 -127
- benchflow-0.4.0/src/benchflow/rollout.py +0 -1845
- benchflow-0.4.0/src/benchflow/sandbox/_base.py +0 -189
- benchflow-0.4.0/src/benchflow/sandbox/_compose.py +0 -9
- benchflow-0.4.0/src/benchflow/sandbox/docker.py +0 -465
- benchflow-0.4.0/src/benchflow/sandbox/protocol.py +0 -74
- benchflow-0.4.0/src/benchflow/sandbox/snapshot.py +0 -85
- benchflow-0.4.0/src/benchflow/sandbox/user.py +0 -101
- benchflow-0.4.0/src/benchflow/scenes.py +0 -315
- benchflow-0.4.0/src/benchflow/task/verifier.py +0 -166
- benchflow-0.4.0/src/benchflow/templates/test.sh.tmpl +0 -12
- benchflow-0.4.0/src/benchflow/trajectories/__init__.py +0 -37
- benchflow-0.4.0/src/benchflow/trajectories/_capture.py +0 -113
- benchflow-0.4.0/src/benchflow/trajectories/proxy.py +0 -425
- benchflow-0.4.0/src/benchflow/trajectories/types.py +0 -107
- benchflow-0.4.0/tests/conformance/proof_multi_agent.py +0 -167
- benchflow-0.4.0/tests/integration/check_results.py +0 -199
- benchflow-0.4.0/tests/integration/run_suite.py +0 -439
- benchflow-0.4.0/tests/integration/suites/release.yaml +0 -262
- benchflow-0.4.0/tests/test_acp.py +0 -551
- benchflow-0.4.0/tests/test_adapter_scripts.py +0 -33
- benchflow-0.4.0/tests/test_agent_registry.py +0 -104
- benchflow-0.4.0/tests/test_agent_spec.py +0 -81
- benchflow-0.4.0/tests/test_atif_trajectory.py +0 -299
- benchflow-0.4.0/tests/test_bedrock_proxy.py +0 -375
- benchflow-0.4.0/tests/test_bedrock_runtime.py +0 -405
- benchflow-0.4.0/tests/test_docs_examples.py +0 -58
- benchflow-0.4.0/tests/test_integration_check_results.py +0 -110
- benchflow-0.4.0/tests/test_integration_run_suite.py +0 -261
- benchflow-0.4.0/tests/test_job.py +0 -298
- benchflow-0.4.0/tests/test_llm_judge.py +0 -488
- benchflow-0.4.0/tests/test_oracle_chokepoint.py +0 -469
- benchflow-0.4.0/tests/test_process.py +0 -296
- benchflow-0.4.0/tests/test_provider_runtime.py +0 -224
- benchflow-0.4.0/tests/test_resolve_env_helpers.py +0 -481
- benchflow-0.4.0/tests/test_rollout_upload.py +0 -70
- benchflow-0.4.0/tests/test_sandbox.py +0 -97
- benchflow-0.4.0/tests/test_scene.py +0 -198
- benchflow-0.4.0/tests/test_scene_outbox_trial.py +0 -503
- benchflow-0.4.0/tests/test_scoring.py +0 -102
- benchflow-0.4.0/tests/test_task_download.py +0 -186
- benchflow-0.4.0/tests/test_tasks.py +0 -159
- benchflow-0.4.0/tests/test_trial_bedrock_proxy.py +0 -129
- benchflow-0.4.0/tests/test_trial_install_agent_timeout.py +0 -129
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/LICENSE +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_dotenv.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/__init__.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/inspect_ai.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/pi_acp_launcher.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/__init__.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/__init__.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/hooks.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/py.typed +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/README.md +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/rubric.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_sdk_ops.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/services.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/skills.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/env.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/paths.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/task.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/__init__.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/local.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/models.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/viewer.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/__init__.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/README.md +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/proof_snapshot.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/run_conformance.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conftest.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_codex_custom_provider.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_openai_responses_server.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_harvey_lab_shim.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_mock_openai_responses_server.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_notification_order_real.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_oracle.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_protocol.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_integration.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_skills.py +0 -0
- {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_trajectory_integration.py +0 -0
|
@@ -2,6 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- **Daytona usage telemetry by default** — Daytona runs now start a sandbox-local provider usage proxy so token/cost telemetry works without an external tunnel; use `--usage-tracking off` to bypass proxying when needed.
|
|
8
|
+
- **Azure AI Foundry providers** — new `azure-foundry-openai/` and `azure-foundry-anthropic/` prefixes routing through Foundry's unified resource. Export `AZURE_API_KEY` plus `AZURE_API_ENDPOINT` (e.g. `https://<resource>.openai.azure.com/`); benchflow derives the resource name from the endpoint host, builds the per-surface base URL, and maps the key onto the agent-native auth env automatically. Missing/unrecognized endpoints and unsupported agent/provider protocol pairings fail fast with clear errors instead of falling through to the wrong endpoint.
|
|
9
|
+
- **Azure Foundry auth guidance** — agent discovery output and docs now call out that provider-prefixed models can use provider-specific credentials instead of the agent's native/default API key.
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
|
|
13
|
+
- Inherit `BENCHFLOW_PROVIDER_BASE_URL` / `BENCHFLOW_PROVIDER_API_KEY` from the host environment so self-hosted / OpenAI-compatible endpoints route correctly instead of falling back to `api.openai.com`; empty or whitespace-only host values are skipped so they cannot shadow the resolved provider URL (benchflow-ai/skillsbench#817).
|
|
14
|
+
|
|
5
15
|
## 0.3.3 — 2026-05-15
|
|
6
16
|
|
|
7
17
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.1.dev869
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -18,22 +18,30 @@ Classifier: Programming Language :: Python :: 3
|
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.13
|
|
20
20
|
Requires-Python: >=3.12
|
|
21
|
+
Requires-Dist: agent-client-protocol>=0.10
|
|
21
22
|
Requires-Dist: anyio>=4.0
|
|
22
23
|
Requires-Dist: httpx>=0.27.0
|
|
23
|
-
Requires-Dist:
|
|
24
|
+
Requires-Dist: litellm[proxy]==1.88.0rc1
|
|
25
|
+
Requires-Dist: pydantic>=2.7
|
|
24
26
|
Requires-Dist: pyyaml>=6.0
|
|
25
27
|
Requires-Dist: rich>=13.0
|
|
28
|
+
Requires-Dist: tomli-w>=1.0
|
|
26
29
|
Requires-Dist: typer>=0.9
|
|
27
30
|
Provides-Extra: bedrock
|
|
28
31
|
Requires-Dist: boto3>=1.40; extra == 'bedrock'
|
|
29
32
|
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: packaging>=24; extra == 'dev'
|
|
30
34
|
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
31
35
|
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
32
36
|
Requires-Dist: pytest>=9.0.3; extra == 'dev'
|
|
33
37
|
Requires-Dist: ruff>=0.7.0; extra == 'dev'
|
|
34
38
|
Requires-Dist: ty>=0.0.1a1; extra == 'dev'
|
|
39
|
+
Provides-Extra: judge
|
|
40
|
+
Requires-Dist: anthropic>=0.40; extra == 'judge'
|
|
41
|
+
Requires-Dist: google-genai>=1.0; extra == 'judge'
|
|
42
|
+
Requires-Dist: openai>=1.40; extra == 'judge'
|
|
35
43
|
Provides-Extra: sandbox-daytona
|
|
36
|
-
Requires-Dist: daytona>=0.
|
|
44
|
+
Requires-Dist: daytona>=0.184.0; extra == 'sandbox-daytona'
|
|
37
45
|
Requires-Dist: tenacity>=8.0; extra == 'sandbox-daytona'
|
|
38
46
|
Provides-Extra: sandbox-modal
|
|
39
47
|
Requires-Dist: modal>=0.73; extra == 'sandbox-modal'
|
|
@@ -66,7 +74,7 @@ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. Sing
|
|
|
66
74
|
uv tool install benchflow
|
|
67
75
|
```
|
|
68
76
|
|
|
69
|
-
Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth.
|
|
77
|
+
Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth. Provider-prefixed models may use provider-specific credentials; Azure Foundry models use `AZURE_API_KEY` plus `AZURE_API_ENDPOINT`.
|
|
70
78
|
|
|
71
79
|
## Documentation
|
|
72
80
|
|
|
@@ -81,6 +89,7 @@ Start with [Getting started](./docs/getting-started.md), then [Concepts](./docs/
|
|
|
81
89
|
| Multi-round single-agent (progressive disclosure, oracle access) | [Progressive disclosure](./docs/progressive-disclosure.md) |
|
|
82
90
|
| Skill evaluation (when the artifact is a skill, not a workspace) | [Skill eval](./docs/skill-eval.md) |
|
|
83
91
|
| Understand the security model | [Sandbox hardening](./docs/sandbox-hardening.md) |
|
|
92
|
+
| Use public vs internal preview SDK releases | [Release channels](./docs/release.md) |
|
|
84
93
|
| CLI flags + commands | [CLI reference](./docs/reference/cli.md) |
|
|
85
94
|
| Python API surface | [Python API reference](./docs/reference/python-api.md) |
|
|
86
95
|
|
|
@@ -91,20 +100,20 @@ Notebooks and runnable example scripts live under [`docs/examples/`](./docs/exam
|
|
|
91
100
|
Benchmark datasets live in external Git repos and are referenced with two fields:
|
|
92
101
|
|
|
93
102
|
```yaml
|
|
94
|
-
# benchmarks/
|
|
103
|
+
# benchmarks/harvey-lab/harvey-lab-gemini-flash-lite.yaml
|
|
95
104
|
source:
|
|
96
|
-
repo: benchflow-ai/
|
|
97
|
-
path: tasks
|
|
105
|
+
repo: benchflow-ai/benchmarks # GitHub org/repo
|
|
106
|
+
path: datasets/harvey-lab/tasks # optional subpath within repo
|
|
98
107
|
ref: main # optional branch/tag
|
|
99
|
-
agent:
|
|
100
|
-
model:
|
|
108
|
+
agent: gemini
|
|
109
|
+
model: gemini/gemini-3.1-flash-lite-preview
|
|
101
110
|
```
|
|
102
111
|
|
|
103
112
|
Run any benchmark via the CLI:
|
|
104
113
|
|
|
105
114
|
```bash
|
|
106
|
-
# From a YAML config
|
|
107
|
-
bench eval create --config benchmarks/
|
|
115
|
+
# From a YAML config (shipped with the repo)
|
|
116
|
+
bench eval create --config benchmarks/harvey-lab/harvey-lab-gemini-flash-lite.yaml
|
|
108
117
|
|
|
109
118
|
# Inline — mirrors the YAML source fields
|
|
110
119
|
bench eval create \
|
|
@@ -114,10 +123,9 @@ bench eval create \
|
|
|
114
123
|
|
|
115
124
|
Repos are cloned and cached locally under `.cache/datasets/` on first use.
|
|
116
125
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
|
|
126
|
+
Downstream projects should depend on the public PyPI release by default. For
|
|
127
|
+
internal validation before the next public release, install or lock the internal
|
|
128
|
+
preview channel with prereleases enabled; see [Release channels](./docs/release.md).
|
|
121
129
|
|
|
122
130
|
## Featured
|
|
123
131
|
|
|
@@ -141,7 +149,9 @@ Two runnable labs validate the security story:
|
|
|
141
149
|
|
|
142
150
|
PRs welcome. Open against `main`. CI runs ruff + tests on every PR; please run `ruff check .` and `pytest tests/` locally first.
|
|
143
151
|
|
|
144
|
-
|
|
152
|
+
Release channels are documented in [Release channels](./docs/release.md). In
|
|
153
|
+
short: merges to `main` publish an internal preview after CI passes, while a
|
|
154
|
+
matching `v<version>` tag publishes the public release.
|
|
145
155
|
|
|
146
156
|
## License
|
|
147
157
|
|
|
@@ -24,7 +24,7 @@ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. Sing
|
|
|
24
24
|
uv tool install benchflow
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth.
|
|
27
|
+
Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth. Provider-prefixed models may use provider-specific credentials; Azure Foundry models use `AZURE_API_KEY` plus `AZURE_API_ENDPOINT`.
|
|
28
28
|
|
|
29
29
|
## Documentation
|
|
30
30
|
|
|
@@ -39,6 +39,7 @@ Start with [Getting started](./docs/getting-started.md), then [Concepts](./docs/
|
|
|
39
39
|
| Multi-round single-agent (progressive disclosure, oracle access) | [Progressive disclosure](./docs/progressive-disclosure.md) |
|
|
40
40
|
| Skill evaluation (when the artifact is a skill, not a workspace) | [Skill eval](./docs/skill-eval.md) |
|
|
41
41
|
| Understand the security model | [Sandbox hardening](./docs/sandbox-hardening.md) |
|
|
42
|
+
| Use public vs internal preview SDK releases | [Release channels](./docs/release.md) |
|
|
42
43
|
| CLI flags + commands | [CLI reference](./docs/reference/cli.md) |
|
|
43
44
|
| Python API surface | [Python API reference](./docs/reference/python-api.md) |
|
|
44
45
|
|
|
@@ -49,20 +50,20 @@ Notebooks and runnable example scripts live under [`docs/examples/`](./docs/exam
|
|
|
49
50
|
Benchmark datasets live in external Git repos and are referenced with two fields:
|
|
50
51
|
|
|
51
52
|
```yaml
|
|
52
|
-
# benchmarks/
|
|
53
|
+
# benchmarks/harvey-lab/harvey-lab-gemini-flash-lite.yaml
|
|
53
54
|
source:
|
|
54
|
-
repo: benchflow-ai/
|
|
55
|
-
path: tasks
|
|
55
|
+
repo: benchflow-ai/benchmarks # GitHub org/repo
|
|
56
|
+
path: datasets/harvey-lab/tasks # optional subpath within repo
|
|
56
57
|
ref: main # optional branch/tag
|
|
57
|
-
agent:
|
|
58
|
-
model:
|
|
58
|
+
agent: gemini
|
|
59
|
+
model: gemini/gemini-3.1-flash-lite-preview
|
|
59
60
|
```
|
|
60
61
|
|
|
61
62
|
Run any benchmark via the CLI:
|
|
62
63
|
|
|
63
64
|
```bash
|
|
64
|
-
# From a YAML config
|
|
65
|
-
bench eval create --config benchmarks/
|
|
65
|
+
# From a YAML config (shipped with the repo)
|
|
66
|
+
bench eval create --config benchmarks/harvey-lab/harvey-lab-gemini-flash-lite.yaml
|
|
66
67
|
|
|
67
68
|
# Inline — mirrors the YAML source fields
|
|
68
69
|
bench eval create \
|
|
@@ -72,10 +73,9 @@ bench eval create \
|
|
|
72
73
|
|
|
73
74
|
Repos are cloned and cached locally under `.cache/datasets/` on first use.
|
|
74
75
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
|
|
76
|
+
Downstream projects should depend on the public PyPI release by default. For
|
|
77
|
+
internal validation before the next public release, install or lock the internal
|
|
78
|
+
preview channel with prereleases enabled; see [Release channels](./docs/release.md).
|
|
79
79
|
|
|
80
80
|
## Featured
|
|
81
81
|
|
|
@@ -99,7 +99,9 @@ Two runnable labs validate the security story:
|
|
|
99
99
|
|
|
100
100
|
PRs welcome. Open against `main`. CI runs ruff + tests on every PR; please run `ruff check .` and `pytest tests/` locally first.
|
|
101
101
|
|
|
102
|
-
|
|
102
|
+
Release channels are documented in [Release channels](./docs/release.md). In
|
|
103
|
+
short: merges to `main` publish an internal preview after CI passes, while a
|
|
104
|
+
matching `v<version>` tag publishes the public release.
|
|
103
105
|
|
|
104
106
|
## License
|
|
105
107
|
|
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "benchflow"
|
|
3
|
-
version = "0.4.0"
|
|
3
|
+
version = "0.5.1.dev869"
|
|
4
4
|
description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
7
7
|
keywords = ["benchmark", "llm-agents", "acp", "agent-evaluation", "multi-turn", "skillsbench"]
|
|
8
8
|
dependencies = [
|
|
9
|
+
"agent-client-protocol>=0.10",
|
|
9
10
|
"httpx>=0.27.0",
|
|
10
11
|
"anyio>=4.0",
|
|
11
|
-
"pydantic>=2.
|
|
12
|
+
"pydantic>=2.7",
|
|
12
13
|
"pyyaml>=6.0",
|
|
13
14
|
"rich>=13.0",
|
|
15
|
+
"litellm[proxy]==1.88.0rc1",
|
|
16
|
+
"tomli-w>=1.0",
|
|
14
17
|
"typer>=0.9",
|
|
15
18
|
]
|
|
16
19
|
authors = [
|
|
@@ -35,6 +38,7 @@ classifiers = [
|
|
|
35
38
|
|
|
36
39
|
[project.optional-dependencies]
|
|
37
40
|
dev = [
|
|
41
|
+
"packaging>=24",
|
|
38
42
|
"pre-commit>=3.7",
|
|
39
43
|
"pytest>=9.0.3",
|
|
40
44
|
"pytest-asyncio>=0.24.0",
|
|
@@ -42,7 +46,12 @@ dev = [
|
|
|
42
46
|
"ty>=0.0.1a1",
|
|
43
47
|
]
|
|
44
48
|
sandbox-daytona = [
|
|
45
|
-
|
|
49
|
+
# >=0.183: list() returns an auto-paginating Iterator[Sandbox] (the older
|
|
50
|
+
# paged list(page=, limit=) -> .items API was removed).
|
|
51
|
+
# >=0.184: the top-level sync `Daytona` export is present (0.176-0.183 only
|
|
52
|
+
# shipped `AsyncDaytona`); the dashboard's daytona_status.snapshot() uses the
|
|
53
|
+
# sync client, so this floor is required for that panel to import.
|
|
54
|
+
"daytona>=0.184.0",
|
|
46
55
|
"tenacity>=8.0",
|
|
47
56
|
]
|
|
48
57
|
sandbox-modal = [
|
|
@@ -52,6 +61,13 @@ sandbox-modal = [
|
|
|
52
61
|
bedrock = [
|
|
53
62
|
"boto3>=1.40",
|
|
54
63
|
]
|
|
64
|
+
# Provider SDKs for the llm-judge verifier (type = "llm-judge").
|
|
65
|
+
# llm.py routes judge calls across all three; install at least one.
|
|
66
|
+
judge = [
|
|
67
|
+
"anthropic>=0.40",
|
|
68
|
+
"openai>=1.40",
|
|
69
|
+
"google-genai>=1.0",
|
|
70
|
+
]
|
|
55
71
|
|
|
56
72
|
[project.scripts]
|
|
57
73
|
benchflow = "benchflow.cli.main:app"
|
|
@@ -90,7 +106,13 @@ markers = [
|
|
|
90
106
|
|
|
91
107
|
[tool.ruff]
|
|
92
108
|
target-version = "py312"
|
|
93
|
-
|
|
109
|
+
# Vendored third-party service packages baked into task images (e.g. the
|
|
110
|
+
# smolclaws claw-* sources copied under a ClawsBench task's environment/) are
|
|
111
|
+
# not BenchFlow code — do not lint them.
|
|
112
|
+
extend-exclude = [
|
|
113
|
+
".claude/skills/skill-creator",
|
|
114
|
+
"benchmarks/**/tasks/**/environment/claw-*",
|
|
115
|
+
]
|
|
94
116
|
|
|
95
117
|
[tool.ruff.lint]
|
|
96
118
|
select = [
|
|
@@ -127,7 +149,7 @@ python-version = "3.12"
|
|
|
127
149
|
unresolved-import = "ignore"
|
|
128
150
|
|
|
129
151
|
[tool.ty.src]
|
|
130
|
-
include = ["src"]
|
|
152
|
+
include = ["src", "tools"]
|
|
131
153
|
# Modules that heavily use optional-dep types (daytona, modal, openai, boto3, …)
|
|
132
154
|
# produce cascading type errors when those packages aren't installed.
|
|
133
155
|
exclude = [
|
|
@@ -139,6 +161,5 @@ exclude = [
|
|
|
139
161
|
"src/benchflow/rewards/llm.py",
|
|
140
162
|
"src/benchflow/rewards/file_readers.py",
|
|
141
163
|
"src/benchflow/rewards/rubric_config.py",
|
|
142
|
-
"src/benchflow/providers/bedrock_runtime.py",
|
|
143
164
|
"src/benchflow/experimental/mcp/reviewer_server.py",
|
|
144
165
|
]
|
|
@@ -3,16 +3,20 @@
|
|
|
3
3
|
Public API surface:
|
|
4
4
|
- Sandbox protocol for isolated execution environments
|
|
5
5
|
- ACP client for multi-turn agent communication
|
|
6
|
-
- Trajectory capture (
|
|
6
|
+
- Trajectory capture (LiteLLM callbacks, OTel collector, ACP native)
|
|
7
7
|
- Rollout lifecycle for single-task execution
|
|
8
8
|
- Evaluation orchestration with retries and concurrency
|
|
9
9
|
- Rewards protocol (composable Rubric + RewardFunc)
|
|
10
10
|
- Metrics collection and aggregation
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
|
+
from importlib.metadata import PackageNotFoundError
|
|
13
14
|
from importlib.metadata import version as _version
|
|
14
15
|
|
|
15
|
-
|
|
16
|
+
try:
|
|
17
|
+
__version__ = _version("benchflow")
|
|
18
|
+
except PackageNotFoundError:
|
|
19
|
+
__version__ = "0+unknown"
|
|
16
20
|
|
|
17
21
|
# Core types
|
|
18
22
|
from benchflow._types import Role, Scene, Turn
|
|
@@ -33,6 +37,12 @@ from benchflow.agents.registry import (
|
|
|
33
37
|
list_agents,
|
|
34
38
|
register_agent,
|
|
35
39
|
)
|
|
40
|
+
from benchflow.contracts.user import (
|
|
41
|
+
BaseUser,
|
|
42
|
+
FunctionUser,
|
|
43
|
+
PassthroughUser,
|
|
44
|
+
RoundResult,
|
|
45
|
+
)
|
|
36
46
|
from benchflow.evaluation import (
|
|
37
47
|
Evaluation,
|
|
38
48
|
EvaluationConfig,
|
|
@@ -41,13 +51,23 @@ from benchflow.evaluation import (
|
|
|
41
51
|
)
|
|
42
52
|
from benchflow.metrics import BenchmarkMetrics, collect_metrics
|
|
43
53
|
from benchflow.models import AgentInstallError, AgentTimeoutError, RolloutResult
|
|
54
|
+
from benchflow.monitor import (
|
|
55
|
+
Monitor,
|
|
56
|
+
MonitorConfig,
|
|
57
|
+
MonitorNotImplementedError,
|
|
58
|
+
MonitorResult,
|
|
59
|
+
)
|
|
44
60
|
|
|
45
|
-
# Rewards
|
|
61
|
+
# Rewards plane. Reward is the canonical node-based contract
|
|
62
|
+
# (``score(node) -> VerifyResult``); RewardFunc is the legacy path-based shape
|
|
63
|
+
# (``score(rollout_dir) -> float``) adapted into Reward via PathReward.
|
|
46
64
|
from benchflow.rewards import (
|
|
47
65
|
CodeExecRewardFunc,
|
|
48
66
|
Criterion,
|
|
49
67
|
JudgeConfig,
|
|
50
68
|
LLMJudgeRewardFunc,
|
|
69
|
+
PathReward,
|
|
70
|
+
Reward,
|
|
51
71
|
RewardEvent,
|
|
52
72
|
RewardFunc,
|
|
53
73
|
Rubric,
|
|
@@ -56,6 +76,8 @@ from benchflow.rewards import (
|
|
|
56
76
|
StringMatchRewardFunc,
|
|
57
77
|
TestRewardFunc,
|
|
58
78
|
VerifyResult,
|
|
79
|
+
load_rubric,
|
|
80
|
+
load_rubric_json,
|
|
59
81
|
load_rubric_toml,
|
|
60
82
|
)
|
|
61
83
|
from benchflow.rollout import Rollout, RolloutConfig
|
|
@@ -73,6 +95,8 @@ from benchflow.sandbox import (
|
|
|
73
95
|
ImageConfig,
|
|
74
96
|
ImageRef,
|
|
75
97
|
Sandbox,
|
|
98
|
+
SandboxImage,
|
|
99
|
+
SandboxSnapshotNotSupported,
|
|
76
100
|
build_service_hooks,
|
|
77
101
|
detect_services_from_dockerfile,
|
|
78
102
|
register_service,
|
|
@@ -82,10 +106,15 @@ from benchflow.sandbox import (
|
|
|
82
106
|
from benchflow.sandbox import ExecResult as SandboxExecResult
|
|
83
107
|
from benchflow.sandbox.protocol import ExecResult
|
|
84
108
|
from benchflow.sandbox.setup import stage_dockerfile_deps
|
|
85
|
-
from benchflow.sandbox.snapshot import
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
109
|
+
from benchflow.sandbox.snapshot import (
|
|
110
|
+
list_snapshots,
|
|
111
|
+
list_workspace_snapshots,
|
|
112
|
+
restore,
|
|
113
|
+
snapshot,
|
|
114
|
+
workspace_restore,
|
|
115
|
+
workspace_snapshot,
|
|
116
|
+
)
|
|
117
|
+
from benchflow.scenes import compile_scenes_to_steps
|
|
89
118
|
from benchflow.sdk import SDK
|
|
90
119
|
from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
|
|
91
120
|
from benchflow.task import (
|
|
@@ -95,17 +124,18 @@ from benchflow.task import (
|
|
|
95
124
|
VerifierResult,
|
|
96
125
|
)
|
|
97
126
|
from benchflow.trajectories.otel import OTelCollector
|
|
98
|
-
from benchflow.trajectories.proxy import TrajectoryProxy
|
|
99
127
|
from benchflow.trajectories.types import Trajectory
|
|
100
128
|
|
|
101
129
|
# Public API surface. Anything not in this list is implementation detail and
|
|
102
130
|
# may change without notice.
|
|
103
131
|
__all__ = [
|
|
104
132
|
"__version__",
|
|
105
|
-
# Rewards
|
|
133
|
+
# Rewards plane
|
|
134
|
+
"Reward",
|
|
106
135
|
"Rubric",
|
|
107
136
|
"RewardFunc",
|
|
108
137
|
"RewardEvent",
|
|
138
|
+
"PathReward",
|
|
109
139
|
"VerifyResult",
|
|
110
140
|
"TestRewardFunc",
|
|
111
141
|
"LLMJudgeRewardFunc",
|
|
@@ -115,10 +145,14 @@ __all__ = [
|
|
|
115
145
|
"JudgeConfig",
|
|
116
146
|
"RubricConfig",
|
|
117
147
|
"ScoringConfig",
|
|
148
|
+
"load_rubric",
|
|
149
|
+
"load_rubric_json",
|
|
118
150
|
"load_rubric_toml",
|
|
119
151
|
# Sandbox protocol
|
|
120
152
|
"Sandbox",
|
|
121
153
|
"SandboxExecResult",
|
|
154
|
+
"SandboxImage",
|
|
155
|
+
"SandboxSnapshotNotSupported",
|
|
122
156
|
"ImageBuilder",
|
|
123
157
|
"ImageConfig",
|
|
124
158
|
"ImageRef",
|
|
@@ -149,6 +183,11 @@ __all__ = [
|
|
|
149
183
|
"AgentInstallError",
|
|
150
184
|
"AgentTimeoutError",
|
|
151
185
|
"RolloutResult",
|
|
186
|
+
# Monitor mode — scaffolded API surface (#386)
|
|
187
|
+
"Monitor",
|
|
188
|
+
"MonitorConfig",
|
|
189
|
+
"MonitorResult",
|
|
190
|
+
"MonitorNotImplementedError",
|
|
152
191
|
# Runtime
|
|
153
192
|
"Agent",
|
|
154
193
|
"Environment",
|
|
@@ -161,13 +200,13 @@ __all__ = [
|
|
|
161
200
|
"Role",
|
|
162
201
|
"Scene",
|
|
163
202
|
"Turn",
|
|
164
|
-
#
|
|
165
|
-
"
|
|
166
|
-
|
|
167
|
-
"
|
|
168
|
-
"
|
|
169
|
-
"
|
|
170
|
-
#
|
|
203
|
+
# Scene authoring desugaring
|
|
204
|
+
"compile_scenes_to_steps",
|
|
205
|
+
# Workspace snapshots (filesystem helper — NOT the Sandbox primitive, #384)
|
|
206
|
+
"workspace_snapshot",
|
|
207
|
+
"workspace_restore",
|
|
208
|
+
"list_workspace_snapshots",
|
|
209
|
+
# Backward-compatible aliases for the above (pre-#384 names)
|
|
171
210
|
"snapshot",
|
|
172
211
|
"restore",
|
|
173
212
|
"list_snapshots",
|
|
@@ -195,7 +234,6 @@ __all__ = [
|
|
|
195
234
|
"parse_skill",
|
|
196
235
|
# Trajectories
|
|
197
236
|
"OTelCollector",
|
|
198
|
-
"TrajectoryProxy",
|
|
199
237
|
"Trajectory",
|
|
200
238
|
# External adapters
|
|
201
239
|
"InspectAdapter",
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""Path safety helpers — reject unsafe inputs and refuse to follow symlinks.
|
|
2
|
+
|
|
3
|
+
Two independent helper sets live here:
|
|
4
|
+
|
|
5
|
+
1. **Segment validation** (``safe_path_segment``, ``assert_within``):
|
|
6
|
+
Reject user-controlled strings (case ids, skill names) that would traverse
|
|
7
|
+
outside the intended tree.
|
|
8
|
+
|
|
9
|
+
2. **Symlink defense** (``is_safe_regular_file``, ``iter_safe_tree``, etc.):
|
|
10
|
+
Walk directories we do not own without following symlinks, so an
|
|
11
|
+
attacker-placed link cannot pull host files into dashboard payloads,
|
|
12
|
+
judge prompts, or sandbox uploads.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
import stat
|
|
20
|
+
from collections.abc import Iterator
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"safe_path_segment",
|
|
25
|
+
"assert_within",
|
|
26
|
+
"is_safe_regular_file",
|
|
27
|
+
"is_safe_regular_dir",
|
|
28
|
+
"iter_safe_children",
|
|
29
|
+
"iter_safe_tree",
|
|
30
|
+
"ignore_symlinks",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ── Segment validation ───────────────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def safe_path_segment(name: str, *, kind: str = "name") -> str:
|
|
40
|
+
"""Return ``name`` unchanged if safe as a single path segment.
|
|
41
|
+
|
|
42
|
+
Raises :class:`ValueError` for inputs that cannot be used as a directory
|
|
43
|
+
or file name without risking path traversal or shell ambiguity.
|
|
44
|
+
|
|
45
|
+
Rejected forms:
|
|
46
|
+
|
|
47
|
+
* empty string
|
|
48
|
+
* ``.`` or ``..`` (current/parent directory references)
|
|
49
|
+
* any string containing ``/`` or ``\\`` (multi-segment paths)
|
|
50
|
+
* any string containing a NUL byte
|
|
51
|
+
* leading or trailing whitespace
|
|
52
|
+
* leading ``-`` (would be interpreted as a CLI flag by downstream tools)
|
|
53
|
+
|
|
54
|
+
All other Unicode is accepted; this is a security boundary, not a
|
|
55
|
+
cosmetic slugifier. Callers that want forgiving behaviour should slugify
|
|
56
|
+
*before* calling this function.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
name: The candidate path segment.
|
|
60
|
+
kind: A human label used in the error message (e.g. ``"case id"``,
|
|
61
|
+
``"skill name"``).
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
The input ``name`` unchanged.
|
|
65
|
+
|
|
66
|
+
Raises:
|
|
67
|
+
ValueError: If ``name`` is not safe as a single path segment.
|
|
68
|
+
"""
|
|
69
|
+
if not isinstance(name, str):
|
|
70
|
+
raise ValueError(f"{kind} must be a string, got {type(name).__name__}")
|
|
71
|
+
if name == "":
|
|
72
|
+
raise ValueError(f"{kind} must not be empty")
|
|
73
|
+
if name in (".", ".."):
|
|
74
|
+
raise ValueError(f"{kind} must not be '.' or '..' (got {name!r})")
|
|
75
|
+
if "/" in name or "\\" in name:
|
|
76
|
+
raise ValueError(f"{kind} must not contain path separators (got {name!r})")
|
|
77
|
+
if "\x00" in name:
|
|
78
|
+
raise ValueError(f"{kind} must not contain NUL bytes (got {name!r})")
|
|
79
|
+
if name != name.strip():
|
|
80
|
+
raise ValueError(
|
|
81
|
+
f"{kind} must not have leading or trailing whitespace (got {name!r})"
|
|
82
|
+
)
|
|
83
|
+
if name.startswith("-"):
|
|
84
|
+
raise ValueError(
|
|
85
|
+
f"{kind} must not start with '-' (got {name!r}); "
|
|
86
|
+
"would be misread as a CLI flag"
|
|
87
|
+
)
|
|
88
|
+
return name
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def assert_within(child: Path, root: Path) -> Path:
|
|
92
|
+
"""Resolve both paths and assert ``child`` is under ``root``.
|
|
93
|
+
|
|
94
|
+
Uses :meth:`Path.resolve` so symlinks are followed and ``..`` segments
|
|
95
|
+
collapsed before the containment check. Returns the resolved child.
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
child: A path that should be inside ``root``.
|
|
99
|
+
root: The directory ``child`` must not escape.
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
The resolved ``child`` path.
|
|
103
|
+
|
|
104
|
+
Raises:
|
|
105
|
+
ValueError: If the resolved ``child`` is not under the resolved
|
|
106
|
+
``root``.
|
|
107
|
+
"""
|
|
108
|
+
resolved_root = root.resolve()
|
|
109
|
+
resolved_child = child.resolve()
|
|
110
|
+
try:
|
|
111
|
+
resolved_child.relative_to(resolved_root)
|
|
112
|
+
except ValueError as exc:
|
|
113
|
+
raise ValueError(
|
|
114
|
+
f"path {child} resolves to {resolved_child}, "
|
|
115
|
+
f"which is outside {resolved_root}"
|
|
116
|
+
) from exc
|
|
117
|
+
return resolved_child
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# ── Symlink defense ──────────────────────────────────────────────────
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def is_safe_regular_file(path: Path) -> bool:
|
|
124
|
+
"""True if *path* exists, is a regular file, and is not a symlink.
|
|
125
|
+
|
|
126
|
+
Uses ``os.lstat`` so symlinks, fifos, sockets, and device files all
|
|
127
|
+
return False. A non-existent path also returns False.
|
|
128
|
+
"""
|
|
129
|
+
try:
|
|
130
|
+
st = os.lstat(path)
|
|
131
|
+
except OSError:
|
|
132
|
+
return False
|
|
133
|
+
return stat.S_ISREG(st.st_mode) and not stat.S_ISLNK(st.st_mode)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def is_safe_regular_dir(path: Path) -> bool:
|
|
137
|
+
"""True if *path* is a directory and not a symlink to one."""
|
|
138
|
+
try:
|
|
139
|
+
st = os.lstat(path)
|
|
140
|
+
except OSError:
|
|
141
|
+
return False
|
|
142
|
+
return stat.S_ISDIR(st.st_mode) and not stat.S_ISLNK(st.st_mode)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def iter_safe_children(
|
|
146
|
+
directory: Path,
|
|
147
|
+
*,
|
|
148
|
+
context: str = "directory walk",
|
|
149
|
+
) -> Iterator[Path]:
|
|
150
|
+
"""Yield direct children of *directory*, skipping symlinks with a warning."""
|
|
151
|
+
try:
|
|
152
|
+
entries = sorted(directory.iterdir())
|
|
153
|
+
except (OSError, NotADirectoryError):
|
|
154
|
+
return
|
|
155
|
+
for child in entries:
|
|
156
|
+
if child.is_symlink():
|
|
157
|
+
logger.warning(
|
|
158
|
+
"%s: skipping symlink %s (refusing to follow)", context, child
|
|
159
|
+
)
|
|
160
|
+
continue
|
|
161
|
+
yield child
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def iter_safe_tree(
|
|
165
|
+
root: Path,
|
|
166
|
+
*,
|
|
167
|
+
context: str = "tree walk",
|
|
168
|
+
) -> Iterator[Path]:
|
|
169
|
+
"""Recursively yield regular files under *root*, never following symlinks.
|
|
170
|
+
|
|
171
|
+
Uses ``os.walk(followlinks=False)`` so directory symlinks are also not
|
|
172
|
+
descended into.
|
|
173
|
+
"""
|
|
174
|
+
if not is_safe_regular_dir(root):
|
|
175
|
+
if Path(root).is_symlink():
|
|
176
|
+
logger.warning(
|
|
177
|
+
"%s: refusing to descend into symlinked root %s", context, root
|
|
178
|
+
)
|
|
179
|
+
return
|
|
180
|
+
for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
|
|
181
|
+
base = Path(dirpath)
|
|
182
|
+
kept_dirs: list[str] = []
|
|
183
|
+
for name in dirnames:
|
|
184
|
+
child = base / name
|
|
185
|
+
if child.is_symlink():
|
|
186
|
+
logger.warning(
|
|
187
|
+
"%s: skipping symlinked directory %s (refusing to follow)",
|
|
188
|
+
context,
|
|
189
|
+
child,
|
|
190
|
+
)
|
|
191
|
+
continue
|
|
192
|
+
kept_dirs.append(name)
|
|
193
|
+
dirnames[:] = sorted(kept_dirs)
|
|
194
|
+
for name in sorted(filenames):
|
|
195
|
+
f = base / name
|
|
196
|
+
if not is_safe_regular_file(f):
|
|
197
|
+
logger.warning(
|
|
198
|
+
"%s: skipping non-regular path %s (symlink or special file)",
|
|
199
|
+
context,
|
|
200
|
+
f,
|
|
201
|
+
)
|
|
202
|
+
continue
|
|
203
|
+
yield f
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def ignore_symlinks(directory: str, contents: list[str]) -> list[str]:
|
|
207
|
+
"""``shutil.copytree`` ``ignore=`` callback that drops every symlink."""
|
|
208
|
+
skipped: list[str] = []
|
|
209
|
+
for name in contents:
|
|
210
|
+
if Path(directory, name).is_symlink():
|
|
211
|
+
skipped.append(name)
|
|
212
|
+
if skipped:
|
|
213
|
+
logger.warning(
|
|
214
|
+
"copytree: skipping symlinked entries under %s: %s",
|
|
215
|
+
directory,
|
|
216
|
+
", ".join(sorted(skipped)),
|
|
217
|
+
)
|
|
218
|
+
return skipped
|