benchflow 0.5.3.dev894__tar.gz → 0.5.3.dev902__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/PKG-INFO +1 -1
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/pyproject.toml +1 -1
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/registry.py +13 -1
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_agent_registry.py +16 -2
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_agent_setup.py +3 -1
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/.gitignore +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/CHANGELOG.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/LICENSE +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/README.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_dotenv.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_paths.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_run.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_types.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/benchmark_repos.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/config.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/evaluation_results.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/json_safe.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/learner_memory.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/result_metadata.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/reward_events.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/scoring.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/source_provenance.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/task_authoring.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/yaml_loader.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/acp/client.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/acp/runtime.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/acp/transport.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/adapters/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/adapters/harbor.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/adapters/inbound.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/adapters/inspect_ai.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/adapters/ors.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/adapters/terminal_bench.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/codex_config.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/credentials.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/env.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/errors.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/install.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/protocol.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/providers.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/branch.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/cli/main.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/cli/trace_import.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/compat/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/compat/harbor_registry.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/contracts/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/contracts/planes.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/contracts/user.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/diagnostics.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/environment/manifest.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/environment/manifest_env.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/environment/protocol.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/environment/readiness.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/eval_sharding.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/eval_worker.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/evaluation.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/experimental/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/experimental/mcp/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/experimental/mcp/hooks.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/hosted_env.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/learner_skills.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/learner_store.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/metrics.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/models.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/monitor.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/providers/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/providers/litellm_config.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/providers/litellm_logging.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/providers/litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/providers/runtime.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/py.typed +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/README.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/builtins.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/events.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/file_readers.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/llm.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/memory_scorer.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/node.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/protocol.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/rubric.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/rubric_config.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rewards/validation.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rollout.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rollout_branch.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/rollout_planes.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/runtime.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/_base.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/_compose.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/_sdk_ops.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/daytona.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/docker.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/lockdown.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/metadata.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/modal_impl.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/process.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/protocol.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/services.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/setup.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/snapshot.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sandbox/user.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/scenes.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/sdk.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/self_gen.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/skill_eval/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/skill_eval/_core.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/skill_eval/gepa_export.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/skill_eval/schema.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/skill_policy.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/skills.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/task/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/task/config.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/task/env.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/task/paths.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/task/task.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/task/verifier.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/templates/test.sh.tmpl +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/traces/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/traces/huggingface.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/traces/local.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/traces/models.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/traces/parsers.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/traces/task_gen.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/trajectories/_capture.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/trajectories/export.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/trajectories/metrics.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/trajectories/tree.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/trajectories/viewer.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/usage_tracking.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/agents/test_protocol.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/README.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/task.toml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/proof_multi_agent.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/proof_snapshot.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/run_conformance.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conftest.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/environment/test_chibench_manifest.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/environment/test_clawsbench_manifest.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/environment/test_manifest.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/environment/test_manifest_env.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/environment/test_protocol.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/environment/test_readiness.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/test_codex_custom_provider.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/traces/minimal-claude.jsonl +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/fixtures/mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/check_adapter_evidence.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/check_hosted_env_evidence.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/check_results.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/check_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/configs/claude-agent-acp.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/configs/codex-acp.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/configs/gemini.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/configs/openclaw.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/configs/opencode.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/configs/openhands.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/configs/pi-acp.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/run.sh +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/run_suite.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/integration/suites/release.yaml +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_acp.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_acp_capability_advertising.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_acp_model_config_dispatch.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_acp_pinned_protocol_guard.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_acp_setup_failure_propagation.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_adapter_scripts.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_adapters.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_agent_cli.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_agent_env_resolution.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_agent_gemini_defaults.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_agent_idle_timeout_cli.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_agent_spec.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_base_install_imports.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_bedrock_thinking.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_branch.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_capture_trajectory.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_clawsbench_slice.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_cli_daytona.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_cli_docs_drift.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_cli_misc.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_compat_harbor_registry.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_config_redaction.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_connect_as_env.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_continuallearningbench_adapter.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_dashboard_credential_env_scrub.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_dashboard_daytona_key.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_dashboard_no_host_paths.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_dashboard_release_evidence.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_dashboard_roadmap.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_dashboard_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_dashboard_sync.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_daytona_command_polling.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_daytona_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_daytona_status.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_docker_prune_scoping.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_docker_uploads.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_docs_examples.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_eng50_capabilities.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_env_setup.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_environment_manifest_controls.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_eval_filters_applied.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_eval_sharding.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_eval_single_task_summary.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_eval_source_provenance.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_eval_worker_retry.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_eval_zero_task_guard.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_evaluation_environment_manifest.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_experiments_status.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_harvey_lab_shim.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_hf_scores.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_hilbench_adapter.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_hosted_env.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_hosted_env_rollout_contract.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_inbound_adapter_manifest.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_inbound_adapters.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_integration_check_results.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_integration_run_suite.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_internet_policy.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_job.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_job_sequential_shared.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_job_sequential_shared_resume.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_judge_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_learner_skills.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_learner_skills_traversal.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_learner_store.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_learner_store_persistence.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_litellm_config.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_litellm_hardening.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_litellm_logging.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_litellm_smoke.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_llm_judge.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_llm_judge_event_tags.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_llm_judge_verifier.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_memory_scorer.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_metrics.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_monitor_scaffold.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_native_acp_usage.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_no_cross_provider_fallback.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_notification_order_real.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_oracle.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_oracle_chokepoint.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_paths_safe.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_paths_symlink_helpers.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_process.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_provider_auth_detection.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_providers.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_reexport.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_registry_invariants.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_release_version.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_resolve_env_helpers.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_reward_node.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_reward_unified_contract.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rewards.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rewards_jsonl.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rollout_architecture.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rollout_branch.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rollout_config_path_coercion.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rollout_environment.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rollout_import_no_side_effects.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rollout_on_ask_user_wiring.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rollout_probe_sandbox_health.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rollout_upload.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_rubric_config.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_runtime.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_runtime_config_wired.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_runtime_live_sandbox.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox_exec_secret_handling.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox_hardening.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox_multi_service.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox_protocol.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox_setup.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox_snapshot_contract.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox_upload_symlink.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sandbox_verifier_workspace.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_scene.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_scene_outbox_trial.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_scene_parallel_group.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_scene_result_aggregation.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_scoring.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sdk_internals.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_sdk_lockdown.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_self_gen_cli.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_self_gen_export_error_channel.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_self_gen_export_failures.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_self_gen_orchestration.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_session_request_permission_dispatch.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skill_eval.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skill_eval_dryrun.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skill_eval_integration.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skill_eval_sweep.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skill_eval_traversal.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skill_invocation_artifacts.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skill_policy.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skills.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skills_dir_agent_home_link.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_skillsbench_harbor_run_suite.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_smoke.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_subscription_auth.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_task_check_eval_consistency.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_task_config.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_task_download.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_tasks.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_token_usage_normalization.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_trace_import_cli.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_trace_task_gen_traversal.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_traces_huggingface.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_traces_parsers.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_traces_task_gen.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_train_mode_artifact_emission.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_trajectory_integration.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_trajectory_streaming.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_trial_agent_timeout_verify.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_trial_install_agent_timeout.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_trial_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_usage_litellm.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_usage_required.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_usage_tracking.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_user.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_verifier_multi_container.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_verifier_output.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_verifier_output_freshness.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_verify.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_workflow_action_pinning.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/test_yaml_config.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/trajectories/test_export.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/trajectories/test_export_nan_handling.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/trajectories/test_redaction.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/trajectories/test_step_granularity.py +0 -0
- {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/trajectories/test_tree.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.5.3.dev894
|
|
3
|
+
Version: 0.5.3.dev902
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -106,6 +106,9 @@ def _apt_install(*packages: str) -> str:
|
|
|
106
106
|
_BENCHFLOW_NODE_PREFIX = "/opt/benchflow/node"
|
|
107
107
|
_BENCHFLOW_JS_AGENT_PREFIX = "/opt/benchflow/js-agents"
|
|
108
108
|
_BENCHFLOW_BIN_PREFIX = "/opt/benchflow/bin"
|
|
109
|
+
_OPENHANDS_CLI_GIT_REV = "3ca17446c5d9c1e35e054803478a3501ec251ecf"
|
|
110
|
+
_OPENHANDS_SDK_VERSION = "1.22.1"
|
|
111
|
+
_OPENHANDS_TOOLS_VERSION = "1.22.1"
|
|
109
112
|
_JS_AGENT_PATH = (
|
|
110
113
|
f"{_BENCHFLOW_BIN_PREFIX}:{_BENCHFLOW_JS_AGENT_PREFIX}/bin:"
|
|
111
114
|
f"{_BENCHFLOW_NODE_PREFIX}/bin:$PATH"
|
|
@@ -566,8 +569,17 @@ AGENTS: dict[str, AgentConfig] = {
|
|
|
566
569
|
" curl -LsSf https://astral.sh/uv/install.sh | sh >/dev/null 2>&1 && "
|
|
567
570
|
' export PATH="$HOME/.local/bin:$PATH"; '
|
|
568
571
|
" fi && "
|
|
572
|
+
# Pin the OpenHands CLI source so the agent workflow cannot drift
|
|
573
|
+
# with GitHub main; only override the buggy sdk/tools 1.21.0 pins.
|
|
574
|
+
# SDK 1.22.x restores default-to-UNKNOWN for the synthetic
|
|
575
|
+
# `security_risk` tool field without the API drift seen in 1.26.x.
|
|
576
|
+
f"printf 'openhands-sdk=={_OPENHANDS_SDK_VERSION}\\n"
|
|
577
|
+
f"openhands-tools=={_OPENHANDS_TOOLS_VERSION}\\n' "
|
|
578
|
+
"> /tmp/oh-sdk-overrides.txt && "
|
|
569
579
|
"uv tool install --force --refresh "
|
|
570
|
-
"--
|
|
580
|
+
"--overrides /tmp/oh-sdk-overrides.txt "
|
|
581
|
+
"--from "
|
|
582
|
+
f"'git+https://github.com/OpenHands/OpenHands-CLI.git@{_OPENHANDS_CLI_GIT_REV}' "
|
|
571
583
|
"openhands --python 3.12 && "
|
|
572
584
|
" uv tool list | grep -q '^openhands\\b' ) && "
|
|
573
585
|
# Let sandbox user traverse to uv-managed Python interpreter path.
|
|
@@ -99,7 +99,7 @@ class TestOpenHandsConfig:
|
|
|
99
99
|
assert "$HOME/.agents/skills" in cfg.skill_paths
|
|
100
100
|
assert "$WORKSPACE/.agents/skills" in cfg.skill_paths
|
|
101
101
|
|
|
102
|
-
def
|
|
102
|
+
def test_openhands_install_cmd_pins_cli_git_revision(self):
|
|
103
103
|
cfg = AGENTS["openhands"]
|
|
104
104
|
assert (
|
|
105
105
|
"apt-get -o Acquire::Retries=3 install -y -qq curl ca-certificates git"
|
|
@@ -107,12 +107,26 @@ class TestOpenHandsConfig:
|
|
|
107
107
|
)
|
|
108
108
|
assert (
|
|
109
109
|
"uv tool install --force --refresh "
|
|
110
|
-
"--
|
|
110
|
+
"--overrides /tmp/oh-sdk-overrides.txt "
|
|
111
|
+
"--from "
|
|
112
|
+
"'git+https://github.com/OpenHands/OpenHands-CLI.git@"
|
|
113
|
+
"3ca17446c5d9c1e35e054803478a3501ec251ecf' "
|
|
111
114
|
"openhands --python 3.12" in cfg.install_cmd
|
|
112
115
|
)
|
|
116
|
+
assert "OpenHands/OpenHands-CLI.git@main" not in cfg.install_cmd
|
|
117
|
+
assert "openhands==1.16.0" not in cfg.install_cmd
|
|
113
118
|
assert "command -v git" in cfg.install_cmd
|
|
114
119
|
assert "install.openhands.dev/install.sh" not in cfg.install_cmd
|
|
115
120
|
|
|
121
|
+
def test_openhands_install_cmd_overrides_buggy_sdk_pin(self):
|
|
122
|
+
"""Guards PR #644 against Opus timeouts from OpenHands SDK 1.21.0."""
|
|
123
|
+
cfg = AGENTS["openhands"]
|
|
124
|
+
|
|
125
|
+
assert "openhands-sdk==1.22.1" in cfg.install_cmd
|
|
126
|
+
assert "openhands-tools==1.22.1" in cfg.install_cmd
|
|
127
|
+
assert "openhands-sdk>=1.22.0" not in cfg.install_cmd
|
|
128
|
+
assert "--overrides /tmp/oh-sdk-overrides.txt" in cfg.install_cmd
|
|
129
|
+
|
|
116
130
|
def test_openhands_install_cmd_does_not_deploy_bedrock_shim(self):
|
|
117
131
|
"""Guards the LiteLLM runtime refactor: Bedrock patches live with LiteLLM."""
|
|
118
132
|
cfg = AGENTS["openhands"]
|
|
@@ -637,7 +637,9 @@ async def test_install_agent_writes_command_stdout_and_stderr_on_failure(
|
|
|
637
637
|
assert log_text.startswith("$ ")
|
|
638
638
|
assert (
|
|
639
639
|
"uv tool install --force --refresh "
|
|
640
|
-
"--
|
|
640
|
+
"--overrides /tmp/oh-sdk-overrides.txt "
|
|
641
|
+
"--from 'git+https://github.com/OpenHands/OpenHands-CLI.git@"
|
|
642
|
+
"3ca17446c5d9c1e35e054803478a3501ec251ecf' "
|
|
641
643
|
"openhands --python 3.12" in log_text
|
|
642
644
|
)
|
|
643
645
|
assert "=== stderr ===" in log_text
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/_utils/evaluation_results.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/agents/harvey_lab_acp_shim.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/demo_task/environment/Dockerfile
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/experimental/mcp/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/experimental/mcp/reviewer_server.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/providers/litellm_bedrock_patch.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/providers/litellm_logging.py
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/src/benchflow/providers/litellm_runtime.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/environment/Dockerfile
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/instruction.md
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/acp_smoke/solution/solve.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/conformance/conformance-results.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/environment/test_chibench_manifest.py
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/environment/test_clawsbench_manifest.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/hello-world-task/instruction.md
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/hello-world-task/solution/solve.sh
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/hello-world-task/tests/test.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev894 → benchflow-0.5.3.dev902}/tests/examples/terminal-bench-smoke-task/task.toml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|