benchflow 0.5.3.dev885__tar.gz → 0.5.3.dev899__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/PKG-INFO +1 -1
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/pyproject.toml +1 -1
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/registry.py +19 -7
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_agent_registry.py +14 -5
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_agent_setup.py +2 -2
- benchflow-0.5.3.dev899/tests/test_hf_scores.py +91 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/.gitignore +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/CHANGELOG.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/LICENSE +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/README.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_dotenv.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_paths.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_run.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_types.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/benchmark_repos.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/config.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/evaluation_results.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/json_safe.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/learner_memory.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/result_metadata.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/reward_events.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/scoring.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/source_provenance.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/task_authoring.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/yaml_loader.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/acp/client.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/acp/runtime.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/acp/transport.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/adapters/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/adapters/harbor.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/adapters/inbound.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/adapters/inspect_ai.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/adapters/ors.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/adapters/terminal_bench.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/codex_config.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/credentials.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/env.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/errors.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/install.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/protocol.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/providers.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/branch.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/cli/main.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/cli/trace_import.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/compat/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/compat/harbor_registry.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/contracts/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/contracts/planes.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/contracts/user.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/diagnostics.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/environment/manifest.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/environment/manifest_env.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/environment/protocol.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/environment/readiness.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/eval_sharding.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/eval_worker.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/evaluation.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/experimental/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/experimental/mcp/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/experimental/mcp/hooks.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/hosted_env.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/learner_skills.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/learner_store.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/metrics.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/models.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/monitor.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/providers/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_config.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_logging.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/providers/runtime.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/py.typed +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/README.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/builtins.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/events.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/file_readers.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/llm.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/memory_scorer.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/node.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/protocol.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/rubric.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/rubric_config.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rewards/validation.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rollout.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rollout_branch.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/rollout_planes.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/runtime.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_base.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_sdk_ops.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/daytona.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/docker.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/lockdown.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/metadata.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/modal_impl.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/process.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/protocol.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/services.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/setup.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/snapshot.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/user.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/scenes.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/sdk.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/self_gen.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/skill_eval/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/skill_eval/_core.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/skill_eval/gepa_export.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/skill_eval/schema.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/skill_policy.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/skills.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/task/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/task/config.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/task/env.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/task/paths.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/task/task.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/task/verifier.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/templates/test.sh.tmpl +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/traces/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/traces/huggingface.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/traces/local.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/traces/models.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/traces/parsers.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/traces/task_gen.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/_capture.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/export.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/metrics.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/tree.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/viewer.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/usage_tracking.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/agents/test_protocol.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/README.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/task.toml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/proof_multi_agent.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/proof_snapshot.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/run_conformance.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/conftest.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/environment/test_chibench_manifest.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/environment/test_clawsbench_manifest.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/environment/test_manifest.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/environment/test_manifest_env.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/environment/test_protocol.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/environment/test_readiness.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/test_codex_custom_provider.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/traces/minimal-claude.jsonl +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/fixtures/mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/check_adapter_evidence.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/check_hosted_env_evidence.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/check_results.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/check_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/configs/claude-agent-acp.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/configs/codex-acp.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/configs/gemini.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/configs/openclaw.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/configs/opencode.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/configs/openhands.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/configs/pi-acp.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/run.sh +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/run_suite.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/integration/suites/release.yaml +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_acp.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_acp_capability_advertising.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_acp_model_config_dispatch.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_acp_pinned_protocol_guard.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_acp_setup_failure_propagation.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_adapter_scripts.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_adapters.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_agent_cli.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_agent_env_resolution.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_agent_gemini_defaults.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_agent_idle_timeout_cli.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_agent_spec.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_base_install_imports.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_bedrock_thinking.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_branch.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_capture_trajectory.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_clawsbench_slice.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_cli_daytona.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_cli_docs_drift.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_cli_misc.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_compat_harbor_registry.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_config_redaction.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_connect_as_env.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_continuallearningbench_adapter.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_dashboard_credential_env_scrub.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_dashboard_daytona_key.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_dashboard_no_host_paths.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_dashboard_release_evidence.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_dashboard_roadmap.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_dashboard_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_dashboard_sync.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_daytona_command_polling.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_daytona_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_daytona_status.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_docker_prune_scoping.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_docker_uploads.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_docs_examples.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_eng50_capabilities.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_env_setup.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_environment_manifest_controls.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_eval_filters_applied.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_eval_sharding.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_eval_single_task_summary.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_eval_source_provenance.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_eval_worker_retry.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_eval_zero_task_guard.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_evaluation_environment_manifest.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_experiments_status.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_harvey_lab_shim.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_hilbench_adapter.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_hosted_env.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_hosted_env_rollout_contract.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_inbound_adapter_manifest.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_inbound_adapters.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_integration_check_results.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_integration_run_suite.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_internet_policy.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_job.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_job_sequential_shared.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_job_sequential_shared_resume.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_judge_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_learner_skills.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_learner_skills_traversal.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_learner_store.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_learner_store_persistence.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_litellm_config.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_litellm_hardening.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_litellm_logging.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_litellm_smoke.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_llm_judge.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_llm_judge_event_tags.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_llm_judge_verifier.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_memory_scorer.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_metrics.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_monitor_scaffold.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_native_acp_usage.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_no_cross_provider_fallback.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_notification_order_real.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_oracle.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_oracle_chokepoint.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_paths_safe.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_paths_symlink_helpers.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_process.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_provider_auth_detection.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_providers.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_reexport.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_registry_invariants.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_release_version.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_resolve_env_helpers.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_reward_node.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_reward_unified_contract.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rewards.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rewards_jsonl.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rollout_architecture.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rollout_branch.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rollout_config_path_coercion.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rollout_environment.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rollout_import_no_side_effects.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rollout_on_ask_user_wiring.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rollout_probe_sandbox_health.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rollout_upload.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_rubric_config.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_runtime.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_runtime_config_wired.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_runtime_live_sandbox.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox_exec_secret_handling.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox_hardening.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox_multi_service.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox_protocol.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox_setup.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox_snapshot_contract.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox_upload_symlink.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sandbox_verifier_workspace.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_scene.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_scene_outbox_trial.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_scene_parallel_group.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_scene_result_aggregation.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_scoring.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sdk_internals.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_sdk_lockdown.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_self_gen_cli.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_self_gen_export_error_channel.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_self_gen_export_failures.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_self_gen_orchestration.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_session_request_permission_dispatch.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skill_eval.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skill_eval_dryrun.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skill_eval_integration.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skill_eval_sweep.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skill_eval_traversal.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skill_invocation_artifacts.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skill_policy.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skills.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skills_dir_agent_home_link.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_skillsbench_harbor_run_suite.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_smoke.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_subscription_auth.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_task_check_eval_consistency.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_task_config.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_task_download.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_tasks.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_token_usage_normalization.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_trace_import_cli.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_trace_task_gen_traversal.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_traces_huggingface.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_traces_parsers.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_traces_task_gen.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_train_mode_artifact_emission.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_trajectory_integration.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_trajectory_streaming.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_trial_agent_timeout_verify.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_trial_install_agent_timeout.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_trial_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_usage_litellm.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_usage_required.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_usage_tracking.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_user.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_verifier_multi_container.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_verifier_output.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_verifier_output_freshness.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_verify.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_workflow_action_pinning.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/test_yaml_config.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/trajectories/test_export.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/trajectories/test_export_nan_handling.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/trajectories/test_redaction.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/trajectories/test_step_granularity.py +0 -0
- {benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/tests/trajectories/test_tree.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.5.3.dev885
|
|
3
|
+
Version: 0.5.3.dev899
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -106,6 +106,9 @@ def _apt_install(*packages: str) -> str:
|
|
|
106
106
|
_BENCHFLOW_NODE_PREFIX = "/opt/benchflow/node"
|
|
107
107
|
_BENCHFLOW_JS_AGENT_PREFIX = "/opt/benchflow/js-agents"
|
|
108
108
|
_BENCHFLOW_BIN_PREFIX = "/opt/benchflow/bin"
|
|
109
|
+
_OPENHANDS_CLI_VERSION = "1.16.0"
|
|
110
|
+
_OPENHANDS_SDK_VERSION = "1.22.1"
|
|
111
|
+
_OPENHANDS_TOOLS_VERSION = "1.22.1"
|
|
109
112
|
_JS_AGENT_PATH = (
|
|
110
113
|
f"{_BENCHFLOW_BIN_PREFIX}:{_BENCHFLOW_JS_AGENT_PREFIX}/bin:"
|
|
111
114
|
f"{_BENCHFLOW_NODE_PREFIX}/bin:$PATH"
|
|
@@ -543,15 +546,15 @@ AGENTS: dict[str, AgentConfig] = {
|
|
|
543
546
|
install_cmd=(
|
|
544
547
|
"export DEBIAN_FRONTEND=noninteractive && "
|
|
545
548
|
'export PATH="$HOME/.local/bin:$PATH" && '
|
|
546
|
-
"( command -v curl >/dev/null 2>&1
|
|
549
|
+
"( command -v curl >/dev/null 2>&1 || "
|
|
547
550
|
" if command -v apt-get >/dev/null 2>&1; then "
|
|
548
|
-
f" {_apt_install('curl', 'ca-certificates'
|
|
551
|
+
f" {_apt_install('curl', 'ca-certificates')}; "
|
|
549
552
|
" elif command -v dnf >/dev/null 2>&1; then "
|
|
550
|
-
" dnf -y --allowerasing install curl ca-certificates
|
|
553
|
+
" dnf -y --allowerasing install curl ca-certificates >/dev/null 2>&1; "
|
|
551
554
|
" elif command -v apk >/dev/null 2>&1; then "
|
|
552
|
-
" apk add --no-cache curl ca-certificates
|
|
555
|
+
" apk add --no-cache curl ca-certificates >/dev/null 2>&1; "
|
|
553
556
|
" else "
|
|
554
|
-
" echo 'OpenHands
|
|
557
|
+
" echo 'OpenHands install requires curl' >&2; "
|
|
555
558
|
" exit 127; "
|
|
556
559
|
" fi ) && "
|
|
557
560
|
"( UV_OK=0; "
|
|
@@ -566,9 +569,18 @@ AGENTS: dict[str, AgentConfig] = {
|
|
|
566
569
|
" curl -LsSf https://astral.sh/uv/install.sh | sh >/dev/null 2>&1 && "
|
|
567
570
|
' export PATH="$HOME/.local/bin:$PATH"; '
|
|
568
571
|
" fi && "
|
|
572
|
+
# OpenHands CLI 1.16.0 pins openhands-sdk/tools 1.21.0. That one
|
|
573
|
+
# SDK release makes the synthetic `security_risk` tool field
|
|
574
|
+
# required whenever LLMSecurityAnalyzer is attached; the ACP path
|
|
575
|
+
# attaches it even under --always-approve, so Claude Opus can loop
|
|
576
|
+
# on validation errors until timeout. 1.22.x restores the intended
|
|
577
|
+
# default-to-UNKNOWN behavior without the API drift seen in 1.26.x.
|
|
578
|
+
f"printf 'openhands-sdk=={_OPENHANDS_SDK_VERSION}\\n"
|
|
579
|
+
f"openhands-tools=={_OPENHANDS_TOOLS_VERSION}\\n' "
|
|
580
|
+
"> /tmp/oh-sdk-overrides.txt && "
|
|
569
581
|
"uv tool install --force --refresh "
|
|
570
|
-
"--
|
|
571
|
-
"openhands --python 3.12 && "
|
|
582
|
+
"--overrides /tmp/oh-sdk-overrides.txt "
|
|
583
|
+
f"openhands=={_OPENHANDS_CLI_VERSION} --python 3.12 && "
|
|
572
584
|
" uv tool list | grep -q '^openhands\\b' ) && "
|
|
573
585
|
# Let sandbox user traverse to uv-managed Python interpreter path.
|
|
574
586
|
"chmod o+x /root /root/.local /root/.local/share "
|
|
@@ -99,20 +99,29 @@ class TestOpenHandsConfig:
|
|
|
99
99
|
assert "$HOME/.agents/skills" in cfg.skill_paths
|
|
100
100
|
assert "$WORKSPACE/.agents/skills" in cfg.skill_paths
|
|
101
101
|
|
|
102
|
-
def
|
|
102
|
+
def test_openhands_install_cmd_pins_stable_pypi_release(self):
|
|
103
103
|
cfg = AGENTS["openhands"]
|
|
104
104
|
assert (
|
|
105
|
-
"apt-get -o Acquire::Retries=3 install -y -qq curl ca-certificates
|
|
105
|
+
"apt-get -o Acquire::Retries=3 install -y -qq curl ca-certificates"
|
|
106
106
|
in cfg.install_cmd
|
|
107
107
|
)
|
|
108
108
|
assert (
|
|
109
109
|
"uv tool install --force --refresh "
|
|
110
|
-
"--
|
|
111
|
-
"openhands --python 3.12" in cfg.install_cmd
|
|
110
|
+
"--overrides /tmp/oh-sdk-overrides.txt "
|
|
111
|
+
"openhands==1.16.0 --python 3.12" in cfg.install_cmd
|
|
112
112
|
)
|
|
113
|
-
assert "
|
|
113
|
+
assert "OpenHands/OpenHands-CLI.git@main" not in cfg.install_cmd
|
|
114
114
|
assert "install.openhands.dev/install.sh" not in cfg.install_cmd
|
|
115
115
|
|
|
116
|
+
def test_openhands_install_cmd_overrides_buggy_sdk_pin(self):
|
|
117
|
+
"""Guards PR #644 against Opus timeouts from OpenHands SDK 1.21.0."""
|
|
118
|
+
cfg = AGENTS["openhands"]
|
|
119
|
+
|
|
120
|
+
assert "openhands-sdk==1.22.1" in cfg.install_cmd
|
|
121
|
+
assert "openhands-tools==1.22.1" in cfg.install_cmd
|
|
122
|
+
assert "openhands-sdk>=1.22.0" not in cfg.install_cmd
|
|
123
|
+
assert "--overrides /tmp/oh-sdk-overrides.txt" in cfg.install_cmd
|
|
124
|
+
|
|
116
125
|
def test_openhands_install_cmd_does_not_deploy_bedrock_shim(self):
|
|
117
126
|
"""Guards the LiteLLM runtime refactor: Bedrock patches live with LiteLLM."""
|
|
118
127
|
cfg = AGENTS["openhands"]
|
|
@@ -637,8 +637,8 @@ async def test_install_agent_writes_command_stdout_and_stderr_on_failure(
|
|
|
637
637
|
assert log_text.startswith("$ ")
|
|
638
638
|
assert (
|
|
639
639
|
"uv tool install --force --refresh "
|
|
640
|
-
"--
|
|
641
|
-
"openhands --python 3.12" in log_text
|
|
640
|
+
"--overrides /tmp/oh-sdk-overrides.txt "
|
|
641
|
+
"openhands==1.16.0 --python 3.12" in log_text
|
|
642
642
|
)
|
|
643
643
|
assert "=== stderr ===" in log_text
|
|
644
644
|
assert "uv: command not found" in log_text
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Unit tests for the Experiments page's HF PR score aggregation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
_DASHBOARD = Path(__file__).resolve().parent.parent / "dashboard"
|
|
10
|
+
if str(_DASHBOARD) not in sys.path:
|
|
11
|
+
sys.path.insert(0, str(_DASHBOARD))
|
|
12
|
+
|
|
13
|
+
import hf_scores # noqa: E402
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_build_scoreboard_combines_hf_pr_modes(monkeypatch):
|
|
17
|
+
"""Guards the HF-score dashboard PR against mixing pass-rate and fill progress."""
|
|
18
|
+
|
|
19
|
+
def fake_analysis(buckets):
|
|
20
|
+
hf_scores._add_bucket(
|
|
21
|
+
buckets,
|
|
22
|
+
pr=2,
|
|
23
|
+
harness="openhands",
|
|
24
|
+
model="gpt-5.5",
|
|
25
|
+
mode="with-skills",
|
|
26
|
+
passed=6,
|
|
27
|
+
failed=4,
|
|
28
|
+
)
|
|
29
|
+
hf_scores._add_bucket(
|
|
30
|
+
buckets,
|
|
31
|
+
pr=2,
|
|
32
|
+
harness="openhands",
|
|
33
|
+
model="gpt-5.5",
|
|
34
|
+
mode="without-skills",
|
|
35
|
+
passed=2,
|
|
36
|
+
failed=8,
|
|
37
|
+
)
|
|
38
|
+
return []
|
|
39
|
+
|
|
40
|
+
def fake_direct(buckets):
|
|
41
|
+
hf_scores._add_bucket(
|
|
42
|
+
buckets,
|
|
43
|
+
pr=5,
|
|
44
|
+
harness="openhands",
|
|
45
|
+
model="gpt-5.5",
|
|
46
|
+
mode="with-skills",
|
|
47
|
+
passed=1,
|
|
48
|
+
failed=0,
|
|
49
|
+
)
|
|
50
|
+
return []
|
|
51
|
+
|
|
52
|
+
monkeypatch.setattr(hf_scores, "_read_analysis_prs", fake_analysis)
|
|
53
|
+
monkeypatch.setattr(hf_scores, "_read_direct_prs", fake_direct)
|
|
54
|
+
|
|
55
|
+
result = hf_scores.build_scoreboard()
|
|
56
|
+
|
|
57
|
+
with_row = result["by_mode"]["with-skills"][0]
|
|
58
|
+
assert with_row["label"] == "OpenHands GPT-5.5"
|
|
59
|
+
assert with_row["passed"] == 7
|
|
60
|
+
assert with_row["total"] == 11
|
|
61
|
+
assert with_row["prs"] == [2, 5]
|
|
62
|
+
gain_row = result["by_mode"]["normalized-gain"][0]
|
|
63
|
+
assert gain_row["without_pass_rate"] == 0.2
|
|
64
|
+
assert round(gain_row["gain"], 3) == 0.545
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_snapshot_serves_cached_hf_scores_without_inline_refresh(tmp_path, monkeypatch):
|
|
68
|
+
"""Guards the HF-score dashboard PR against blocking live requests on HF."""
|
|
69
|
+
cache = {
|
|
70
|
+
"as_of": "2026-06-08T00:00:00+00:00",
|
|
71
|
+
"source": "HuggingFace PR2/PR3/PR4/PR5",
|
|
72
|
+
"repo": hf_scores.REPO,
|
|
73
|
+
"refs": ["refs/pr/2", "refs/pr/3", "refs/pr/4", "refs/pr/5"],
|
|
74
|
+
"scored_trials": 1,
|
|
75
|
+
"groups": 1,
|
|
76
|
+
"by_mode": {"with-skills": [], "without-skills": [], "normalized-gain": []},
|
|
77
|
+
"warnings": [],
|
|
78
|
+
"warning_count": 0,
|
|
79
|
+
}
|
|
80
|
+
path = tmp_path / "hf_scoreboard_cache.json"
|
|
81
|
+
path.write_text(json.dumps(cache))
|
|
82
|
+
|
|
83
|
+
def fail_refresh():
|
|
84
|
+
raise AssertionError("snapshot should not refresh inline by default")
|
|
85
|
+
|
|
86
|
+
monkeypatch.setattr(hf_scores, "build_scoreboard", fail_refresh)
|
|
87
|
+
|
|
88
|
+
result = hf_scores.snapshot(path)
|
|
89
|
+
|
|
90
|
+
assert result["cached"] is True
|
|
91
|
+
assert result["scored_trials"] == 1
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/_utils/evaluation_results.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/agents/harvey_lab_acp_shim.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/demo_task/environment/Dockerfile
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/experimental/mcp/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/experimental/mcp/reviewer_server.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_bedrock_patch.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_logging.py
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev885 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_runtime.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|