benchflow 0.5.3.dev883__tar.gz → 0.5.3.dev894__tar.gz
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/PKG-INFO +1 -1
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/pyproject.toml +1 -1
- benchflow-0.5.3.dev894/tests/test_hf_scores.py +91 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/.gitignore +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/CHANGELOG.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/LICENSE +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/README.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_dotenv.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_paths.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_run.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_types.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/benchmark_repos.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/config.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/evaluation_results.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/json_safe.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/learner_memory.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/result_metadata.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/reward_events.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/scoring.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/source_provenance.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/task_authoring.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/yaml_loader.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/acp/client.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/acp/runtime.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/acp/transport.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/adapters/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/adapters/harbor.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/adapters/inbound.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/adapters/inspect_ai.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/adapters/ors.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/adapters/terminal_bench.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/codex_config.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/credentials.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/env.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/errors.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/install.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/protocol.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/providers.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/registry.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/branch.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/cli/main.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/cli/trace_import.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/compat/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/compat/harbor_registry.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/contracts/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/contracts/planes.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/contracts/user.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/diagnostics.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/environment/manifest.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/environment/manifest_env.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/environment/protocol.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/environment/readiness.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/eval_sharding.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/eval_worker.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/evaluation.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/experimental/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/experimental/mcp/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/experimental/mcp/hooks.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/hosted_env.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/learner_skills.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/learner_store.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/metrics.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/models.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/monitor.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/providers/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/providers/litellm_config.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/providers/litellm_logging.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/providers/litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/providers/runtime.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/py.typed +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/README.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/builtins.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/events.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/file_readers.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/llm.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/memory_scorer.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/node.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/protocol.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/rubric.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/rubric_config.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rewards/validation.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rollout.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rollout_branch.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/rollout_planes.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/runtime.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/_base.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/_compose.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/_sdk_ops.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/daytona.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/docker.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/lockdown.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/metadata.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/modal_impl.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/process.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/protocol.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/services.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/setup.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/snapshot.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sandbox/user.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/scenes.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/sdk.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/self_gen.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/skill_eval/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/skill_eval/_core.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/skill_eval/gepa_export.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/skill_eval/schema.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/skill_policy.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/skills.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/task/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/task/config.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/task/env.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/task/paths.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/task/task.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/task/verifier.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/templates/test.sh.tmpl +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/traces/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/traces/huggingface.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/traces/local.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/traces/models.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/traces/parsers.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/traces/task_gen.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/trajectories/_capture.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/trajectories/export.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/trajectories/metrics.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/trajectories/tree.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/trajectories/viewer.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/usage_tracking.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/agents/test_protocol.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/README.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/task.toml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/proof_multi_agent.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/proof_snapshot.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/run_conformance.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conftest.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/environment/test_chibench_manifest.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/environment/test_clawsbench_manifest.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/environment/test_manifest.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/environment/test_manifest_env.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/environment/test_protocol.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/environment/test_readiness.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/test_codex_custom_provider.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/traces/minimal-claude.jsonl +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/fixtures/mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/check_adapter_evidence.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/check_hosted_env_evidence.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/check_results.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/check_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/configs/claude-agent-acp.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/configs/codex-acp.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/configs/gemini.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/configs/openclaw.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/configs/opencode.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/configs/openhands.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/configs/pi-acp.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/run.sh +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/run_suite.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/integration/suites/release.yaml +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_acp.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_acp_capability_advertising.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_acp_model_config_dispatch.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_acp_pinned_protocol_guard.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_acp_setup_failure_propagation.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_adapter_scripts.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_adapters.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_agent_cli.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_agent_env_resolution.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_agent_gemini_defaults.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_agent_idle_timeout_cli.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_agent_registry.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_agent_setup.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_agent_spec.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_base_install_imports.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_bedrock_thinking.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_branch.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_capture_trajectory.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_clawsbench_slice.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_cli_daytona.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_cli_docs_drift.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_cli_misc.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_compat_harbor_registry.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_config_redaction.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_connect_as_env.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_continuallearningbench_adapter.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_dashboard_credential_env_scrub.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_dashboard_daytona_key.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_dashboard_no_host_paths.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_dashboard_release_evidence.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_dashboard_roadmap.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_dashboard_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_dashboard_sync.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_daytona_command_polling.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_daytona_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_daytona_status.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_docker_prune_scoping.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_docker_uploads.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_docs_examples.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_eng50_capabilities.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_env_setup.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_environment_manifest_controls.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_eval_filters_applied.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_eval_sharding.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_eval_single_task_summary.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_eval_source_provenance.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_eval_worker_retry.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_eval_zero_task_guard.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_evaluation_environment_manifest.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_experiments_status.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_harvey_lab_shim.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_hilbench_adapter.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_hosted_env.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_hosted_env_rollout_contract.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_inbound_adapter_manifest.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_inbound_adapters.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_integration_check_results.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_integration_run_suite.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_internet_policy.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_job.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_job_sequential_shared.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_job_sequential_shared_resume.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_judge_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_learner_skills.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_learner_skills_traversal.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_learner_store.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_learner_store_persistence.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_litellm_config.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_litellm_hardening.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_litellm_logging.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_litellm_smoke.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_llm_judge.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_llm_judge_event_tags.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_llm_judge_verifier.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_memory_scorer.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_metrics.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_monitor_scaffold.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_native_acp_usage.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_no_cross_provider_fallback.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_notification_order_real.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_oracle.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_oracle_chokepoint.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_paths_safe.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_paths_symlink_helpers.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_process.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_provider_auth_detection.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_providers.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_reexport.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_registry_invariants.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_release_version.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_resolve_env_helpers.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_reward_node.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_reward_unified_contract.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rewards.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rewards_jsonl.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rollout_architecture.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rollout_branch.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rollout_config_path_coercion.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rollout_environment.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rollout_import_no_side_effects.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rollout_on_ask_user_wiring.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rollout_probe_sandbox_health.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rollout_upload.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_rubric_config.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_runtime.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_runtime_config_wired.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_runtime_live_sandbox.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox_exec_secret_handling.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox_hardening.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox_multi_service.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox_protocol.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox_setup.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox_snapshot_contract.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox_upload_symlink.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sandbox_verifier_workspace.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_scene.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_scene_outbox_trial.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_scene_parallel_group.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_scene_result_aggregation.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_scoring.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sdk_internals.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_sdk_lockdown.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_self_gen_cli.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_self_gen_export_error_channel.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_self_gen_export_failures.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_self_gen_orchestration.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_session_request_permission_dispatch.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skill_eval.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skill_eval_dryrun.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skill_eval_integration.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skill_eval_sweep.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skill_eval_traversal.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skill_invocation_artifacts.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skill_policy.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skills.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skills_dir_agent_home_link.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_skillsbench_harbor_run_suite.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_smoke.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_subscription_auth.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_task_check_eval_consistency.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_task_config.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_task_download.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_tasks.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_token_usage_normalization.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_trace_import_cli.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_trace_task_gen_traversal.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_traces_huggingface.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_traces_parsers.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_traces_task_gen.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_train_mode_artifact_emission.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_trajectory_integration.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_trajectory_streaming.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_trial_agent_timeout_verify.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_trial_install_agent_timeout.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_trial_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_usage_litellm.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_usage_required.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_usage_tracking.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_user.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_verifier_multi_container.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_verifier_output.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_verifier_output_freshness.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_verify.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_workflow_action_pinning.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/test_yaml_config.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/trajectories/test_export.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/trajectories/test_export_nan_handling.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/trajectories/test_redaction.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/trajectories/test_step_granularity.py +0 -0
- {benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/trajectories/test_tree.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.5.3.dev883
|
|
3
|
+
Version: 0.5.3.dev894
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Unit tests for the Experiments page's HF PR score aggregation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
_DASHBOARD = Path(__file__).resolve().parent.parent / "dashboard"
|
|
10
|
+
if str(_DASHBOARD) not in sys.path:
|
|
11
|
+
sys.path.insert(0, str(_DASHBOARD))
|
|
12
|
+
|
|
13
|
+
import hf_scores # noqa: E402
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_build_scoreboard_combines_hf_pr_modes(monkeypatch):
|
|
17
|
+
"""Guards the HF-score dashboard PR against mixing pass-rate and fill progress."""
|
|
18
|
+
|
|
19
|
+
def fake_analysis(buckets):
|
|
20
|
+
hf_scores._add_bucket(
|
|
21
|
+
buckets,
|
|
22
|
+
pr=2,
|
|
23
|
+
harness="openhands",
|
|
24
|
+
model="gpt-5.5",
|
|
25
|
+
mode="with-skills",
|
|
26
|
+
passed=6,
|
|
27
|
+
failed=4,
|
|
28
|
+
)
|
|
29
|
+
hf_scores._add_bucket(
|
|
30
|
+
buckets,
|
|
31
|
+
pr=2,
|
|
32
|
+
harness="openhands",
|
|
33
|
+
model="gpt-5.5",
|
|
34
|
+
mode="without-skills",
|
|
35
|
+
passed=2,
|
|
36
|
+
failed=8,
|
|
37
|
+
)
|
|
38
|
+
return []
|
|
39
|
+
|
|
40
|
+
def fake_direct(buckets):
|
|
41
|
+
hf_scores._add_bucket(
|
|
42
|
+
buckets,
|
|
43
|
+
pr=5,
|
|
44
|
+
harness="openhands",
|
|
45
|
+
model="gpt-5.5",
|
|
46
|
+
mode="with-skills",
|
|
47
|
+
passed=1,
|
|
48
|
+
failed=0,
|
|
49
|
+
)
|
|
50
|
+
return []
|
|
51
|
+
|
|
52
|
+
monkeypatch.setattr(hf_scores, "_read_analysis_prs", fake_analysis)
|
|
53
|
+
monkeypatch.setattr(hf_scores, "_read_direct_prs", fake_direct)
|
|
54
|
+
|
|
55
|
+
result = hf_scores.build_scoreboard()
|
|
56
|
+
|
|
57
|
+
with_row = result["by_mode"]["with-skills"][0]
|
|
58
|
+
assert with_row["label"] == "OpenHands GPT-5.5"
|
|
59
|
+
assert with_row["passed"] == 7
|
|
60
|
+
assert with_row["total"] == 11
|
|
61
|
+
assert with_row["prs"] == [2, 5]
|
|
62
|
+
gain_row = result["by_mode"]["normalized-gain"][0]
|
|
63
|
+
assert gain_row["without_pass_rate"] == 0.2
|
|
64
|
+
assert round(gain_row["gain"], 3) == 0.545
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_snapshot_serves_cached_hf_scores_without_inline_refresh(tmp_path, monkeypatch):
|
|
68
|
+
"""Guards the HF-score dashboard PR against blocking live requests on HF."""
|
|
69
|
+
cache = {
|
|
70
|
+
"as_of": "2026-06-08T00:00:00+00:00",
|
|
71
|
+
"source": "HuggingFace PR2/PR3/PR4/PR5",
|
|
72
|
+
"repo": hf_scores.REPO,
|
|
73
|
+
"refs": ["refs/pr/2", "refs/pr/3", "refs/pr/4", "refs/pr/5"],
|
|
74
|
+
"scored_trials": 1,
|
|
75
|
+
"groups": 1,
|
|
76
|
+
"by_mode": {"with-skills": [], "without-skills": [], "normalized-gain": []},
|
|
77
|
+
"warnings": [],
|
|
78
|
+
"warning_count": 0,
|
|
79
|
+
}
|
|
80
|
+
path = tmp_path / "hf_scoreboard_cache.json"
|
|
81
|
+
path.write_text(json.dumps(cache))
|
|
82
|
+
|
|
83
|
+
def fail_refresh():
|
|
84
|
+
raise AssertionError("snapshot should not refresh inline by default")
|
|
85
|
+
|
|
86
|
+
monkeypatch.setattr(hf_scores, "build_scoreboard", fail_refresh)
|
|
87
|
+
|
|
88
|
+
result = hf_scores.snapshot(path)
|
|
89
|
+
|
|
90
|
+
assert result["cached"] is True
|
|
91
|
+
assert result["scored_trials"] == 1
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/_utils/evaluation_results.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/agents/harvey_lab_acp_shim.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/demo_task/environment/Dockerfile
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/experimental/mcp/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/experimental/mcp/reviewer_server.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/providers/litellm_bedrock_patch.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/providers/litellm_logging.py
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/src/benchflow/providers/litellm_runtime.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/environment/Dockerfile
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/instruction.md
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/acp_smoke/solution/solve.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/conformance/conformance-results.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/environment/test_chibench_manifest.py
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/environment/test_clawsbench_manifest.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/hello-world-task/instruction.md
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/hello-world-task/solution/solve.sh
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/hello-world-task/tests/test.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev883 → benchflow-0.5.3.dev894}/tests/examples/terminal-bench-smoke-task/task.toml
RENAMED
|
File without changes
|