benchflow 0.5.3.dev879__tar.gz → 0.5.3.dev881__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/CHANGELOG.md +6 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/PKG-INFO +20 -3
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/README.md +19 -2
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/pyproject.toml +1 -1
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/.gitignore +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/LICENSE +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_dotenv.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_paths.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_run.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_types.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/benchmark_repos.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/config.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/evaluation_results.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/json_safe.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/learner_memory.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/result_metadata.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/reward_events.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/scoring.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/source_provenance.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/task_authoring.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/yaml_loader.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/acp/client.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/acp/runtime.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/acp/session.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/acp/transport.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/adapters/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/adapters/harbor.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/adapters/inbound.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/adapters/inspect_ai.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/adapters/ors.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/adapters/terminal_bench.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/codex_config.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/credentials.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/env.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/errors.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/install.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/protocol.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/providers.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/registry.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/branch.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/cli/main.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/cli/trace_import.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/compat/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/compat/harbor_registry.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/contracts/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/contracts/planes.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/contracts/user.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/diagnostics.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/environment/manifest.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/environment/manifest_env.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/environment/protocol.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/environment/readiness.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/eval_sharding.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/eval_worker.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/evaluation.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/experimental/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/experimental/mcp/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/experimental/mcp/hooks.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/hosted_env.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/learner_skills.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/learner_store.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/metrics.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/models.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/monitor.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/providers/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/providers/litellm_config.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/providers/litellm_logging.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/providers/litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/providers/runtime.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/py.typed +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/README.md +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/builtins.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/events.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/file_readers.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/llm.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/memory_scorer.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/node.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/protocol.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/rubric.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/rubric_config.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rewards/validation.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rollout.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rollout_branch.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/rollout_planes.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/runtime.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/_base.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/_compose.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/_sdk_ops.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/daytona.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/docker.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/lockdown.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/metadata.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/modal_impl.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/process.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/protocol.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/services.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/setup.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/snapshot.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sandbox/user.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/scenes.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/sdk.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/self_gen.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/skill_eval/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/skill_eval/_core.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/skill_eval/gepa_export.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/skill_eval/schema.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/skill_policy.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/skills.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/task/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/task/config.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/task/env.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/task/paths.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/task/task.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/task/verifier.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/templates/test.sh.tmpl +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/traces/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/traces/huggingface.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/traces/local.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/traces/models.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/traces/parsers.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/traces/task_gen.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/trajectories/_capture.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/trajectories/export.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/trajectories/metrics.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/trajectories/tree.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/trajectories/viewer.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/usage_tracking.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/agents/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/agents/test_protocol.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/README.md +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/task.toml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/proof_multi_agent.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/proof_snapshot.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/run_conformance.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conftest.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/environment/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/environment/test_chibench_manifest.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/environment/test_clawsbench_manifest.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/environment/test_manifest.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/environment/test_manifest_env.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/environment/test_protocol.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/environment/test_readiness.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/test_codex_custom_provider.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/traces/minimal-claude.jsonl +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/fixtures/mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/check_adapter_evidence.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/check_hosted_env_evidence.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/check_results.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/check_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/configs/claude-agent-acp.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/configs/codex-acp.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/configs/gemini.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/configs/openclaw.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/configs/opencode.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/configs/openhands.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/configs/pi-acp.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/run.sh +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/run_suite.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/integration/suites/release.yaml +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_acp.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_acp_capability_advertising.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_acp_model_config_dispatch.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_acp_pinned_protocol_guard.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_acp_setup_failure_propagation.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_adapter_scripts.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_adapters.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_agent_cli.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_agent_env_resolution.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_agent_gemini_defaults.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_agent_idle_timeout_cli.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_agent_registry.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_agent_setup.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_agent_spec.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_base_install_imports.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_bedrock_thinking.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_branch.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_capture_trajectory.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_clawsbench_slice.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_cli_daytona.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_cli_docs_drift.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_cli_misc.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_compat_harbor_registry.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_config_redaction.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_connect_as_env.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_continuallearningbench_adapter.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_dashboard_credential_env_scrub.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_dashboard_daytona_key.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_dashboard_no_host_paths.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_dashboard_release_evidence.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_dashboard_roadmap.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_dashboard_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_dashboard_sync.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_daytona_command_polling.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_daytona_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_daytona_status.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_docker_prune_scoping.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_docker_uploads.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_docs_examples.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_eng50_capabilities.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_env_setup.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_environment_manifest_controls.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_eval_filters_applied.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_eval_sharding.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_eval_single_task_summary.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_eval_source_provenance.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_eval_worker_retry.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_eval_zero_task_guard.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_evaluation_environment_manifest.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_experiments_status.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_harvey_lab_shim.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_hilbench_adapter.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_hosted_env.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_hosted_env_rollout_contract.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_inbound_adapter_manifest.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_inbound_adapters.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_integration_check_results.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_integration_run_suite.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_internet_policy.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_job.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_job_sequential_shared.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_job_sequential_shared_resume.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_judge_symlink_ingestion.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_learner_skills.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_learner_skills_traversal.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_learner_store.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_learner_store_persistence.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_litellm_config.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_litellm_hardening.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_litellm_logging.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_litellm_smoke.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_llm_judge.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_llm_judge_event_tags.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_llm_judge_verifier.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_memory_scorer.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_metrics.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_mock_openai_responses_server.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_monitor_scaffold.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_native_acp_usage.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_no_cross_provider_fallback.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_notification_order_real.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_oracle.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_oracle_chokepoint.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_paths_safe.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_paths_symlink_helpers.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_pi_acp_launcher.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_process.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_provider_auth_detection.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_providers.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_reexport.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_registry_invariants.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_release_version.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_resolve_env_helpers.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_reward_node.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_reward_unified_contract.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rewards.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rewards_jsonl.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rollout_architecture.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rollout_branch.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rollout_config_path_coercion.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rollout_environment.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rollout_import_no_side_effects.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rollout_on_ask_user_wiring.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rollout_probe_sandbox_health.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rollout_upload.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_rubric_config.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_runtime.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_runtime_config_wired.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_runtime_live_sandbox.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox_exec_secret_handling.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox_hardening.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox_multi_service.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox_protocol.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox_setup.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox_snapshot_contract.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox_upload_symlink.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sandbox_verifier_workspace.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_scene.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_scene_outbox_trial.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_scene_parallel_group.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_scene_result_aggregation.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_scoring.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sdk_internals.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_sdk_lockdown.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_self_gen_cli.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_self_gen_export_error_channel.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_self_gen_export_failures.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_self_gen_orchestration.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_session_request_permission_dispatch.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skill_eval.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skill_eval_dryrun.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skill_eval_integration.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skill_eval_sweep.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skill_eval_traversal.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skill_invocation_artifacts.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skill_policy.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skills.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skills_dir_agent_home_link.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_skillsbench_harbor_run_suite.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_smoke.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_subscription_auth.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_task_check_eval_consistency.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_task_config.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_task_download.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_tasks.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_token_usage_normalization.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_trace_import_cli.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_trace_task_gen_traversal.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_traces_huggingface.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_traces_parsers.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_traces_task_gen.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_train_mode_artifact_emission.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_trajectory_integration.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_trajectory_streaming.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_trial_agent_timeout_verify.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_trial_install_agent_timeout.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_trial_litellm_runtime.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_usage_litellm.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_usage_required.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_usage_tracking.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_user.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_verifier_multi_container.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_verifier_output.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_verifier_output_freshness.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_verify.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_workflow_action_pinning.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/test_yaml_config.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/trajectories/__init__.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/trajectories/test_export.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/trajectories/test_export_nan_handling.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/trajectories/test_redaction.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/trajectories/test_step_granularity.py +0 -0
- {benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/trajectories/test_tree.py +0 -0
|
@@ -2,6 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased]
|
|
4
4
|
|
|
5
|
+
### Changed
|
|
6
|
+
|
|
7
|
+
- Document the public vs internal preview install/upgrade command matrix,
|
|
8
|
+
including `uv tool` exact pins, internal preview upgrades, and the
|
|
9
|
+
`--force` path for replacing stale entrypoint scripts.
|
|
10
|
+
|
|
5
11
|
## 0.5.2 — 2026-06-05
|
|
6
12
|
|
|
7
13
|
### Changed
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.5.3.dev879
|
|
3
|
+
Version: 0.5.3.dev881
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -76,12 +76,29 @@ BenchFlow's current public release is `0.5.2`:
|
|
|
76
76
|
pip install --upgrade benchflow
|
|
77
77
|
```
|
|
78
78
|
|
|
79
|
-
For a `uv`-managed CLI install of the public release:
|
|
79
|
+
For a `uv`-managed CLI install or upgrade of the public release:
|
|
80
80
|
|
|
81
81
|
```bash
|
|
82
|
-
uv tool install --prerelease allow 'benchflow==0.5.2'
|
|
82
|
+
uv tool install --prerelease allow --upgrade 'benchflow==0.5.2'
|
|
83
83
|
```
|
|
84
84
|
|
|
85
|
+
Use the exact `benchflow==0.5.2` pin for the public CLI. The
|
|
86
|
+
`--prerelease allow` flag is currently needed for BenchFlow's pinned LiteLLM
|
|
87
|
+
release-candidate dependency; the exact BenchFlow pin keeps you off internal
|
|
88
|
+
preview builds.
|
|
89
|
+
|
|
90
|
+
Internal users who want the newest preview published from `main` should omit
|
|
91
|
+
the exact public pin:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
uv tool install --prerelease allow --upgrade benchflow
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
That installs the latest internal preview, such as `0.5.3.dev<N>`. If either
|
|
98
|
+
command reports `Executables already exist: bench, benchflow`, the machine has
|
|
99
|
+
old entrypoints from a previous install; rerun the same command with `--force`
|
|
100
|
+
to let `uv` replace them.
|
|
101
|
+
|
|
85
102
|
Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth. Provider-prefixed models may use provider-specific credentials; Azure Foundry models use `AZURE_API_KEY` plus `AZURE_API_ENDPOINT`.
|
|
86
103
|
|
|
87
104
|
## Documentation
|
|
@@ -26,12 +26,29 @@ BenchFlow's current public release is `0.5.2`:
|
|
|
26
26
|
pip install --upgrade benchflow
|
|
27
27
|
```
|
|
28
28
|
|
|
29
|
-
For a `uv`-managed CLI install of the public release:
|
|
29
|
+
For a `uv`-managed CLI install or upgrade of the public release:
|
|
30
30
|
|
|
31
31
|
```bash
|
|
32
|
-
uv tool install --prerelease allow 'benchflow==0.5.2'
|
|
32
|
+
uv tool install --prerelease allow --upgrade 'benchflow==0.5.2'
|
|
33
33
|
```
|
|
34
34
|
|
|
35
|
+
Use the exact `benchflow==0.5.2` pin for the public CLI. The
|
|
36
|
+
`--prerelease allow` flag is currently needed for BenchFlow's pinned LiteLLM
|
|
37
|
+
release-candidate dependency; the exact BenchFlow pin keeps you off internal
|
|
38
|
+
preview builds.
|
|
39
|
+
|
|
40
|
+
Internal users who want the newest preview published from `main` should omit
|
|
41
|
+
the exact public pin:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
uv tool install --prerelease allow --upgrade benchflow
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
That installs the latest internal preview, such as `0.5.3.dev<N>`. If either
|
|
48
|
+
command reports `Executables already exist: bench, benchflow`, the machine has
|
|
49
|
+
old entrypoints from a previous install; rerun the same command with `--force`
|
|
50
|
+
to let `uv` replace them.
|
|
51
|
+
|
|
35
52
|
Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth. Provider-prefixed models may use provider-specific credentials; Azure Foundry models use `AZURE_API_KEY` plus `AZURE_API_ENDPOINT`.
|
|
36
53
|
|
|
37
54
|
## Documentation
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/_utils/evaluation_results.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/agents/harvey_lab_acp_shim.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/demo_task/environment/Dockerfile
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/experimental/mcp/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/experimental/mcp/reviewer_server.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/providers/litellm_bedrock_patch.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/providers/litellm_logging.py
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/src/benchflow/providers/litellm_runtime.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/environment/Dockerfile
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/instruction.md
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/acp_smoke/solution/solve.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/conformance/conformance-results.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/environment/test_chibench_manifest.py
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/environment/test_clawsbench_manifest.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/hello-world-task/instruction.md
RENAMED
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/hello-world-task/solution/solve.sh
RENAMED
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/hello-world-task/tests/test.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/terminal-bench-smoke-task/task.toml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchflow-0.5.3.dev879 → benchflow-0.5.3.dev881}/tests/examples/test_codex_custom_provider.sh
RENAMED
|
File without changes
|