benchflow 0.5.0__tar.gz → 0.5.1.dev869__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/PKG-INFO +1 -1
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/pyproject.toml +1 -1
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/evaluation_results.py +6 -2
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/client.py +1 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/session.py +91 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/env.py +63 -2
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/metrics.py +2 -1
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/models.py +5 -2
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/litellm_logging.py +1 -13
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/litellm_runtime.py +9 -2
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rollout.py +142 -4
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/usage_tracking.py +37 -2
- benchflow-0.5.1.dev869/tests/test_dashboard_daytona_key.py +129 -0
- benchflow-0.5.1.dev869/tests/test_experiments_status.py +181 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_runtime.py +46 -0
- benchflow-0.5.1.dev869/tests/test_native_acp_usage.py +243 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_subscription_auth.py +73 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/.gitignore +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/CHANGELOG.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/LICENSE +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/README.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_dotenv.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_paths.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_run.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_types.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/benchmark_repos.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/config.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/json_safe.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/learner_memory.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/result_metadata.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/reward_events.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/scoring.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/source_provenance.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/task_authoring.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/yaml_loader.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/container_transport.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/runtime.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/transport.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/types.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/harbor.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/inbound.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/inspect_ai.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/ors.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/terminal_bench.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/codex_config.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/credentials.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/errors.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/install.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/pi_acp_launcher.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/protocol.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/providers.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/registry.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/branch.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/main.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/trace_import.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/compat/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/compat/harbor_registry.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/contracts/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/contracts/planes.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/contracts/user.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/environment/Dockerfile +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/instruction.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/task.toml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/tests/test.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/diagnostics.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/manifest.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/manifest_env.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/protocol.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/readiness.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/eval_sharding.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/eval_worker.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/evaluation.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/hooks.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/hosted_env.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/learner_skills.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/learner_store.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/monitor.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/litellm_config.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/runtime.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/py.typed +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/README.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/builtins.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/events.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/file_readers.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/llm.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/memory_scorer.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/node.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/protocol.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/rubric.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/rubric_config.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/validation.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rollout_branch.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rollout_planes.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/runtime.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_base.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_sdk_ops.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/daytona.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/docker.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/lockdown.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/metadata.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/modal_impl.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/process.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/protocol.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/services.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/setup.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/snapshot.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/user.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/scenes.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sdk.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/self_gen.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_eval/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_eval/_core.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_eval/gepa_export.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_eval/schema.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_policy.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skills.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/config.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/env.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/paths.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/task.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/verifier.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/judge.py.tmpl +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/test.sh.tmpl +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/huggingface.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/local.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/models.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/parsers.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/task_gen.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/_capture.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/export.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/metrics.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/otel.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/tree.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/types.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/viewer.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/agents/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/agents/test_protocol.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/README.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/instruction.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/task.toml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/tests/test.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/conformance-results.json +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/proof_multi_agent.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/proof_snapshot.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/run_conformance.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conftest.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_chibench_manifest.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_clawsbench_manifest.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_manifest.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_manifest_env.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_protocol.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_readiness.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/instruction.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/solution/solve.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/task.toml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/tests/test.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_claude.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_codex.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_codex_custom_provider.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_gemini.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_openclaw.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/traces/minimal-claude.jsonl +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_openai_responses_server.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_adapter_evidence.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_hosted_env_evidence.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_results.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/claude-agent-acp.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/codex-acp.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/gemini.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/openclaw.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/opencode.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/openhands.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/pi-acp.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/run.sh +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/run_suite.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/suites/release.yaml +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp_capability_advertising.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp_model_config_dispatch.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp_pinned_protocol_guard.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp_setup_failure_propagation.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_adapter_scripts.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_adapters.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_cli.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_env_resolution.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_gemini_defaults.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_idle_timeout_cli.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_model_decouple.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_registry.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_setup.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_spec.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_base_install_imports.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_bedrock_thinking.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_branch.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_capture_trajectory.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_clawsbench_slice.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_cli_daytona.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_cli_docs_drift.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_cli_misc.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_compat_harbor_registry.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_config_redaction.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_connect_as_env.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_continuallearningbench_adapter.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_credential_env_scrub.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_no_host_paths.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_release_evidence.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_roadmap.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_symlink_ingestion.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_sync.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_daytona_command_polling.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_daytona_litellm_runtime.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_daytona_status.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_docker_prune_scoping.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_docker_uploads.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_docs_examples.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eng50_capabilities.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_env_setup.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_environment_manifest_controls.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_filters_applied.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_sharding.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_single_task_summary.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_source_provenance.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_worker_retry.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_zero_task_guard.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_evaluation_environment_manifest.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_exclude_tasks.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_harvey_lab_shim.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_hilbench_adapter.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_hosted_env.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_hosted_env_rollout_contract.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_inbound_adapter_manifest.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_inbound_adapters.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_integration_check_results.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_integration_run_suite.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_internet_policy.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_job.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_job_sequential_shared.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_job_sequential_shared_resume.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_judge_symlink_ingestion.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_learner_skills.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_learner_skills_traversal.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_learner_store.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_learner_store_persistence.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_config.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_hardening.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_logging.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_smoke.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_llm_judge.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_llm_judge_event_tags.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_llm_judge_verifier.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_memory_scorer.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_metrics.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_mock_openai_responses_server.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_monitor_scaffold.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_no_cross_provider_fallback.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_notification_order_real.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_oracle.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_oracle_chokepoint.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_paths_safe.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_paths_symlink_helpers.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_pi_acp_launcher.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_process.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_provider_auth_detection.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_providers.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_reexport.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_registry_invariants.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_release_version.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_resolve_env_helpers.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_reward_node.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_reward_unified_contract.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rewards.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rewards_jsonl.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_architecture.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_branch.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_config_path_coercion.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_environment.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_import_no_side_effects.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_on_ask_user_wiring.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_probe_sandbox_health.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_upload.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rubric_config.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_runtime.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_runtime_config_wired.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_runtime_live_sandbox.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_exec_secret_handling.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_hardening.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_multi_service.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_protocol.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_setup.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_snapshot_contract.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_upload_symlink.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_verifier_workspace.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scene.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scene_outbox_trial.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scene_parallel_group.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scene_result_aggregation.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scoring.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sdk_internals.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sdk_lockdown.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_cli.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_export_error_channel.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_export_failures.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_orchestration.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_session_request_permission_dispatch.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_dryrun.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_integration.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_sweep.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_traversal.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_invocation_artifacts.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_policy.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skills.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skills_dir_agent_home_link.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skillsbench_harbor_parity.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skillsbench_harbor_run_suite.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_smoke.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_task_check_eval_consistency.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_task_config.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_task_download.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_tasks.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_token_usage_normalization.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trace_import_cli.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trace_task_gen_traversal.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trace_to_task_evidence.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_traces_huggingface.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_traces_parsers.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_traces_task_gen.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_train_mode_artifact_emission.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trajectory_integration.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trajectory_streaming.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trial_agent_timeout_verify.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trial_install_agent_timeout.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trial_litellm_runtime.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_usage_litellm.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_usage_required.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_usage_tracking.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_user.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_verifier_multi_container.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_verifier_output.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_verifier_output_freshness.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_verify.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_workflow_action_pinning.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_yaml_config.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/__init__.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_export.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_export_nan_handling.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_redaction.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_step_granularity.py +0 -0
- {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_tree.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchflow
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.1.dev869
|
|
4
4
|
Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
|
|
5
5
|
Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
|
|
6
6
|
Project-URL: Repository, https://github.com/benchflow-ai/benchflow
|
|
@@ -17,6 +17,7 @@ from benchflow.trajectories.metrics import (
|
|
|
17
17
|
count_skill_invocations,
|
|
18
18
|
result_skill_invocations,
|
|
19
19
|
)
|
|
20
|
+
from benchflow.usage_tracking import is_trusted_usage_source
|
|
20
21
|
|
|
21
22
|
# Phase keys produced by Rollout (see rollout.py — environment_setup,
|
|
22
23
|
# agent_setup, agent_execution, verifier, total). Kept here so summary
|
|
@@ -35,7 +36,7 @@ def agent_result_from_rollout(result: RolloutResult) -> dict[str, Any]:
|
|
|
35
36
|
n_skill_invocations = result.n_skill_invocations or count_skill_invocations(
|
|
36
37
|
result.trajectory
|
|
37
38
|
)
|
|
38
|
-
|
|
39
|
+
agent_result = {
|
|
39
40
|
"n_tool_calls": result.n_tool_calls,
|
|
40
41
|
"n_skill_invocations": n_skill_invocations,
|
|
41
42
|
"n_prompts": result.n_prompts,
|
|
@@ -48,6 +49,9 @@ def agent_result_from_rollout(result: RolloutResult) -> dict[str, Any]:
|
|
|
48
49
|
"usage_source": result.usage_source,
|
|
49
50
|
"price_source": result.price_source,
|
|
50
51
|
}
|
|
52
|
+
if getattr(result, "usage_details", None) is not None:
|
|
53
|
+
agent_result["usage_details"] = result.usage_details
|
|
54
|
+
return agent_result
|
|
51
55
|
|
|
52
56
|
|
|
53
57
|
def rollout_result_payload(
|
|
@@ -105,7 +109,7 @@ def usage_summary(results: dict[str, dict]) -> dict[str, Any]:
|
|
|
105
109
|
covered = [
|
|
106
110
|
r
|
|
107
111
|
for r in completed
|
|
108
|
-
if (r.get("agent_result") or {}).get("usage_source")
|
|
112
|
+
if is_trusted_usage_source((r.get("agent_result") or {}).get("usage_source"))
|
|
109
113
|
]
|
|
110
114
|
|
|
111
115
|
def total(field: str) -> int:
|
|
@@ -380,6 +380,7 @@ class ACPClient:
|
|
|
380
380
|
# vendored ``StopReason`` enum so consumers keep ``.value`` / member
|
|
381
381
|
# comparisons working.
|
|
382
382
|
self._session.stop_reason = StopReason(prompt_result.stop_reason)
|
|
383
|
+
self._session.record_prompt_usage(getattr(prompt_result, "usage", None))
|
|
383
384
|
return prompt_result
|
|
384
385
|
|
|
385
386
|
async def cancel(self) -> None:
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import logging
|
|
4
4
|
from collections.abc import Callable
|
|
5
5
|
from datetime import datetime
|
|
6
|
+
from typing import Any
|
|
6
7
|
|
|
7
8
|
from benchflow.trajectories.metrics import is_skill_invocation_event
|
|
8
9
|
|
|
@@ -15,6 +16,81 @@ from .types import (
|
|
|
15
16
|
|
|
16
17
|
logger = logging.getLogger(__name__)
|
|
17
18
|
|
|
19
|
+
ACPUsageSnapshot = dict[str, int | None]
|
|
20
|
+
|
|
21
|
+
_ACP_USAGE_FIELDS: tuple[str, ...] = (
|
|
22
|
+
"input_tokens",
|
|
23
|
+
"output_tokens",
|
|
24
|
+
"total_tokens",
|
|
25
|
+
"cached_read_tokens",
|
|
26
|
+
"cached_write_tokens",
|
|
27
|
+
"thought_tokens",
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _coerce_usage_int(value: object) -> int | None:
|
|
32
|
+
if value is None:
|
|
33
|
+
return None
|
|
34
|
+
if isinstance(value, bool):
|
|
35
|
+
return int(value)
|
|
36
|
+
if isinstance(value, int):
|
|
37
|
+
return value
|
|
38
|
+
if isinstance(value, float | str | bytes | bytearray):
|
|
39
|
+
try:
|
|
40
|
+
return int(value)
|
|
41
|
+
except ValueError:
|
|
42
|
+
return None
|
|
43
|
+
try:
|
|
44
|
+
return int(str(value))
|
|
45
|
+
except ValueError:
|
|
46
|
+
return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _usage_mapping(usage: object) -> dict[str, Any]:
|
|
50
|
+
if isinstance(usage, dict):
|
|
51
|
+
return {str(key): value for key, value in usage.items()}
|
|
52
|
+
dump = getattr(usage, "model_dump", None)
|
|
53
|
+
if callable(dump):
|
|
54
|
+
data = dump(by_alias=False, exclude_none=True)
|
|
55
|
+
if isinstance(data, dict):
|
|
56
|
+
alias_data = dump(by_alias=True, exclude_none=True)
|
|
57
|
+
if isinstance(alias_data, dict):
|
|
58
|
+
data = {**alias_data, **data}
|
|
59
|
+
return data
|
|
60
|
+
return {
|
|
61
|
+
field: getattr(usage, field)
|
|
62
|
+
for field in _ACP_USAGE_FIELDS
|
|
63
|
+
if hasattr(usage, field)
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def normalize_acp_usage(usage: object | None) -> ACPUsageSnapshot | None:
    """Convert an ACP usage payload into a snake_case token-counter snapshot.

    The payload is flattened with ``_usage_mapping``. Each counter is then
    looked up under its snake_case name first and its camelCase alias second,
    and the value found is coerced with ``_coerce_usage_int``.

    Returns:
        A snapshot holding all six counters (missing ones are ``None``), or
        ``None`` when ``usage`` is ``None``, flattens to nothing, or yields no
        usable counter at all.
    """
    if usage is None:
        return None
    raw = _usage_mapping(usage)
    if not raw:
        return None

    # (snake_case result key, camelCase alias) in output order.
    lookup_order = (
        ("input_tokens", "inputTokens"),
        ("output_tokens", "outputTokens"),
        ("total_tokens", "totalTokens"),
        ("cached_read_tokens", "cachedReadTokens"),
        ("cached_write_tokens", "cachedWriteTokens"),
        ("thought_tokens", "thoughtTokens"),
    )
    snapshot: ACPUsageSnapshot = {}
    for snake_name, camel_name in lookup_order:
        found = next(
            (raw[key] for key in (snake_name, camel_name) if key in raw),
            None,
        )
        snapshot[snake_name] = _coerce_usage_int(found)

    if not any(count is not None for count in snapshot.values()):
        return None
    return snapshot
|
|
93
|
+
|
|
18
94
|
|
|
19
95
|
def _is_skill_tool_call(
|
|
20
96
|
kind: object, title: object = "", content: object = None
|
|
@@ -93,6 +169,7 @@ class ACPSession:
|
|
|
93
169
|
self.tool_calls: list[ToolCallRecord] = []
|
|
94
170
|
self._tool_call_map: dict[str, ToolCallRecord] = {}
|
|
95
171
|
self.stop_reason: StopReason | None = None
|
|
172
|
+
self.usage_snapshots: list[ACPUsageSnapshot] = []
|
|
96
173
|
self.created_at = datetime.now()
|
|
97
174
|
self.events: list[dict] = []
|
|
98
175
|
self._pending_text: list[dict] = []
|
|
@@ -124,6 +201,20 @@ class ACPSession:
|
|
|
124
201
|
self._flush_agent_text()
|
|
125
202
|
self._notify_change()
|
|
126
203
|
|
|
204
|
+
def record_prompt_usage(self, usage: object | None) -> None:
    """Store the token usage returned by a session/prompt call.

    The payload is normalized with ``normalize_acp_usage``. A payload that
    normalizes to ``None`` is ignored; otherwise the snapshot is appended to
    ``usage_snapshots`` and change listeners are notified.
    """
    normalized = normalize_acp_usage(usage)
    if normalized is not None:
        self.usage_snapshots.append(normalized)
        self._notify_change()
|
|
211
|
+
|
|
212
|
+
def latest_usage_totals(self) -> ACPUsageSnapshot | None:
    """Return a copy of the most recent usage snapshot, or ``None`` if empty."""
    snapshots = self.usage_snapshots
    # Copy so callers cannot mutate the recorded snapshot.
    return dict(snapshots[-1]) if snapshots else None
|
|
217
|
+
|
|
127
218
|
def _flush_agent_text(self) -> None:
|
|
128
219
|
"""Flush pending text events, merging consecutive same-type chunks."""
|
|
129
220
|
if not self._pending_text:
|
|
@@ -47,9 +47,17 @@ _CODEX_ACCESS_TOKEN_ENV = "CODEX_ACCESS_TOKEN"
|
|
|
47
47
|
_CODEX_AUTH_JSON_ENV = "CODEX_AUTH_JSON"
|
|
48
48
|
_CLAUDE_CODE_OAUTH_TOKEN_ENV = "CLAUDE_CODE_OAUTH_TOKEN"
|
|
49
49
|
_CLAUDE_OAUTH_TOKEN_ENV = "CLAUDE_OAUTH_TOKEN"
|
|
50
|
+
_SUBSCRIPTION_AUTH_MARKER = "_BENCHFLOW_SUBSCRIPTION_AUTH"
|
|
50
51
|
_CUSTOM_OPENAI_ENDPOINT_KEYS = frozenset(
|
|
51
52
|
{"BENCHFLOW_PROVIDER_BASE_URL", "OPENAI_BASE_URL"}
|
|
52
53
|
)
|
|
54
|
+
_LITELLM_RUNTIME_MARKER_KEYS = frozenset(
|
|
55
|
+
{
|
|
56
|
+
"BENCHFLOW_LITELLM_MASTER_KEY",
|
|
57
|
+
"BENCHFLOW_LITELLM_MODEL_ALIAS",
|
|
58
|
+
"BENCHFLOW_LITELLM_MODEL_VIA_ENV",
|
|
59
|
+
}
|
|
60
|
+
)
|
|
53
61
|
_CANONICAL_OPENAI_URL = "https://api.openai.com/v1"
|
|
54
62
|
_GENERIC_PROVIDER_OVERRIDE_KEYS = frozenset(
|
|
55
63
|
{
|
|
@@ -368,6 +376,59 @@ def _has_codex_auth_json_auth(
|
|
|
368
376
|
) and bool(agent_env.get(_CODEX_AUTH_JSON_ENV))
|
|
369
377
|
|
|
370
378
|
|
|
379
|
+
def uses_native_subscription_auth(
|
|
380
|
+
agent: str,
|
|
381
|
+
model: str | None,
|
|
382
|
+
agent_env: dict[str, str],
|
|
383
|
+
) -> bool:
|
|
384
|
+
"""Return True when an agent should use CLI/subscription auth directly.
|
|
385
|
+
|
|
386
|
+
This is the Harbor-style split point: API-key runs can be routed through
|
|
387
|
+
LiteLLM, while subscription-auth runs stay on the native Codex/Claude ACP
|
|
388
|
+
path and report usage from the agent protocol response.
|
|
389
|
+
"""
|
|
390
|
+
if agent_env.get("BENCHFLOW_PROVIDER_NAME") == "litellm" or any(
|
|
391
|
+
agent_env.get(key) for key in _LITELLM_RUNTIME_MARKER_KEYS
|
|
392
|
+
):
|
|
393
|
+
return False
|
|
394
|
+
|
|
395
|
+
if agent == "codex-acp":
|
|
396
|
+
if agent_env.get("OPENAI_API_KEY"):
|
|
397
|
+
return False
|
|
398
|
+
required_key = "OPENAI_API_KEY"
|
|
399
|
+
if not _can_use_codex_subscription_auth(
|
|
400
|
+
agent,
|
|
401
|
+
model,
|
|
402
|
+
required_key,
|
|
403
|
+
agent_env,
|
|
404
|
+
):
|
|
405
|
+
return False
|
|
406
|
+
return (
|
|
407
|
+
bool(agent_env.get(_CODEX_ACCESS_TOKEN_ENV))
|
|
408
|
+
or bool(agent_env.get(_CODEX_AUTH_JSON_ENV))
|
|
409
|
+
or agent_env.get(_SUBSCRIPTION_AUTH_MARKER) == "1"
|
|
410
|
+
or check_subscription_auth(agent, required_key)
|
|
411
|
+
)
|
|
412
|
+
|
|
413
|
+
if agent == "claude-agent-acp":
|
|
414
|
+
if agent_env.get("ANTHROPIC_API_KEY"):
|
|
415
|
+
return False
|
|
416
|
+
if model is not None:
|
|
417
|
+
from benchflow.agents.registry import infer_env_key_for_model
|
|
418
|
+
|
|
419
|
+
if infer_env_key_for_model(model) != "ANTHROPIC_API_KEY":
|
|
420
|
+
return False
|
|
421
|
+
return (
|
|
422
|
+
bool(agent_env.get(_CLAUDE_CODE_OAUTH_TOKEN_ENV))
|
|
423
|
+
or bool(agent_env.get(_CLAUDE_OAUTH_TOKEN_ENV))
|
|
424
|
+
or bool(agent_env.get("ANTHROPIC_AUTH_TOKEN"))
|
|
425
|
+
or agent_env.get(_SUBSCRIPTION_AUTH_MARKER) == "1"
|
|
426
|
+
or check_subscription_auth(agent, "ANTHROPIC_API_KEY")
|
|
427
|
+
)
|
|
428
|
+
|
|
429
|
+
return False
|
|
430
|
+
|
|
431
|
+
|
|
371
432
|
def inject_vertex_credentials(agent_env: dict[str, str], model: str) -> None:
|
|
372
433
|
"""Inject ADC credentials and defaults for Vertex AI models."""
|
|
373
434
|
from benchflow.agents.registry import is_vertex_model
|
|
@@ -668,7 +729,7 @@ def resolve_agent_env(
|
|
|
668
729
|
required_key,
|
|
669
730
|
agent_env,
|
|
670
731
|
) and check_subscription_auth(agent, required_key):
|
|
671
|
-
agent_env[
|
|
732
|
+
agent_env[_SUBSCRIPTION_AUTH_MARKER] = "1"
|
|
672
733
|
logger.info(
|
|
673
734
|
"Using host subscription auth (no %s set)",
|
|
674
735
|
required_key,
|
|
@@ -701,7 +762,7 @@ def resolve_agent_env(
|
|
|
701
762
|
and _can_use_subscription_auth(agent, model, req_key, agent_env)
|
|
702
763
|
and check_subscription_auth(agent, req_key)
|
|
703
764
|
):
|
|
704
|
-
agent_env[
|
|
765
|
+
agent_env[_SUBSCRIPTION_AUTH_MARKER] = "1"
|
|
705
766
|
logger.info(
|
|
706
767
|
"Using host subscription auth (no %s set)",
|
|
707
768
|
req_key,
|
|
@@ -21,6 +21,7 @@ from benchflow._utils.scoring import (
|
|
|
21
21
|
pass_rate_excl_errors,
|
|
22
22
|
)
|
|
23
23
|
from benchflow.trajectories.metrics import result_skill_invocations
|
|
24
|
+
from benchflow.usage_tracking import is_trusted_usage_source
|
|
24
25
|
|
|
25
26
|
logger = logging.getLogger(__name__)
|
|
26
27
|
|
|
@@ -199,7 +200,7 @@ class BenchmarkMetrics:
|
|
|
199
200
|
return [
|
|
200
201
|
t
|
|
201
202
|
for t in self.tasks
|
|
202
|
-
if t.completed and t.usage_source
|
|
203
|
+
if t.completed and is_trusted_usage_source(t.usage_source)
|
|
203
204
|
]
|
|
204
205
|
|
|
205
206
|
@property
|
|
@@ -87,9 +87,10 @@ class RolloutResult:
|
|
|
87
87
|
total_tokens: Sum of input, output, cache-read, and cache-creation tokens,
|
|
88
88
|
or None when provider telemetry was unavailable.
|
|
89
89
|
cost_usd: Provider cost estimate in USD, or None when unavailable.
|
|
90
|
-
usage_source:
|
|
91
|
-
"unavailable".
|
|
90
|
+
usage_source: Token telemetry source. One of "provider_response",
|
|
91
|
+
"agent_native_acp", or "unavailable".
|
|
92
92
|
price_source: Pricing table version used for cost_usd, or None.
|
|
93
|
+
usage_details: Optional source-specific telemetry details.
|
|
93
94
|
error: Error description string, or None on success.
|
|
94
95
|
error_category: Stable category for ``error``, or None on success.
|
|
95
96
|
verifier_error: Verifier error description, or None if verifier succeeded
|
|
@@ -139,6 +140,7 @@ class RolloutResult:
|
|
|
139
140
|
cost_usd: float | None = None,
|
|
140
141
|
usage_source: str = "unavailable",
|
|
141
142
|
price_source: str | None = None,
|
|
143
|
+
usage_details: dict[str, Any] | None = None,
|
|
142
144
|
error: str | None = None,
|
|
143
145
|
error_category: str | None = None,
|
|
144
146
|
verifier_error: str | None = None,
|
|
@@ -170,6 +172,7 @@ class RolloutResult:
|
|
|
170
172
|
self.cost_usd = cost_usd
|
|
171
173
|
self.usage_source = usage_source
|
|
172
174
|
self.price_source = price_source
|
|
175
|
+
self.usage_details = usage_details
|
|
173
176
|
self.error = error
|
|
174
177
|
self.error_category = error_category
|
|
175
178
|
self.verifier_error = verifier_error
|
|
@@ -13,6 +13,7 @@ from benchflow.trajectories.types import (
|
|
|
13
13
|
LLMResponse,
|
|
14
14
|
Trajectory,
|
|
15
15
|
)
|
|
16
|
+
from benchflow.usage_tracking import usage_unavailable
|
|
16
17
|
|
|
17
18
|
_PROVIDER_AUTH_STATUS_CODES = (401, 403)
|
|
18
19
|
_STATUS_KEYS = {
|
|
@@ -353,19 +354,6 @@ def trajectory_from_litellm_callback_log(
|
|
|
353
354
|
return trajectory
|
|
354
355
|
|
|
355
356
|
|
|
356
|
-
def usage_unavailable() -> dict[str, Any]:
|
|
357
|
-
return {
|
|
358
|
-
"n_input_tokens": 0,
|
|
359
|
-
"n_output_tokens": 0,
|
|
360
|
-
"n_cache_read_tokens": 0,
|
|
361
|
-
"n_cache_creation_tokens": 0,
|
|
362
|
-
"total_tokens": 0,
|
|
363
|
-
"cost_usd": None,
|
|
364
|
-
"usage_source": "unavailable",
|
|
365
|
-
"price_source": None,
|
|
366
|
-
}
|
|
367
|
-
|
|
368
|
-
|
|
369
357
|
def extract_usage_from_trajectory(
|
|
370
358
|
trajectory: Trajectory | None,
|
|
371
359
|
*,
|
|
@@ -25,6 +25,7 @@ import httpx
|
|
|
25
25
|
import yaml
|
|
26
26
|
|
|
27
27
|
from benchflow.agents.codex_config import apply_codex_provider_config
|
|
28
|
+
from benchflow.agents.env import uses_native_subscription_auth
|
|
28
29
|
from benchflow.agents.registry import AGENTS
|
|
29
30
|
from benchflow.providers.litellm_config import (
|
|
30
31
|
LITELLM_MASTER_KEY_ENV,
|
|
@@ -38,10 +39,9 @@ from benchflow.providers.litellm_logging import (
|
|
|
38
39
|
callback_module_source,
|
|
39
40
|
extract_usage_from_trajectory,
|
|
40
41
|
trajectory_from_litellm_callback_log,
|
|
41
|
-
usage_unavailable,
|
|
42
42
|
)
|
|
43
43
|
from benchflow.trajectories.types import Trajectory
|
|
44
|
-
from benchflow.usage_tracking import UsageTrackingConfig
|
|
44
|
+
from benchflow.usage_tracking import UsageTrackingConfig, usage_unavailable
|
|
45
45
|
|
|
46
46
|
logger = logging.getLogger(__name__)
|
|
47
47
|
|
|
@@ -961,6 +961,13 @@ async def ensure_litellm_runtime(
|
|
|
961
961
|
reason="usage_tracking=off leaves provider traffic untouched",
|
|
962
962
|
)
|
|
963
963
|
|
|
964
|
+
if uses_native_subscription_auth(agent, model, agent_env):
|
|
965
|
+
return await _skip_litellm_runtime(
|
|
966
|
+
agent_env,
|
|
967
|
+
runtime,
|
|
968
|
+
reason="native subscription auth will use agent ACP usage telemetry",
|
|
969
|
+
)
|
|
970
|
+
|
|
964
971
|
if not needs_litellm_runtime(agent, model):
|
|
965
972
|
if usage_cfg.mode == "required" and agent != "oracle":
|
|
966
973
|
raise RuntimeError(
|
|
@@ -103,7 +103,13 @@ from benchflow.trajectories._capture import (
|
|
|
103
103
|
from benchflow.trajectories.metrics import count_skill_invocations
|
|
104
104
|
from benchflow.trajectories.tree import RolloutNode, RolloutTree, Step
|
|
105
105
|
from benchflow.trajectories.types import redact_acp_trajectory_jsonl
|
|
106
|
-
from benchflow.usage_tracking import
|
|
106
|
+
from benchflow.usage_tracking import (
|
|
107
|
+
USAGE_SOURCE_AGENT_NATIVE_ACP,
|
|
108
|
+
USAGE_SOURCE_PROVIDER_RESPONSE,
|
|
109
|
+
UsageTrackingConfig,
|
|
110
|
+
is_token_usage_available,
|
|
111
|
+
usage_unavailable,
|
|
112
|
+
)
|
|
107
113
|
|
|
108
114
|
logger = logging.getLogger(__name__)
|
|
109
115
|
|
|
@@ -129,6 +135,75 @@ def _provider_auth_status_from_runtime(runtime: Any) -> int | None:
|
|
|
129
135
|
return None
|
|
130
136
|
|
|
131
137
|
|
|
138
|
+
_NATIVE_ACP_USAGE_SNAPSHOT_TO_RESULT = {
|
|
139
|
+
"input_tokens": "n_input_tokens",
|
|
140
|
+
"output_tokens": "n_output_tokens",
|
|
141
|
+
"cached_read_tokens": "n_cache_read_tokens",
|
|
142
|
+
"cached_write_tokens": "n_cache_creation_tokens",
|
|
143
|
+
"total_tokens": "total_tokens",
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _zero_native_acp_usage_metrics() -> dict[str, Any]:
    """Return the ``usage_unavailable()`` payload plus zeroed thought tokens."""
    return dict(usage_unavailable(), usage_details={"thought_tokens": 0})
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _as_nonnegative_int(value: object) -> int:
|
|
152
|
+
if value is None:
|
|
153
|
+
return 0
|
|
154
|
+
if isinstance(value, bool):
|
|
155
|
+
return int(value)
|
|
156
|
+
if isinstance(value, int):
|
|
157
|
+
return max(value, 0)
|
|
158
|
+
if isinstance(value, float | str | bytes | bytearray):
|
|
159
|
+
try:
|
|
160
|
+
return max(int(value), 0)
|
|
161
|
+
except ValueError:
|
|
162
|
+
return 0
|
|
163
|
+
try:
|
|
164
|
+
return max(int(str(value)), 0)
|
|
165
|
+
except ValueError:
|
|
166
|
+
return 0
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _native_acp_usage_delta(
    previous: dict[str, int | None] | None,
    current: dict[str, int | None],
) -> dict[str, int]:
    """Compute per-counter growth between two usage snapshots.

    Args:
        previous: Earlier snapshot, or ``None``/empty when there is none; its
            counters are then treated as zero.
        current: Latest snapshot.

    Returns:
        A dict with the five token counters and ``total_tokens``. Every entry
        is clamped at zero. When ``current`` carries no ``total_tokens``, the
        total is the sum of the five counter deltas.
    """
    counter_names = (
        "input_tokens",
        "output_tokens",
        "cached_read_tokens",
        "cached_write_tokens",
        "thought_tokens",
    )
    delta: dict[str, int] = {}
    for name in counter_names:
        now = _as_nonnegative_int(current.get(name))
        before = _as_nonnegative_int(previous.get(name)) if previous else 0
        delta[name] = max(now - before, 0)

    reported_total = current.get("total_tokens")
    if reported_total is None:
        # No total reported: derive it from the individual counter deltas.
        delta["total_tokens"] = sum(delta[name] for name in counter_names)
        return delta

    total_now = _as_nonnegative_int(reported_total)
    total_before = 0
    if previous and previous.get("total_tokens") is not None:
        total_before = _as_nonnegative_int(previous.get("total_tokens"))
    delta["total_tokens"] = max(total_now - total_before, 0)
    return delta
|
|
205
|
+
|
|
206
|
+
|
|
132
207
|
def _task_disallows_internet(task: Any) -> bool:
|
|
133
208
|
"""Return True when task.toml requests no internet for the agent task."""
|
|
134
209
|
env_config = getattr(getattr(task, "config", None), "environment", None)
|
|
@@ -537,6 +612,7 @@ def _build_rollout_result(
|
|
|
537
612
|
cost_usd: float | None = None,
|
|
538
613
|
usage_source: str = "unavailable",
|
|
539
614
|
price_source: str | None = None,
|
|
615
|
+
usage_details: dict[str, Any] | None = None,
|
|
540
616
|
usage_tracking: dict[str, Any] | None = None,
|
|
541
617
|
evolved_skills: dict[str, str] | None = None,
|
|
542
618
|
source_provenance: dict[str, Any] | None = None,
|
|
@@ -588,6 +664,7 @@ def _build_rollout_result(
|
|
|
588
664
|
cost_usd=cost_usd,
|
|
589
665
|
usage_source=usage_source,
|
|
590
666
|
price_source=price_source,
|
|
667
|
+
usage_details=usage_details,
|
|
591
668
|
error=error,
|
|
592
669
|
error_category=error_category,
|
|
593
670
|
verifier_error=verifier_error,
|
|
@@ -615,6 +692,8 @@ def _build_rollout_result(
|
|
|
615
692
|
"usage_source": result.usage_source,
|
|
616
693
|
"price_source": result.price_source,
|
|
617
694
|
}
|
|
695
|
+
if result.usage_details is not None:
|
|
696
|
+
agent_result["usage_details"] = result.usage_details
|
|
618
697
|
final_metrics = final_metrics_from_agent_result(agent_result)
|
|
619
698
|
trajectory_summary = trajectory_summary_from_events(
|
|
620
699
|
trajectory,
|
|
@@ -1178,6 +1257,8 @@ class Rollout:
|
|
|
1178
1257
|
self._task_skill_policy: TaskSkillPolicy | None = None
|
|
1179
1258
|
self._usage_runtime: Any = None
|
|
1180
1259
|
self._usage_metrics: dict[str, Any] = self._planes.extract_usage(None)
|
|
1260
|
+
self._native_usage_metrics: dict[str, Any] = _zero_native_acp_usage_metrics()
|
|
1261
|
+
self._native_usage_checkpoint: dict[str, int | None] | None = None
|
|
1181
1262
|
# Provider 401/403 status snapshotted during cleanup, after the usage
|
|
1182
1263
|
# proxy imports its captures (Daytona's SandboxUsageProxy only fills
|
|
1183
1264
|
# trajectory on stop()). Read by _provider_auth_status() so ACP-error
|
|
@@ -1649,6 +1730,7 @@ class Rollout:
|
|
|
1649
1730
|
agent_cwd=self._agent_cwd,
|
|
1650
1731
|
reasoning_effort=cfg.primary_reasoning_effort,
|
|
1651
1732
|
)
|
|
1733
|
+
self._native_usage_checkpoint = None
|
|
1652
1734
|
self._reapply_ask_user_handler()
|
|
1653
1735
|
self._attach_trajectory_writer(rollout_dir)
|
|
1654
1736
|
|
|
@@ -1827,6 +1909,7 @@ class Rollout:
|
|
|
1827
1909
|
self._n_tool_calls += new_tools
|
|
1828
1910
|
self._executed_prompts.extend(effective_prompts)
|
|
1829
1911
|
self._trajectory_source = "acp"
|
|
1912
|
+
self._collect_native_acp_usage()
|
|
1830
1913
|
|
|
1831
1914
|
# Grow the tree at Step-level granularity — one Step per ACP event
|
|
1832
1915
|
# (tool_call, agent_message, agent_thought, user_message). A single
|
|
@@ -1859,6 +1942,46 @@ class Rollout:
|
|
|
1859
1942
|
self._phase = "executed"
|
|
1860
1943
|
return trajectory, n_tool_calls
|
|
1861
1944
|
|
|
1945
|
+
def _collect_native_acp_usage(self) -> None:
|
|
1946
|
+
"""Accumulate ACP PromptResponse.usage deltas for native subscription runs."""
|
|
1947
|
+
session = getattr(self, "_session", None)
|
|
1948
|
+
latest_fn = getattr(session, "latest_usage_totals", None)
|
|
1949
|
+
if not callable(latest_fn):
|
|
1950
|
+
return
|
|
1951
|
+
latest = latest_fn()
|
|
1952
|
+
if not latest:
|
|
1953
|
+
return
|
|
1954
|
+
previous = getattr(self, "_native_usage_checkpoint", None)
|
|
1955
|
+
delta = _native_acp_usage_delta(previous, latest)
|
|
1956
|
+
self._native_usage_checkpoint = dict(latest)
|
|
1957
|
+
if not any(delta.values()):
|
|
1958
|
+
return
|
|
1959
|
+
|
|
1960
|
+
metrics = dict(
|
|
1961
|
+
getattr(self, "_native_usage_metrics", _zero_native_acp_usage_metrics())
|
|
1962
|
+
)
|
|
1963
|
+
for (
|
|
1964
|
+
snapshot_field,
|
|
1965
|
+
result_field,
|
|
1966
|
+
) in _NATIVE_ACP_USAGE_SNAPSHOT_TO_RESULT.items():
|
|
1967
|
+
if result_field == "total_tokens":
|
|
1968
|
+
continue
|
|
1969
|
+
metrics[result_field] = _as_nonnegative_int(metrics.get(result_field)) + (
|
|
1970
|
+
delta.get(snapshot_field) or 0
|
|
1971
|
+
)
|
|
1972
|
+
metrics["total_tokens"] = _as_nonnegative_int(metrics.get("total_tokens")) + (
|
|
1973
|
+
delta.get("total_tokens") or 0
|
|
1974
|
+
)
|
|
1975
|
+
details = dict(metrics.get("usage_details") or {})
|
|
1976
|
+
details["thought_tokens"] = _as_nonnegative_int(
|
|
1977
|
+
details.get("thought_tokens")
|
|
1978
|
+
) + (delta.get("thought_tokens") or 0)
|
|
1979
|
+
metrics["usage_details"] = details
|
|
1980
|
+
metrics["usage_source"] = USAGE_SOURCE_AGENT_NATIVE_ACP
|
|
1981
|
+
metrics["cost_usd"] = None
|
|
1982
|
+
metrics["price_source"] = None
|
|
1983
|
+
self._native_usage_metrics = metrics
|
|
1984
|
+
|
|
1862
1985
|
def _build_step_batch(self, new_events: list[dict], new_tools: int) -> list[Step]:
|
|
1863
1986
|
"""Build one Step per ACP event from the events appended this execute.
|
|
1864
1987
|
|
|
@@ -2102,7 +2225,9 @@ class Rollout:
|
|
|
2102
2225
|
logger.warning(f"LLM trajectory write failed: {e}")
|
|
2103
2226
|
finally:
|
|
2104
2227
|
self._usage_runtime = None
|
|
2105
|
-
|
|
2228
|
+
|
|
2229
|
+
self._finalize_usage_metrics()
|
|
2230
|
+
self._enforce_required_usage_tracking()
|
|
2106
2231
|
|
|
2107
2232
|
if self._environment is not None:
|
|
2108
2233
|
with contextlib.suppress(Exception):
|
|
@@ -2126,11 +2251,24 @@ class Rollout:
|
|
|
2126
2251
|
|
|
2127
2252
|
self._phase = "cleaned"
|
|
2128
2253
|
|
|
2254
|
+
def _finalize_usage_metrics(self) -> None:
    """Pick the final usage metrics: provider-response first, native ACP second.

    Metrics already sourced from a provider response are kept. Otherwise
    ``_native_usage_metrics`` replaces ``_usage_metrics``, but only when it
    is a dict whose usage source counts as trusted telemetry.
    """
    active = getattr(self, "_usage_metrics", {"usage_source": "unavailable"})
    if active.get("usage_source") != USAGE_SOURCE_PROVIDER_RESPONSE:
        fallback = getattr(self, "_native_usage_metrics", None)
        if isinstance(fallback, dict) and is_token_usage_available(fallback):
            self._usage_metrics = fallback
|
|
2266
|
+
|
|
2129
2267
|
def _enforce_required_usage_tracking(self) -> None:
|
|
2130
2268
|
usage_cfg = self._config.usage_tracking.with_env_defaults()
|
|
2131
2269
|
if usage_cfg.mode != "required" or self._config.primary_agent == "oracle":
|
|
2132
2270
|
return
|
|
2133
|
-
if self
|
|
2271
|
+
if is_token_usage_available(getattr(self, "_usage_metrics", None)):
|
|
2134
2272
|
return
|
|
2135
2273
|
if self._error is not None:
|
|
2136
2274
|
return
|
|
@@ -2721,7 +2859,7 @@ class Rollout:
|
|
|
2721
2859
|
usage_source = str(self._usage_metrics.get("usage_source", "unavailable"))
|
|
2722
2860
|
if usage_cfg.mode == "off":
|
|
2723
2861
|
status = "off"
|
|
2724
|
-
elif
|
|
2862
|
+
elif is_token_usage_available(self._usage_metrics):
|
|
2725
2863
|
status = "enabled"
|
|
2726
2864
|
else:
|
|
2727
2865
|
status = "unavailable"
|
|
@@ -7,8 +7,15 @@ from dataclasses import dataclass
|
|
|
7
7
|
from typing import Any, Literal, cast
|
|
8
8
|
|
|
9
9
|
UsageTrackingMode = Literal["auto", "required", "off"]
|
|
10
|
+
UsageSource = Literal["provider_response", "agent_native_acp", "unavailable"]
|
|
10
11
|
|
|
11
12
|
USAGE_TRACKING_ENV = "BENCHFLOW_USAGE_TRACKING"
|
|
13
|
+
USAGE_SOURCE_PROVIDER_RESPONSE = "provider_response"
|
|
14
|
+
USAGE_SOURCE_AGENT_NATIVE_ACP = "agent_native_acp"
|
|
15
|
+
USAGE_SOURCE_UNAVAILABLE = "unavailable"
|
|
16
|
+
TRUSTED_USAGE_SOURCES: frozenset[str] = frozenset(
|
|
17
|
+
{USAGE_SOURCE_PROVIDER_RESPONSE, USAGE_SOURCE_AGENT_NATIVE_ACP}
|
|
18
|
+
)
|
|
12
19
|
|
|
13
20
|
_MODES: set[str] = {"auto", "required", "off"}
|
|
14
21
|
_LEGACY_USAGE_PROXY_KEYS: frozenset[str] = frozenset(
|
|
@@ -36,13 +43,39 @@ def _optional_mode(value: Any) -> UsageTrackingMode | None:
|
|
|
36
43
|
return normalize_usage_tracking_mode(str(value))
|
|
37
44
|
|
|
38
45
|
|
|
46
|
+
def is_trusted_usage_source(value: object) -> bool:
    """Report whether ``str(value)`` is one of ``TRUSTED_USAGE_SOURCES``."""
    source_name = str(value)
    return source_name in TRUSTED_USAGE_SOURCES
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def is_token_usage_available(metrics: dict[str, Any] | None) -> bool:
|
|
52
|
+
"""Return True when a usage metrics payload has trusted token telemetry."""
|
|
53
|
+
if not metrics:
|
|
54
|
+
return False
|
|
55
|
+
return is_trusted_usage_source(metrics.get("usage_source"))
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def usage_unavailable() -> dict[str, Any]:
    """Build a fresh token-usage payload meaning "no telemetry captured".

    All token counters are zero, the cost and price source are ``None``, and
    the usage source is ``USAGE_SOURCE_UNAVAILABLE``.
    """
    counter_keys = (
        "n_input_tokens",
        "n_output_tokens",
        "n_cache_read_tokens",
        "n_cache_creation_tokens",
        "total_tokens",
    )
    payload: dict[str, Any] = dict.fromkeys(counter_keys, 0)
    payload.update(
        cost_usd=None,
        usage_source=USAGE_SOURCE_UNAVAILABLE,
        price_source=None,
    )
    return payload
|
|
70
|
+
|
|
71
|
+
|
|
39
72
|
@dataclass(frozen=True, init=False)
|
|
40
73
|
class UsageTrackingConfig:
|
|
41
74
|
"""User-facing token/cost telemetry policy.
|
|
42
75
|
|
|
43
76
|
``mode`` is the operator contract:
|
|
44
|
-
- ``auto`` records usage when
|
|
45
|
-
- ``required`` fails
|
|
77
|
+
- ``auto`` records usage when LiteLLM or native ACP telemetry can be used.
|
|
78
|
+
- ``required`` fails when no trusted token telemetry can be captured.
|
|
46
79
|
- ``off`` leaves provider traffic untouched.
|
|
47
80
|
"""
|
|
48
81
|
|
|
@@ -119,6 +152,8 @@ class UsageTrackingConfig:
|
|
|
119
152
|
endpoint_kind = "sandbox" if environment == "daytona" else "host"
|
|
120
153
|
if self.mode == "off":
|
|
121
154
|
endpoint_kind = "none"
|
|
155
|
+
elif usage_source == USAGE_SOURCE_AGENT_NATIVE_ACP:
|
|
156
|
+
endpoint_kind = "agent_native"
|
|
122
157
|
return {
|
|
123
158
|
"requested": self.mode,
|
|
124
159
|
"status": status,
|