benchflow 0.5.0__tar.gz → 0.5.1.dev869__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/PKG-INFO +1 -1
  2. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/pyproject.toml +1 -1
  3. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/evaluation_results.py +6 -2
  4. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/client.py +1 -0
  5. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/session.py +91 -0
  6. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/env.py +63 -2
  7. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/metrics.py +2 -1
  8. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/models.py +5 -2
  9. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/litellm_logging.py +1 -13
  10. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/litellm_runtime.py +9 -2
  11. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rollout.py +142 -4
  12. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/usage_tracking.py +37 -2
  13. benchflow-0.5.1.dev869/tests/test_dashboard_daytona_key.py +129 -0
  14. benchflow-0.5.1.dev869/tests/test_experiments_status.py +181 -0
  15. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_runtime.py +46 -0
  16. benchflow-0.5.1.dev869/tests/test_native_acp_usage.py +243 -0
  17. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_subscription_auth.py +73 -0
  18. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/.gitignore +0 -0
  19. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/CHANGELOG.md +0 -0
  20. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/LICENSE +0 -0
  21. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/README.md +0 -0
  22. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/__init__.py +0 -0
  23. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_dotenv.py +0 -0
  24. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_paths.py +0 -0
  25. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_run.py +0 -0
  26. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_types.py +0 -0
  27. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/__init__.py +0 -0
  28. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/benchmark_repos.py +0 -0
  29. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/config.py +0 -0
  30. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/json_safe.py +0 -0
  31. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/learner_memory.py +0 -0
  32. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/result_metadata.py +0 -0
  33. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/reward_events.py +0 -0
  34. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/scoring.py +0 -0
  35. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/source_provenance.py +0 -0
  36. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/task_authoring.py +0 -0
  37. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/yaml_loader.py +0 -0
  38. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/__init__.py +0 -0
  39. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/container_transport.py +0 -0
  40. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/runtime.py +0 -0
  41. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/transport.py +0 -0
  42. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/types.py +0 -0
  43. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/__init__.py +0 -0
  44. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/harbor.py +0 -0
  45. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/inbound.py +0 -0
  46. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/inspect_ai.py +0 -0
  47. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/ors.py +0 -0
  48. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/terminal_bench.py +0 -0
  49. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/__init__.py +0 -0
  50. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/codex_config.py +0 -0
  51. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/credentials.py +0 -0
  52. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/errors.py +0 -0
  53. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
  54. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/install.py +0 -0
  55. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
  56. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/pi_acp_launcher.py +0 -0
  57. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/protocol.py +0 -0
  58. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/providers.py +0 -0
  59. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/registry.py +0 -0
  60. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/branch.py +0 -0
  61. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/__init__.py +0 -0
  62. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/main.py +0 -0
  63. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/trace_import.py +0 -0
  64. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/compat/__init__.py +0 -0
  65. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/compat/harbor_registry.py +0 -0
  66. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/contracts/__init__.py +0 -0
  67. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/contracts/planes.py +0 -0
  68. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/contracts/user.py +0 -0
  69. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/environment/Dockerfile +0 -0
  70. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/instruction.md +0 -0
  71. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/task.toml +0 -0
  72. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/tests/test.sh +0 -0
  73. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/diagnostics.py +0 -0
  74. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/__init__.py +0 -0
  75. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/manifest.py +0 -0
  76. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/manifest_env.py +0 -0
  77. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/protocol.py +0 -0
  78. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/environment/readiness.py +0 -0
  79. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/eval_sharding.py +0 -0
  80. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/eval_worker.py +0 -0
  81. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/evaluation.py +0 -0
  82. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/__init__.py +0 -0
  83. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/__init__.py +0 -0
  84. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/hooks.py +0 -0
  85. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
  86. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/hosted_env.py +0 -0
  87. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/learner_skills.py +0 -0
  88. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/learner_store.py +0 -0
  89. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/monitor.py +0 -0
  90. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/__init__.py +0 -0
  91. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
  92. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/litellm_config.py +0 -0
  93. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/providers/runtime.py +0 -0
  94. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/py.typed +0 -0
  95. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/README.md +0 -0
  96. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/__init__.py +0 -0
  97. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/builtins.py +0 -0
  98. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/events.py +0 -0
  99. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/file_readers.py +0 -0
  100. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/llm.py +0 -0
  101. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/memory_scorer.py +0 -0
  102. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/node.py +0 -0
  103. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/protocol.py +0 -0
  104. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/rubric.py +0 -0
  105. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/rubric_config.py +0 -0
  106. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/validation.py +0 -0
  107. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rollout_branch.py +0 -0
  108. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/rollout_planes.py +0 -0
  109. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/runtime.py +0 -0
  110. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/__init__.py +0 -0
  111. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_base.py +0 -0
  112. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose.py +0 -0
  113. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
  114. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
  115. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
  116. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
  117. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_sdk_ops.py +0 -0
  118. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/daytona.py +0 -0
  119. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/docker.py +0 -0
  120. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/lockdown.py +0 -0
  121. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/metadata.py +0 -0
  122. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/modal_impl.py +0 -0
  123. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/process.py +0 -0
  124. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/protocol.py +0 -0
  125. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/services.py +0 -0
  126. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/setup.py +0 -0
  127. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/snapshot.py +0 -0
  128. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/user.py +0 -0
  129. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/scenes.py +0 -0
  130. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/sdk.py +0 -0
  131. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/self_gen.py +0 -0
  132. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_eval/__init__.py +0 -0
  133. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_eval/_core.py +0 -0
  134. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_eval/gepa_export.py +0 -0
  135. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_eval/schema.py +0 -0
  136. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skill_policy.py +0 -0
  137. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/skills.py +0 -0
  138. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/__init__.py +0 -0
  139. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/config.py +0 -0
  140. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/env.py +0 -0
  141. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/paths.py +0 -0
  142. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/task.py +0 -0
  143. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/task/verifier.py +0 -0
  144. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/__init__.py +0 -0
  145. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/judge.py.tmpl +0 -0
  146. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/test.sh.tmpl +0 -0
  147. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/__init__.py +0 -0
  148. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/huggingface.py +0 -0
  149. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/local.py +0 -0
  150. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/models.py +0 -0
  151. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/parsers.py +0 -0
  152. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/task_gen.py +0 -0
  153. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/__init__.py +0 -0
  154. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/_capture.py +0 -0
  155. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/export.py +0 -0
  156. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/metrics.py +0 -0
  157. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/otel.py +0 -0
  158. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/tree.py +0 -0
  159. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/types.py +0 -0
  160. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/viewer.py +0 -0
  161. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/__init__.py +0 -0
  162. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/agents/__init__.py +0 -0
  163. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/agents/test_protocol.py +0 -0
  164. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/README.md +0 -0
  165. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
  166. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
  167. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
  168. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/instruction.md +0 -0
  169. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
  170. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/task.toml +0 -0
  171. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/tests/test.sh +0 -0
  172. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/conformance-results.json +0 -0
  173. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/proof_multi_agent.py +0 -0
  174. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/proof_snapshot.py +0 -0
  175. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/run_conformance.py +0 -0
  176. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
  177. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/conftest.py +0 -0
  178. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/__init__.py +0 -0
  179. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_chibench_manifest.py +0 -0
  180. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_clawsbench_manifest.py +0 -0
  181. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_manifest.py +0 -0
  182. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_manifest_env.py +0 -0
  183. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_protocol.py +0 -0
  184. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/environment/test_readiness.py +0 -0
  185. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
  186. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/instruction.md +0 -0
  187. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/solution/solve.sh +0 -0
  188. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/task.toml +0 -0
  189. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/tests/test.sh +0 -0
  190. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
  191. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
  192. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
  193. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
  194. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
  195. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
  196. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_claude.sh +0 -0
  197. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_codex.sh +0 -0
  198. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_codex_custom_provider.sh +0 -0
  199. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_gemini.sh +0 -0
  200. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/test_openclaw.sh +0 -0
  201. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/traces/minimal-claude.jsonl +0 -0
  202. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
  203. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent.py +0 -0
  204. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
  205. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
  206. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_openai_responses_server.py +0 -0
  207. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_adapter_evidence.py +0 -0
  208. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_hosted_env_evidence.py +0 -0
  209. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_results.py +0 -0
  210. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
  211. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/check_trace_to_task_evidence.py +0 -0
  212. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/claude-agent-acp.yaml +0 -0
  213. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/codex-acp.yaml +0 -0
  214. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/gemini.yaml +0 -0
  215. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
  216. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/openclaw.yaml +0 -0
  217. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/opencode.yaml +0 -0
  218. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/openhands.yaml +0 -0
  219. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/configs/pi-acp.yaml +0 -0
  220. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/run.sh +0 -0
  221. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/run_suite.py +0 -0
  222. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/integration/suites/release.yaml +0 -0
  223. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp.py +0 -0
  224. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp_capability_advertising.py +0 -0
  225. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp_model_config_dispatch.py +0 -0
  226. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp_pinned_protocol_guard.py +0 -0
  227. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_acp_setup_failure_propagation.py +0 -0
  228. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_adapter_scripts.py +0 -0
  229. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_adapters.py +0 -0
  230. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_cli.py +0 -0
  231. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_env_resolution.py +0 -0
  232. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_gemini_defaults.py +0 -0
  233. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_idle_timeout_cli.py +0 -0
  234. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_model_decouple.py +0 -0
  235. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_registry.py +0 -0
  236. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_setup.py +0 -0
  237. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_agent_spec.py +0 -0
  238. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_base_install_imports.py +0 -0
  239. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_bedrock_thinking.py +0 -0
  240. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_branch.py +0 -0
  241. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_capture_trajectory.py +0 -0
  242. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_clawsbench_slice.py +0 -0
  243. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_cli_daytona.py +0 -0
  244. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_cli_docs_drift.py +0 -0
  245. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_cli_misc.py +0 -0
  246. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_compat_harbor_registry.py +0 -0
  247. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_config_redaction.py +0 -0
  248. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_connect_as_env.py +0 -0
  249. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_continuallearningbench_adapter.py +0 -0
  250. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_credential_env_scrub.py +0 -0
  251. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_no_host_paths.py +0 -0
  252. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_release_evidence.py +0 -0
  253. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_roadmap.py +0 -0
  254. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_symlink_ingestion.py +0 -0
  255. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_dashboard_sync.py +0 -0
  256. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_daytona_command_polling.py +0 -0
  257. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_daytona_litellm_runtime.py +0 -0
  258. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_daytona_status.py +0 -0
  259. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_docker_prune_scoping.py +0 -0
  260. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_docker_uploads.py +0 -0
  261. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_docs_examples.py +0 -0
  262. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eng50_capabilities.py +0 -0
  263. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_env_setup.py +0 -0
  264. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_environment_manifest_controls.py +0 -0
  265. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_filters_applied.py +0 -0
  266. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_sharding.py +0 -0
  267. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_single_task_summary.py +0 -0
  268. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_source_provenance.py +0 -0
  269. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_worker_retry.py +0 -0
  270. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_eval_zero_task_guard.py +0 -0
  271. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_evaluation_environment_manifest.py +0 -0
  272. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_exclude_tasks.py +0 -0
  273. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_harvey_lab_shim.py +0 -0
  274. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_hilbench_adapter.py +0 -0
  275. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_hosted_env.py +0 -0
  276. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_hosted_env_rollout_contract.py +0 -0
  277. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_inbound_adapter_manifest.py +0 -0
  278. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_inbound_adapters.py +0 -0
  279. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_integration_check_results.py +0 -0
  280. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_integration_run_suite.py +0 -0
  281. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_internet_policy.py +0 -0
  282. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_job.py +0 -0
  283. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_job_sequential_shared.py +0 -0
  284. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_job_sequential_shared_resume.py +0 -0
  285. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_judge_symlink_ingestion.py +0 -0
  286. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_learner_skills.py +0 -0
  287. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_learner_skills_traversal.py +0 -0
  288. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_learner_store.py +0 -0
  289. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_learner_store_persistence.py +0 -0
  290. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_config.py +0 -0
  291. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_hardening.py +0 -0
  292. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_logging.py +0 -0
  293. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_litellm_smoke.py +0 -0
  294. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_llm_judge.py +0 -0
  295. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_llm_judge_event_tags.py +0 -0
  296. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_llm_judge_verifier.py +0 -0
  297. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_memory_scorer.py +0 -0
  298. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_metrics.py +0 -0
  299. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_mock_openai_responses_server.py +0 -0
  300. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_monitor_scaffold.py +0 -0
  301. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_no_cross_provider_fallback.py +0 -0
  302. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_notification_order_real.py +0 -0
  303. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_oracle.py +0 -0
  304. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_oracle_chokepoint.py +0 -0
  305. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_paths_safe.py +0 -0
  306. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_paths_symlink_helpers.py +0 -0
  307. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_pi_acp_launcher.py +0 -0
  308. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_process.py +0 -0
  309. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_provider_auth_detection.py +0 -0
  310. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_providers.py +0 -0
  311. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_reexport.py +0 -0
  312. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_registry_invariants.py +0 -0
  313. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_release_version.py +0 -0
  314. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_resolve_env_helpers.py +0 -0
  315. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_reward_node.py +0 -0
  316. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_reward_unified_contract.py +0 -0
  317. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rewards.py +0 -0
  318. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rewards_jsonl.py +0 -0
  319. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_architecture.py +0 -0
  320. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_branch.py +0 -0
  321. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_config_path_coercion.py +0 -0
  322. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_environment.py +0 -0
  323. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_import_no_side_effects.py +0 -0
  324. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_on_ask_user_wiring.py +0 -0
  325. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_probe_sandbox_health.py +0 -0
  326. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rollout_upload.py +0 -0
  327. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_rubric_config.py +0 -0
  328. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_runtime.py +0 -0
  329. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_runtime_config_wired.py +0 -0
  330. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_runtime_live_sandbox.py +0 -0
  331. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox.py +0 -0
  332. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_exec_secret_handling.py +0 -0
  333. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_hardening.py +0 -0
  334. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
  335. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_multi_service.py +0 -0
  336. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_protocol.py +0 -0
  337. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_setup.py +0 -0
  338. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_snapshot_contract.py +0 -0
  339. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_upload_symlink.py +0 -0
  340. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_verifier_workspace.py +0 -0
  341. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scene.py +0 -0
  342. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scene_outbox_trial.py +0 -0
  343. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scene_parallel_group.py +0 -0
  344. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scene_result_aggregation.py +0 -0
  345. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_scoring.py +0 -0
  346. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sdk_internals.py +0 -0
  347. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_sdk_lockdown.py +0 -0
  348. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_cli.py +0 -0
  349. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_export_error_channel.py +0 -0
  350. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_export_failures.py +0 -0
  351. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_orchestration.py +0 -0
  352. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_session_request_permission_dispatch.py +0 -0
  353. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval.py +0 -0
  354. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_dryrun.py +0 -0
  355. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_integration.py +0 -0
  356. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_sweep.py +0 -0
  357. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_traversal.py +0 -0
  358. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_invocation_artifacts.py +0 -0
  359. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skill_policy.py +0 -0
  360. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skills.py +0 -0
  361. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skills_dir_agent_home_link.py +0 -0
  362. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skillsbench_harbor_parity.py +0 -0
  363. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_skillsbench_harbor_run_suite.py +0 -0
  364. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_smoke.py +0 -0
  365. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_task_check_eval_consistency.py +0 -0
  366. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_task_config.py +0 -0
  367. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_task_download.py +0 -0
  368. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_tasks.py +0 -0
  369. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_token_usage_normalization.py +0 -0
  370. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trace_import_cli.py +0 -0
  371. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trace_task_gen_traversal.py +0 -0
  372. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trace_to_task_evidence.py +0 -0
  373. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_traces_huggingface.py +0 -0
  374. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_traces_parsers.py +0 -0
  375. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_traces_task_gen.py +0 -0
  376. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_train_mode_artifact_emission.py +0 -0
  377. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trajectory_integration.py +0 -0
  378. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trajectory_streaming.py +0 -0
  379. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trial_agent_timeout_verify.py +0 -0
  380. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trial_install_agent_timeout.py +0 -0
  381. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_trial_litellm_runtime.py +0 -0
  382. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_usage_litellm.py +0 -0
  383. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_usage_required.py +0 -0
  384. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_usage_tracking.py +0 -0
  385. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_user.py +0 -0
  386. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_verifier_multi_container.py +0 -0
  387. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_verifier_output.py +0 -0
  388. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_verifier_output_freshness.py +0 -0
  389. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_verify.py +0 -0
  390. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_workflow_action_pinning.py +0 -0
  391. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/test_yaml_config.py +0 -0
  392. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/__init__.py +0 -0
  393. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_export.py +0 -0
  394. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_export_nan_handling.py +0 -0
  395. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_redaction.py +0 -0
  396. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_step_granularity.py +0 -0
  397. {benchflow-0.5.0 → benchflow-0.5.1.dev869}/tests/trajectories/test_tree.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchflow
3
- Version: 0.5.0
3
+ Version: 0.5.1.dev869
4
4
  Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
5
5
  Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
6
6
  Project-URL: Repository, https://github.com/benchflow-ai/benchflow
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchflow"
3
- version = "0.5.0"
3
+ version = "0.5.1.dev869"
4
4
  description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -17,6 +17,7 @@ from benchflow.trajectories.metrics import (
17
17
  count_skill_invocations,
18
18
  result_skill_invocations,
19
19
  )
20
+ from benchflow.usage_tracking import is_trusted_usage_source
20
21
 
21
22
  # Phase keys produced by Rollout (see rollout.py — environment_setup,
22
23
  # agent_setup, agent_execution, verifier, total). Kept here so summary
@@ -35,7 +36,7 @@ def agent_result_from_rollout(result: RolloutResult) -> dict[str, Any]:
35
36
  n_skill_invocations = result.n_skill_invocations or count_skill_invocations(
36
37
  result.trajectory
37
38
  )
38
- return {
39
+ agent_result = {
39
40
  "n_tool_calls": result.n_tool_calls,
40
41
  "n_skill_invocations": n_skill_invocations,
41
42
  "n_prompts": result.n_prompts,
@@ -48,6 +49,9 @@ def agent_result_from_rollout(result: RolloutResult) -> dict[str, Any]:
48
49
  "usage_source": result.usage_source,
49
50
  "price_source": result.price_source,
50
51
  }
52
+ if getattr(result, "usage_details", None) is not None:
53
+ agent_result["usage_details"] = result.usage_details
54
+ return agent_result
51
55
 
52
56
 
53
57
  def rollout_result_payload(
@@ -105,7 +109,7 @@ def usage_summary(results: dict[str, dict]) -> dict[str, Any]:
105
109
  covered = [
106
110
  r
107
111
  for r in completed
108
- if (r.get("agent_result") or {}).get("usage_source") == "provider_response"
112
+ if is_trusted_usage_source((r.get("agent_result") or {}).get("usage_source"))
109
113
  ]
110
114
 
111
115
  def total(field: str) -> int:
@@ -380,6 +380,7 @@ class ACPClient:
380
380
  # vendored ``StopReason`` enum so consumers keep ``.value`` / member
381
381
  # comparisons working.
382
382
  self._session.stop_reason = StopReason(prompt_result.stop_reason)
383
+ self._session.record_prompt_usage(getattr(prompt_result, "usage", None))
383
384
  return prompt_result
384
385
 
385
386
  async def cancel(self) -> None:
@@ -3,6 +3,7 @@
3
3
  import logging
4
4
  from collections.abc import Callable
5
5
  from datetime import datetime
6
+ from typing import Any
6
7
 
7
8
  from benchflow.trajectories.metrics import is_skill_invocation_event
8
9
 
@@ -15,6 +16,81 @@ from .types import (
15
16
 
16
17
  logger = logging.getLogger(__name__)
17
18
 
19
+ ACPUsageSnapshot = dict[str, int | None]
20
+
21
+ _ACP_USAGE_FIELDS: tuple[str, ...] = (
22
+ "input_tokens",
23
+ "output_tokens",
24
+ "total_tokens",
25
+ "cached_read_tokens",
26
+ "cached_write_tokens",
27
+ "thought_tokens",
28
+ )
29
+
30
+
31
+ def _coerce_usage_int(value: object) -> int | None:
32
+ if value is None:
33
+ return None
34
+ if isinstance(value, bool):
35
+ return int(value)
36
+ if isinstance(value, int):
37
+ return value
38
+ if isinstance(value, float | str | bytes | bytearray):
39
+ try:
40
+ return int(value)
41
+ except ValueError:
42
+ return None
43
+ try:
44
+ return int(str(value))
45
+ except ValueError:
46
+ return None
47
+
48
+
49
+ def _usage_mapping(usage: object) -> dict[str, Any]:
50
+ if isinstance(usage, dict):
51
+ return {str(key): value for key, value in usage.items()}
52
+ dump = getattr(usage, "model_dump", None)
53
+ if callable(dump):
54
+ data = dump(by_alias=False, exclude_none=True)
55
+ if isinstance(data, dict):
56
+ alias_data = dump(by_alias=True, exclude_none=True)
57
+ if isinstance(alias_data, dict):
58
+ data = {**alias_data, **data}
59
+ return data
60
+ return {
61
+ field: getattr(usage, field)
62
+ for field in _ACP_USAGE_FIELDS
63
+ if hasattr(usage, field)
64
+ }
65
+
66
+
67
+ def normalize_acp_usage(usage: object | None) -> ACPUsageSnapshot | None:
68
+ """Normalize SDK ACP usage into BenchFlow's snake_case token counters."""
69
+ if usage is None:
70
+ return None
71
+ raw = _usage_mapping(usage)
72
+ if not raw:
73
+ return None
74
+ aliases = {
75
+ "input_tokens": ("input_tokens", "inputTokens"),
76
+ "output_tokens": ("output_tokens", "outputTokens"),
77
+ "total_tokens": ("total_tokens", "totalTokens"),
78
+ "cached_read_tokens": ("cached_read_tokens", "cachedReadTokens"),
79
+ "cached_write_tokens": ("cached_write_tokens", "cachedWriteTokens"),
80
+ "thought_tokens": ("thought_tokens", "thoughtTokens"),
81
+ }
82
+ snapshot: ACPUsageSnapshot = {}
83
+ for field, names in aliases.items():
84
+ value = None
85
+ for name in names:
86
+ if name in raw:
87
+ value = raw[name]
88
+ break
89
+ snapshot[field] = _coerce_usage_int(value)
90
+ if all(value is None for value in snapshot.values()):
91
+ return None
92
+ return snapshot
93
+
18
94
 
19
95
  def _is_skill_tool_call(
20
96
  kind: object, title: object = "", content: object = None
@@ -93,6 +169,7 @@ class ACPSession:
93
169
  self.tool_calls: list[ToolCallRecord] = []
94
170
  self._tool_call_map: dict[str, ToolCallRecord] = {}
95
171
  self.stop_reason: StopReason | None = None
172
+ self.usage_snapshots: list[ACPUsageSnapshot] = []
96
173
  self.created_at = datetime.now()
97
174
  self.events: list[dict] = []
98
175
  self._pending_text: list[dict] = []
@@ -124,6 +201,20 @@ class ACPSession:
124
201
  self._flush_agent_text()
125
202
  self._notify_change()
126
203
 
204
+ def record_prompt_usage(self, usage: object | None) -> None:
205
+ """Record cumulative ACP token usage returned by session/prompt."""
206
+ snapshot = normalize_acp_usage(usage)
207
+ if snapshot is None:
208
+ return
209
+ self.usage_snapshots.append(snapshot)
210
+ self._notify_change()
211
+
212
+ def latest_usage_totals(self) -> ACPUsageSnapshot | None:
213
+ """Return the latest cumulative ACP usage snapshot, if any."""
214
+ if not self.usage_snapshots:
215
+ return None
216
+ return dict(self.usage_snapshots[-1])
217
+
127
218
  def _flush_agent_text(self) -> None:
128
219
  """Flush pending text events, merging consecutive same-type chunks."""
129
220
  if not self._pending_text:
@@ -47,9 +47,17 @@ _CODEX_ACCESS_TOKEN_ENV = "CODEX_ACCESS_TOKEN"
47
47
  _CODEX_AUTH_JSON_ENV = "CODEX_AUTH_JSON"
48
48
  _CLAUDE_CODE_OAUTH_TOKEN_ENV = "CLAUDE_CODE_OAUTH_TOKEN"
49
49
  _CLAUDE_OAUTH_TOKEN_ENV = "CLAUDE_OAUTH_TOKEN"
50
+ _SUBSCRIPTION_AUTH_MARKER = "_BENCHFLOW_SUBSCRIPTION_AUTH"
50
51
  _CUSTOM_OPENAI_ENDPOINT_KEYS = frozenset(
51
52
  {"BENCHFLOW_PROVIDER_BASE_URL", "OPENAI_BASE_URL"}
52
53
  )
54
+ _LITELLM_RUNTIME_MARKER_KEYS = frozenset(
55
+ {
56
+ "BENCHFLOW_LITELLM_MASTER_KEY",
57
+ "BENCHFLOW_LITELLM_MODEL_ALIAS",
58
+ "BENCHFLOW_LITELLM_MODEL_VIA_ENV",
59
+ }
60
+ )
53
61
  _CANONICAL_OPENAI_URL = "https://api.openai.com/v1"
54
62
  _GENERIC_PROVIDER_OVERRIDE_KEYS = frozenset(
55
63
  {
@@ -368,6 +376,59 @@ def _has_codex_auth_json_auth(
368
376
  ) and bool(agent_env.get(_CODEX_AUTH_JSON_ENV))
369
377
 
370
378
 
379
+ def uses_native_subscription_auth(
380
+ agent: str,
381
+ model: str | None,
382
+ agent_env: dict[str, str],
383
+ ) -> bool:
384
+ """Return True when an agent should use CLI/subscription auth directly.
385
+
386
+ This is the Harbor-style split point: API-key runs can be routed through
387
+ LiteLLM, while subscription-auth runs stay on the native Codex/Claude ACP
388
+ path and report usage from the agent protocol response.
389
+ """
390
+ if agent_env.get("BENCHFLOW_PROVIDER_NAME") == "litellm" or any(
391
+ agent_env.get(key) for key in _LITELLM_RUNTIME_MARKER_KEYS
392
+ ):
393
+ return False
394
+
395
+ if agent == "codex-acp":
396
+ if agent_env.get("OPENAI_API_KEY"):
397
+ return False
398
+ required_key = "OPENAI_API_KEY"
399
+ if not _can_use_codex_subscription_auth(
400
+ agent,
401
+ model,
402
+ required_key,
403
+ agent_env,
404
+ ):
405
+ return False
406
+ return (
407
+ bool(agent_env.get(_CODEX_ACCESS_TOKEN_ENV))
408
+ or bool(agent_env.get(_CODEX_AUTH_JSON_ENV))
409
+ or agent_env.get(_SUBSCRIPTION_AUTH_MARKER) == "1"
410
+ or check_subscription_auth(agent, required_key)
411
+ )
412
+
413
+ if agent == "claude-agent-acp":
414
+ if agent_env.get("ANTHROPIC_API_KEY"):
415
+ return False
416
+ if model is not None:
417
+ from benchflow.agents.registry import infer_env_key_for_model
418
+
419
+ if infer_env_key_for_model(model) != "ANTHROPIC_API_KEY":
420
+ return False
421
+ return (
422
+ bool(agent_env.get(_CLAUDE_CODE_OAUTH_TOKEN_ENV))
423
+ or bool(agent_env.get(_CLAUDE_OAUTH_TOKEN_ENV))
424
+ or bool(agent_env.get("ANTHROPIC_AUTH_TOKEN"))
425
+ or agent_env.get(_SUBSCRIPTION_AUTH_MARKER) == "1"
426
+ or check_subscription_auth(agent, "ANTHROPIC_API_KEY")
427
+ )
428
+
429
+ return False
430
+
431
+
371
432
  def inject_vertex_credentials(agent_env: dict[str, str], model: str) -> None:
372
433
  """Inject ADC credentials and defaults for Vertex AI models."""
373
434
  from benchflow.agents.registry import is_vertex_model
@@ -668,7 +729,7 @@ def resolve_agent_env(
668
729
  required_key,
669
730
  agent_env,
670
731
  ) and check_subscription_auth(agent, required_key):
671
- agent_env["_BENCHFLOW_SUBSCRIPTION_AUTH"] = "1"
732
+ agent_env[_SUBSCRIPTION_AUTH_MARKER] = "1"
672
733
  logger.info(
673
734
  "Using host subscription auth (no %s set)",
674
735
  required_key,
@@ -701,7 +762,7 @@ def resolve_agent_env(
701
762
  and _can_use_subscription_auth(agent, model, req_key, agent_env)
702
763
  and check_subscription_auth(agent, req_key)
703
764
  ):
704
- agent_env["_BENCHFLOW_SUBSCRIPTION_AUTH"] = "1"
765
+ agent_env[_SUBSCRIPTION_AUTH_MARKER] = "1"
705
766
  logger.info(
706
767
  "Using host subscription auth (no %s set)",
707
768
  req_key,
@@ -21,6 +21,7 @@ from benchflow._utils.scoring import (
21
21
  pass_rate_excl_errors,
22
22
  )
23
23
  from benchflow.trajectories.metrics import result_skill_invocations
24
+ from benchflow.usage_tracking import is_trusted_usage_source
24
25
 
25
26
  logger = logging.getLogger(__name__)
26
27
 
@@ -199,7 +200,7 @@ class BenchmarkMetrics:
199
200
  return [
200
201
  t
201
202
  for t in self.tasks
202
- if t.completed and t.usage_source == "provider_response"
203
+ if t.completed and is_trusted_usage_source(t.usage_source)
203
204
  ]
204
205
 
205
206
  @property
@@ -87,9 +87,10 @@ class RolloutResult:
87
87
  total_tokens: Sum of input, output, cache-read, and cache-creation tokens,
88
88
  or None when provider telemetry was unavailable.
89
89
  cost_usd: Provider cost estimate in USD, or None when unavailable.
90
- usage_source: Provider telemetry source. One of "provider_response" or
91
- "unavailable".
90
+ usage_source: Token telemetry source. One of "provider_response",
91
+ "agent_native_acp", or "unavailable".
92
92
  price_source: Pricing table version used for cost_usd, or None.
93
+ usage_details: Optional source-specific telemetry details.
93
94
  error: Error description string, or None on success.
94
95
  error_category: Stable category for ``error``, or None on success.
95
96
  verifier_error: Verifier error description, or None if verifier succeeded
@@ -139,6 +140,7 @@ class RolloutResult:
139
140
  cost_usd: float | None = None,
140
141
  usage_source: str = "unavailable",
141
142
  price_source: str | None = None,
143
+ usage_details: dict[str, Any] | None = None,
142
144
  error: str | None = None,
143
145
  error_category: str | None = None,
144
146
  verifier_error: str | None = None,
@@ -170,6 +172,7 @@ class RolloutResult:
170
172
  self.cost_usd = cost_usd
171
173
  self.usage_source = usage_source
172
174
  self.price_source = price_source
175
+ self.usage_details = usage_details
173
176
  self.error = error
174
177
  self.error_category = error_category
175
178
  self.verifier_error = verifier_error
@@ -13,6 +13,7 @@ from benchflow.trajectories.types import (
13
13
  LLMResponse,
14
14
  Trajectory,
15
15
  )
16
+ from benchflow.usage_tracking import usage_unavailable
16
17
 
17
18
  _PROVIDER_AUTH_STATUS_CODES = (401, 403)
18
19
  _STATUS_KEYS = {
@@ -353,19 +354,6 @@ def trajectory_from_litellm_callback_log(
353
354
  return trajectory
354
355
 
355
356
 
356
- def usage_unavailable() -> dict[str, Any]:
357
- return {
358
- "n_input_tokens": 0,
359
- "n_output_tokens": 0,
360
- "n_cache_read_tokens": 0,
361
- "n_cache_creation_tokens": 0,
362
- "total_tokens": 0,
363
- "cost_usd": None,
364
- "usage_source": "unavailable",
365
- "price_source": None,
366
- }
367
-
368
-
369
357
  def extract_usage_from_trajectory(
370
358
  trajectory: Trajectory | None,
371
359
  *,
@@ -25,6 +25,7 @@ import httpx
25
25
  import yaml
26
26
 
27
27
  from benchflow.agents.codex_config import apply_codex_provider_config
28
+ from benchflow.agents.env import uses_native_subscription_auth
28
29
  from benchflow.agents.registry import AGENTS
29
30
  from benchflow.providers.litellm_config import (
30
31
  LITELLM_MASTER_KEY_ENV,
@@ -38,10 +39,9 @@ from benchflow.providers.litellm_logging import (
38
39
  callback_module_source,
39
40
  extract_usage_from_trajectory,
40
41
  trajectory_from_litellm_callback_log,
41
- usage_unavailable,
42
42
  )
43
43
  from benchflow.trajectories.types import Trajectory
44
- from benchflow.usage_tracking import UsageTrackingConfig
44
+ from benchflow.usage_tracking import UsageTrackingConfig, usage_unavailable
45
45
 
46
46
  logger = logging.getLogger(__name__)
47
47
 
@@ -961,6 +961,13 @@ async def ensure_litellm_runtime(
961
961
  reason="usage_tracking=off leaves provider traffic untouched",
962
962
  )
963
963
 
964
+ if uses_native_subscription_auth(agent, model, agent_env):
965
+ return await _skip_litellm_runtime(
966
+ agent_env,
967
+ runtime,
968
+ reason="native subscription auth will use agent ACP usage telemetry",
969
+ )
970
+
964
971
  if not needs_litellm_runtime(agent, model):
965
972
  if usage_cfg.mode == "required" and agent != "oracle":
966
973
  raise RuntimeError(
@@ -103,7 +103,13 @@ from benchflow.trajectories._capture import (
103
103
  from benchflow.trajectories.metrics import count_skill_invocations
104
104
  from benchflow.trajectories.tree import RolloutNode, RolloutTree, Step
105
105
  from benchflow.trajectories.types import redact_acp_trajectory_jsonl
106
- from benchflow.usage_tracking import UsageTrackingConfig
106
+ from benchflow.usage_tracking import (
107
+ USAGE_SOURCE_AGENT_NATIVE_ACP,
108
+ USAGE_SOURCE_PROVIDER_RESPONSE,
109
+ UsageTrackingConfig,
110
+ is_token_usage_available,
111
+ usage_unavailable,
112
+ )
107
113
 
108
114
  logger = logging.getLogger(__name__)
109
115
 
@@ -129,6 +135,75 @@ def _provider_auth_status_from_runtime(runtime: Any) -> int | None:
129
135
  return None
130
136
 
131
137
 
138
+ _NATIVE_ACP_USAGE_SNAPSHOT_TO_RESULT = {
139
+ "input_tokens": "n_input_tokens",
140
+ "output_tokens": "n_output_tokens",
141
+ "cached_read_tokens": "n_cache_read_tokens",
142
+ "cached_write_tokens": "n_cache_creation_tokens",
143
+ "total_tokens": "total_tokens",
144
+ }
145
+
146
+
147
+ def _zero_native_acp_usage_metrics() -> dict[str, Any]:
148
+ return {**usage_unavailable(), "usage_details": {"thought_tokens": 0}}
149
+
150
+
151
+ def _as_nonnegative_int(value: object) -> int:
152
+ if value is None:
153
+ return 0
154
+ if isinstance(value, bool):
155
+ return int(value)
156
+ if isinstance(value, int):
157
+ return max(value, 0)
158
+ if isinstance(value, float | str | bytes | bytearray):
159
+ try:
160
+ return max(int(value), 0)
161
+ except ValueError:
162
+ return 0
163
+ try:
164
+ return max(int(str(value)), 0)
165
+ except ValueError:
166
+ return 0
167
+
168
+
169
+ def _native_acp_usage_delta(
170
+ previous: dict[str, int | None] | None,
171
+ current: dict[str, int | None],
172
+ ) -> dict[str, int]:
173
+ delta: dict[str, int] = {}
174
+ for usage_field in (
175
+ "input_tokens",
176
+ "output_tokens",
177
+ "cached_read_tokens",
178
+ "cached_write_tokens",
179
+ "thought_tokens",
180
+ ):
181
+ current_value = _as_nonnegative_int(current.get(usage_field))
182
+ previous_value = (
183
+ _as_nonnegative_int(previous.get(usage_field)) if previous else 0
184
+ )
185
+ delta[usage_field] = max(current_value - previous_value, 0)
186
+
187
+ current_total = current.get("total_tokens")
188
+ if current_total is not None:
189
+ current_value = _as_nonnegative_int(current_total)
190
+ previous_value = (
191
+ _as_nonnegative_int(previous.get("total_tokens"))
192
+ if previous and previous.get("total_tokens") is not None
193
+ else 0
194
+ )
195
+ delta["total_tokens"] = max(current_value - previous_value, 0)
196
+ else:
197
+ delta["total_tokens"] = (
198
+ delta["input_tokens"]
199
+ + delta["output_tokens"]
200
+ + delta["cached_read_tokens"]
201
+ + delta["cached_write_tokens"]
202
+ + delta["thought_tokens"]
203
+ )
204
+ return delta
205
+
206
+
132
207
  def _task_disallows_internet(task: Any) -> bool:
133
208
  """Return True when task.toml requests no internet for the agent task."""
134
209
  env_config = getattr(getattr(task, "config", None), "environment", None)
@@ -537,6 +612,7 @@ def _build_rollout_result(
537
612
  cost_usd: float | None = None,
538
613
  usage_source: str = "unavailable",
539
614
  price_source: str | None = None,
615
+ usage_details: dict[str, Any] | None = None,
540
616
  usage_tracking: dict[str, Any] | None = None,
541
617
  evolved_skills: dict[str, str] | None = None,
542
618
  source_provenance: dict[str, Any] | None = None,
@@ -588,6 +664,7 @@ def _build_rollout_result(
588
664
  cost_usd=cost_usd,
589
665
  usage_source=usage_source,
590
666
  price_source=price_source,
667
+ usage_details=usage_details,
591
668
  error=error,
592
669
  error_category=error_category,
593
670
  verifier_error=verifier_error,
@@ -615,6 +692,8 @@ def _build_rollout_result(
615
692
  "usage_source": result.usage_source,
616
693
  "price_source": result.price_source,
617
694
  }
695
+ if result.usage_details is not None:
696
+ agent_result["usage_details"] = result.usage_details
618
697
  final_metrics = final_metrics_from_agent_result(agent_result)
619
698
  trajectory_summary = trajectory_summary_from_events(
620
699
  trajectory,
@@ -1178,6 +1257,8 @@ class Rollout:
1178
1257
  self._task_skill_policy: TaskSkillPolicy | None = None
1179
1258
  self._usage_runtime: Any = None
1180
1259
  self._usage_metrics: dict[str, Any] = self._planes.extract_usage(None)
1260
+ self._native_usage_metrics: dict[str, Any] = _zero_native_acp_usage_metrics()
1261
+ self._native_usage_checkpoint: dict[str, int | None] | None = None
1181
1262
  # Provider 401/403 status snapshotted during cleanup, after the usage
1182
1263
  # proxy imports its captures (Daytona's SandboxUsageProxy only fills
1183
1264
  # trajectory on stop()). Read by _provider_auth_status() so ACP-error
@@ -1649,6 +1730,7 @@ class Rollout:
1649
1730
  agent_cwd=self._agent_cwd,
1650
1731
  reasoning_effort=cfg.primary_reasoning_effort,
1651
1732
  )
1733
+ self._native_usage_checkpoint = None
1652
1734
  self._reapply_ask_user_handler()
1653
1735
  self._attach_trajectory_writer(rollout_dir)
1654
1736
 
@@ -1827,6 +1909,7 @@ class Rollout:
1827
1909
  self._n_tool_calls += new_tools
1828
1910
  self._executed_prompts.extend(effective_prompts)
1829
1911
  self._trajectory_source = "acp"
1912
+ self._collect_native_acp_usage()
1830
1913
 
1831
1914
  # Grow the tree at Step-level granularity — one Step per ACP event
1832
1915
  # (tool_call, agent_message, agent_thought, user_message). A single
@@ -1859,6 +1942,46 @@ class Rollout:
1859
1942
  self._phase = "executed"
1860
1943
  return trajectory, n_tool_calls
1861
1944
 
1945
+ def _collect_native_acp_usage(self) -> None:
1946
+ """Accumulate ACP PromptResponse.usage deltas for native subscription runs."""
1947
+ session = getattr(self, "_session", None)
1948
+ latest_fn = getattr(session, "latest_usage_totals", None)
1949
+ if not callable(latest_fn):
1950
+ return
1951
+ latest = latest_fn()
1952
+ if not latest:
1953
+ return
1954
+ previous = getattr(self, "_native_usage_checkpoint", None)
1955
+ delta = _native_acp_usage_delta(previous, latest)
1956
+ self._native_usage_checkpoint = dict(latest)
1957
+ if not any(delta.values()):
1958
+ return
1959
+
1960
+ metrics = dict(
1961
+ getattr(self, "_native_usage_metrics", _zero_native_acp_usage_metrics())
1962
+ )
1963
+ for (
1964
+ snapshot_field,
1965
+ result_field,
1966
+ ) in _NATIVE_ACP_USAGE_SNAPSHOT_TO_RESULT.items():
1967
+ if result_field == "total_tokens":
1968
+ continue
1969
+ metrics[result_field] = _as_nonnegative_int(metrics.get(result_field)) + (
1970
+ delta.get(snapshot_field) or 0
1971
+ )
1972
+ metrics["total_tokens"] = _as_nonnegative_int(metrics.get("total_tokens")) + (
1973
+ delta.get("total_tokens") or 0
1974
+ )
1975
+ details = dict(metrics.get("usage_details") or {})
1976
+ details["thought_tokens"] = _as_nonnegative_int(
1977
+ details.get("thought_tokens")
1978
+ ) + (delta.get("thought_tokens") or 0)
1979
+ metrics["usage_details"] = details
1980
+ metrics["usage_source"] = USAGE_SOURCE_AGENT_NATIVE_ACP
1981
+ metrics["cost_usd"] = None
1982
+ metrics["price_source"] = None
1983
+ self._native_usage_metrics = metrics
1984
+
1862
1985
  def _build_step_batch(self, new_events: list[dict], new_tools: int) -> list[Step]:
1863
1986
  """Build one Step per ACP event from the events appended this execute.
1864
1987
 
@@ -2102,7 +2225,9 @@ class Rollout:
2102
2225
  logger.warning(f"LLM trajectory write failed: {e}")
2103
2226
  finally:
2104
2227
  self._usage_runtime = None
2105
- self._enforce_required_usage_tracking()
2228
+
2229
+ self._finalize_usage_metrics()
2230
+ self._enforce_required_usage_tracking()
2106
2231
 
2107
2232
  if self._environment is not None:
2108
2233
  with contextlib.suppress(Exception):
@@ -2126,11 +2251,24 @@ class Rollout:
2126
2251
 
2127
2252
  self._phase = "cleaned"
2128
2253
 
2254
+ def _finalize_usage_metrics(self) -> None:
2255
+ """Prefer LiteLLM usage, otherwise use trusted native ACP usage."""
2256
+ current_metrics = getattr(
2257
+ self, "_usage_metrics", {"usage_source": "unavailable"}
2258
+ )
2259
+ if current_metrics.get("usage_source") == USAGE_SOURCE_PROVIDER_RESPONSE:
2260
+ return
2261
+ native_metrics = getattr(self, "_native_usage_metrics", None)
2262
+ if isinstance(native_metrics, dict) and is_token_usage_available(
2263
+ native_metrics
2264
+ ):
2265
+ self._usage_metrics = native_metrics
2266
+
2129
2267
  def _enforce_required_usage_tracking(self) -> None:
2130
2268
  usage_cfg = self._config.usage_tracking.with_env_defaults()
2131
2269
  if usage_cfg.mode != "required" or self._config.primary_agent == "oracle":
2132
2270
  return
2133
- if self._usage_metrics.get("usage_source") == "provider_response":
2271
+ if is_token_usage_available(getattr(self, "_usage_metrics", None)):
2134
2272
  return
2135
2273
  if self._error is not None:
2136
2274
  return
@@ -2721,7 +2859,7 @@ class Rollout:
2721
2859
  usage_source = str(self._usage_metrics.get("usage_source", "unavailable"))
2722
2860
  if usage_cfg.mode == "off":
2723
2861
  status = "off"
2724
- elif usage_source == "provider_response":
2862
+ elif is_token_usage_available(self._usage_metrics):
2725
2863
  status = "enabled"
2726
2864
  else:
2727
2865
  status = "unavailable"
@@ -7,8 +7,15 @@ from dataclasses import dataclass
7
7
  from typing import Any, Literal, cast
8
8
 
9
9
  UsageTrackingMode = Literal["auto", "required", "off"]
10
+ UsageSource = Literal["provider_response", "agent_native_acp", "unavailable"]
10
11
 
11
12
  USAGE_TRACKING_ENV = "BENCHFLOW_USAGE_TRACKING"
13
+ USAGE_SOURCE_PROVIDER_RESPONSE = "provider_response"
14
+ USAGE_SOURCE_AGENT_NATIVE_ACP = "agent_native_acp"
15
+ USAGE_SOURCE_UNAVAILABLE = "unavailable"
16
+ TRUSTED_USAGE_SOURCES: frozenset[str] = frozenset(
17
+ {USAGE_SOURCE_PROVIDER_RESPONSE, USAGE_SOURCE_AGENT_NATIVE_ACP}
18
+ )
12
19
 
13
20
  _MODES: set[str] = {"auto", "required", "off"}
14
21
  _LEGACY_USAGE_PROXY_KEYS: frozenset[str] = frozenset(
@@ -36,13 +43,39 @@ def _optional_mode(value: Any) -> UsageTrackingMode | None:
36
43
  return normalize_usage_tracking_mode(str(value))
37
44
 
38
45
 
46
+ def is_trusted_usage_source(value: object) -> bool:
47
+ """Return True for usage telemetry sources that satisfy required tracking."""
48
+ return str(value) in TRUSTED_USAGE_SOURCES
49
+
50
+
51
+ def is_token_usage_available(metrics: dict[str, Any] | None) -> bool:
52
+ """Return True when a usage metrics payload has trusted token telemetry."""
53
+ if not metrics:
54
+ return False
55
+ return is_trusted_usage_source(metrics.get("usage_source"))
56
+
57
+
58
+ def usage_unavailable() -> dict[str, Any]:
59
+ """Return the canonical empty token-usage metrics payload."""
60
+ return {
61
+ "n_input_tokens": 0,
62
+ "n_output_tokens": 0,
63
+ "n_cache_read_tokens": 0,
64
+ "n_cache_creation_tokens": 0,
65
+ "total_tokens": 0,
66
+ "cost_usd": None,
67
+ "usage_source": USAGE_SOURCE_UNAVAILABLE,
68
+ "price_source": None,
69
+ }
70
+
71
+
39
72
  @dataclass(frozen=True, init=False)
40
73
  class UsageTrackingConfig:
41
74
  """User-facing token/cost telemetry policy.
42
75
 
43
76
  ``mode`` is the operator contract:
44
- - ``auto`` records usage when the LiteLLM gateway can be started.
45
- - ``required`` fails before the agent runs when telemetry cannot be wired.
77
+ - ``auto`` records usage when LiteLLM or native ACP telemetry can be used.
78
+ - ``required`` fails when no trusted token telemetry can be captured.
46
79
  - ``off`` leaves provider traffic untouched.
47
80
  """
48
81
 
@@ -119,6 +152,8 @@ class UsageTrackingConfig:
119
152
  endpoint_kind = "sandbox" if environment == "daytona" else "host"
120
153
  if self.mode == "off":
121
154
  endpoint_kind = "none"
155
+ elif usage_source == USAGE_SOURCE_AGENT_NATIVE_ACP:
156
+ endpoint_kind = "agent_native"
122
157
  return {
123
158
  "requested": self.mode,
124
159
  "status": status,