benchflow 0.5.3.dev894__tar.gz → 0.5.3.dev899__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (398)
  1. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/PKG-INFO +1 -1
  2. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/pyproject.toml +1 -1
  3. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/registry.py +19 -7
  4. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_agent_registry.py +14 -5
  5. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_agent_setup.py +2 -2
  6. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/.gitignore +0 -0
  7. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/CHANGELOG.md +0 -0
  8. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/LICENSE +0 -0
  9. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/README.md +0 -0
  10. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/__init__.py +0 -0
  11. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_dotenv.py +0 -0
  12. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_paths.py +0 -0
  13. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_run.py +0 -0
  14. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_types.py +0 -0
  15. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/__init__.py +0 -0
  16. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/benchmark_repos.py +0 -0
  17. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/config.py +0 -0
  18. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/evaluation_results.py +0 -0
  19. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/json_safe.py +0 -0
  20. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/learner_memory.py +0 -0
  21. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/result_metadata.py +0 -0
  22. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/reward_events.py +0 -0
  23. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/scoring.py +0 -0
  24. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/source_provenance.py +0 -0
  25. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/task_authoring.py +0 -0
  26. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/_utils/yaml_loader.py +0 -0
  27. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/acp/__init__.py +0 -0
  28. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/acp/client.py +0 -0
  29. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/acp/container_transport.py +0 -0
  30. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/acp/runtime.py +0 -0
  31. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/acp/session.py +0 -0
  32. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/acp/transport.py +0 -0
  33. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/acp/types.py +0 -0
  34. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/adapters/__init__.py +0 -0
  35. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/adapters/harbor.py +0 -0
  36. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/adapters/inbound.py +0 -0
  37. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/adapters/inspect_ai.py +0 -0
  38. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/adapters/ors.py +0 -0
  39. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/adapters/terminal_bench.py +0 -0
  40. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/__init__.py +0 -0
  41. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/codex_config.py +0 -0
  42. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/credentials.py +0 -0
  43. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/env.py +0 -0
  44. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/errors.py +0 -0
  45. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
  46. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/install.py +0 -0
  47. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
  48. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/pi_acp_launcher.py +0 -0
  49. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/protocol.py +0 -0
  50. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/agents/providers.py +0 -0
  51. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/branch.py +0 -0
  52. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/cli/__init__.py +0 -0
  53. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/cli/main.py +0 -0
  54. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/cli/trace_import.py +0 -0
  55. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/compat/__init__.py +0 -0
  56. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/compat/harbor_registry.py +0 -0
  57. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/contracts/__init__.py +0 -0
  58. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/contracts/planes.py +0 -0
  59. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/contracts/user.py +0 -0
  60. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/demo_task/environment/Dockerfile +0 -0
  61. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/demo_task/instruction.md +0 -0
  62. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/demo_task/task.toml +0 -0
  63. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/demo_task/tests/test.sh +0 -0
  64. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/diagnostics.py +0 -0
  65. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/environment/__init__.py +0 -0
  66. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/environment/manifest.py +0 -0
  67. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/environment/manifest_env.py +0 -0
  68. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/environment/protocol.py +0 -0
  69. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/environment/readiness.py +0 -0
  70. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/eval_sharding.py +0 -0
  71. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/eval_worker.py +0 -0
  72. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/evaluation.py +0 -0
  73. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/experimental/__init__.py +0 -0
  74. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/experimental/mcp/__init__.py +0 -0
  75. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/experimental/mcp/hooks.py +0 -0
  76. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
  77. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/hosted_env.py +0 -0
  78. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/learner_skills.py +0 -0
  79. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/learner_store.py +0 -0
  80. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/metrics.py +0 -0
  81. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/models.py +0 -0
  82. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/monitor.py +0 -0
  83. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/providers/__init__.py +0 -0
  84. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
  85. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_config.py +0 -0
  86. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_logging.py +0 -0
  87. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/providers/litellm_runtime.py +0 -0
  88. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/providers/runtime.py +0 -0
  89. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/py.typed +0 -0
  90. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/README.md +0 -0
  91. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/__init__.py +0 -0
  92. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/builtins.py +0 -0
  93. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/events.py +0 -0
  94. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/file_readers.py +0 -0
  95. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/llm.py +0 -0
  96. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/memory_scorer.py +0 -0
  97. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/node.py +0 -0
  98. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/protocol.py +0 -0
  99. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/rubric.py +0 -0
  100. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/rubric_config.py +0 -0
  101. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rewards/validation.py +0 -0
  102. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rollout.py +0 -0
  103. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rollout_branch.py +0 -0
  104. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/rollout_planes.py +0 -0
  105. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/runtime.py +0 -0
  106. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/__init__.py +0 -0
  107. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_base.py +0 -0
  108. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose.py +0 -0
  109. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
  110. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
  111. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
  112. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
  113. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/_sdk_ops.py +0 -0
  114. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/daytona.py +0 -0
  115. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/docker.py +0 -0
  116. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/lockdown.py +0 -0
  117. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/metadata.py +0 -0
  118. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/modal_impl.py +0 -0
  119. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/process.py +0 -0
  120. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/protocol.py +0 -0
  121. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/services.py +0 -0
  122. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/setup.py +0 -0
  123. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/snapshot.py +0 -0
  124. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sandbox/user.py +0 -0
  125. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/scenes.py +0 -0
  126. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/sdk.py +0 -0
  127. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/self_gen.py +0 -0
  128. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/skill_eval/__init__.py +0 -0
  129. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/skill_eval/_core.py +0 -0
  130. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/skill_eval/gepa_export.py +0 -0
  131. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/skill_eval/schema.py +0 -0
  132. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/skill_policy.py +0 -0
  133. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/skills.py +0 -0
  134. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/task/__init__.py +0 -0
  135. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/task/config.py +0 -0
  136. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/task/env.py +0 -0
  137. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/task/paths.py +0 -0
  138. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/task/task.py +0 -0
  139. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/task/verifier.py +0 -0
  140. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/templates/__init__.py +0 -0
  141. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/templates/judge.py.tmpl +0 -0
  142. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/templates/test.sh.tmpl +0 -0
  143. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/traces/__init__.py +0 -0
  144. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/traces/huggingface.py +0 -0
  145. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/traces/local.py +0 -0
  146. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/traces/models.py +0 -0
  147. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/traces/parsers.py +0 -0
  148. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/traces/task_gen.py +0 -0
  149. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/__init__.py +0 -0
  150. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/_capture.py +0 -0
  151. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/export.py +0 -0
  152. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/metrics.py +0 -0
  153. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/otel.py +0 -0
  154. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/tree.py +0 -0
  155. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/types.py +0 -0
  156. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/trajectories/viewer.py +0 -0
  157. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/src/benchflow/usage_tracking.py +0 -0
  158. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/__init__.py +0 -0
  159. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/agents/__init__.py +0 -0
  160. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/agents/test_protocol.py +0 -0
  161. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/README.md +0 -0
  162. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
  163. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
  164. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
  165. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/instruction.md +0 -0
  166. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
  167. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/task.toml +0 -0
  168. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/acp_smoke/tests/test.sh +0 -0
  169. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/conformance-results.json +0 -0
  170. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/proof_multi_agent.py +0 -0
  171. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/proof_snapshot.py +0 -0
  172. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/run_conformance.py +0 -0
  173. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
  174. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/conftest.py +0 -0
  175. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/environment/__init__.py +0 -0
  176. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/environment/test_chibench_manifest.py +0 -0
  177. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/environment/test_clawsbench_manifest.py +0 -0
  178. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/environment/test_manifest.py +0 -0
  179. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/environment/test_manifest_env.py +0 -0
  180. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/environment/test_protocol.py +0 -0
  181. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/environment/test_readiness.py +0 -0
  182. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
  183. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/instruction.md +0 -0
  184. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/solution/solve.sh +0 -0
  185. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/task.toml +0 -0
  186. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/hello-world-task/tests/test.sh +0 -0
  187. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
  188. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
  189. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
  190. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
  191. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
  192. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
  193. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/test_claude.sh +0 -0
  194. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/test_codex.sh +0 -0
  195. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/test_codex_custom_provider.sh +0 -0
  196. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/test_gemini.sh +0 -0
  197. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/test_openclaw.sh +0 -0
  198. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/traces/minimal-claude.jsonl +0 -0
  199. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
  200. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/fixtures/mock_acp_agent.py +0 -0
  201. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
  202. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
  203. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/fixtures/mock_openai_responses_server.py +0 -0
  204. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/check_adapter_evidence.py +0 -0
  205. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/check_hosted_env_evidence.py +0 -0
  206. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/check_results.py +0 -0
  207. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
  208. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/check_trace_to_task_evidence.py +0 -0
  209. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/configs/claude-agent-acp.yaml +0 -0
  210. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/configs/codex-acp.yaml +0 -0
  211. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/configs/gemini.yaml +0 -0
  212. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
  213. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/configs/openclaw.yaml +0 -0
  214. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/configs/opencode.yaml +0 -0
  215. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/configs/openhands.yaml +0 -0
  216. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/configs/pi-acp.yaml +0 -0
  217. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/run.sh +0 -0
  218. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/run_suite.py +0 -0
  219. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/integration/suites/release.yaml +0 -0
  220. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_acp.py +0 -0
  221. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_acp_capability_advertising.py +0 -0
  222. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_acp_model_config_dispatch.py +0 -0
  223. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_acp_pinned_protocol_guard.py +0 -0
  224. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_acp_setup_failure_propagation.py +0 -0
  225. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_adapter_scripts.py +0 -0
  226. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_adapters.py +0 -0
  227. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_agent_cli.py +0 -0
  228. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_agent_env_resolution.py +0 -0
  229. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_agent_gemini_defaults.py +0 -0
  230. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_agent_idle_timeout_cli.py +0 -0
  231. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_agent_model_decouple.py +0 -0
  232. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_agent_spec.py +0 -0
  233. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_base_install_imports.py +0 -0
  234. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_bedrock_thinking.py +0 -0
  235. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_branch.py +0 -0
  236. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_capture_trajectory.py +0 -0
  237. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_clawsbench_slice.py +0 -0
  238. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_cli_daytona.py +0 -0
  239. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_cli_docs_drift.py +0 -0
  240. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_cli_misc.py +0 -0
  241. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_compat_harbor_registry.py +0 -0
  242. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_config_redaction.py +0 -0
  243. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_connect_as_env.py +0 -0
  244. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_continuallearningbench_adapter.py +0 -0
  245. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_dashboard_credential_env_scrub.py +0 -0
  246. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_dashboard_daytona_key.py +0 -0
  247. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_dashboard_no_host_paths.py +0 -0
  248. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_dashboard_release_evidence.py +0 -0
  249. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_dashboard_roadmap.py +0 -0
  250. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_dashboard_symlink_ingestion.py +0 -0
  251. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_dashboard_sync.py +0 -0
  252. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_daytona_command_polling.py +0 -0
  253. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_daytona_litellm_runtime.py +0 -0
  254. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_daytona_status.py +0 -0
  255. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_docker_prune_scoping.py +0 -0
  256. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_docker_uploads.py +0 -0
  257. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_docs_examples.py +0 -0
  258. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_eng50_capabilities.py +0 -0
  259. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_env_setup.py +0 -0
  260. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_environment_manifest_controls.py +0 -0
  261. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_eval_filters_applied.py +0 -0
  262. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_eval_sharding.py +0 -0
  263. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_eval_single_task_summary.py +0 -0
  264. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_eval_source_provenance.py +0 -0
  265. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_eval_worker_retry.py +0 -0
  266. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_eval_zero_task_guard.py +0 -0
  267. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_evaluation_environment_manifest.py +0 -0
  268. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_exclude_tasks.py +0 -0
  269. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_experiments_status.py +0 -0
  270. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_harvey_lab_shim.py +0 -0
  271. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_hf_scores.py +0 -0
  272. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_hilbench_adapter.py +0 -0
  273. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_hosted_env.py +0 -0
  274. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_hosted_env_rollout_contract.py +0 -0
  275. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_inbound_adapter_manifest.py +0 -0
  276. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_inbound_adapters.py +0 -0
  277. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_integration_check_results.py +0 -0
  278. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_integration_run_suite.py +0 -0
  279. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_internet_policy.py +0 -0
  280. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_job.py +0 -0
  281. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_job_sequential_shared.py +0 -0
  282. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_job_sequential_shared_resume.py +0 -0
  283. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_judge_symlink_ingestion.py +0 -0
  284. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_learner_skills.py +0 -0
  285. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_learner_skills_traversal.py +0 -0
  286. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_learner_store.py +0 -0
  287. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_learner_store_persistence.py +0 -0
  288. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_litellm_config.py +0 -0
  289. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_litellm_hardening.py +0 -0
  290. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_litellm_logging.py +0 -0
  291. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_litellm_runtime.py +0 -0
  292. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_litellm_smoke.py +0 -0
  293. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_llm_judge.py +0 -0
  294. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_llm_judge_event_tags.py +0 -0
  295. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_llm_judge_verifier.py +0 -0
  296. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_memory_scorer.py +0 -0
  297. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_metrics.py +0 -0
  298. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_mock_openai_responses_server.py +0 -0
  299. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_monitor_scaffold.py +0 -0
  300. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_native_acp_usage.py +0 -0
  301. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_no_cross_provider_fallback.py +0 -0
  302. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_notification_order_real.py +0 -0
  303. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_oracle.py +0 -0
  304. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_oracle_chokepoint.py +0 -0
  305. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_paths_safe.py +0 -0
  306. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_paths_symlink_helpers.py +0 -0
  307. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_pi_acp_launcher.py +0 -0
  308. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_process.py +0 -0
  309. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_provider_auth_detection.py +0 -0
  310. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_providers.py +0 -0
  311. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_reexport.py +0 -0
  312. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_registry_invariants.py +0 -0
  313. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_release_version.py +0 -0
  314. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_resolve_env_helpers.py +0 -0
  315. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_reward_node.py +0 -0
  316. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_reward_unified_contract.py +0 -0
  317. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rewards.py +0 -0
  318. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rewards_jsonl.py +0 -0
  319. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rollout_architecture.py +0 -0
  320. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rollout_branch.py +0 -0
  321. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rollout_config_path_coercion.py +0 -0
  322. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rollout_environment.py +0 -0
  323. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rollout_import_no_side_effects.py +0 -0
  324. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rollout_on_ask_user_wiring.py +0 -0
  325. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rollout_probe_sandbox_health.py +0 -0
  326. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rollout_upload.py +0 -0
  327. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_rubric_config.py +0 -0
  328. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_runtime.py +0 -0
  329. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_runtime_config_wired.py +0 -0
  330. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_runtime_live_sandbox.py +0 -0
  331. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox.py +0 -0
  332. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox_exec_secret_handling.py +0 -0
  333. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox_hardening.py +0 -0
  334. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
  335. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox_multi_service.py +0 -0
  336. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox_protocol.py +0 -0
  337. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox_setup.py +0 -0
  338. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox_snapshot_contract.py +0 -0
  339. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox_upload_symlink.py +0 -0
  340. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sandbox_verifier_workspace.py +0 -0
  341. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_scene.py +0 -0
  342. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_scene_outbox_trial.py +0 -0
  343. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_scene_parallel_group.py +0 -0
  344. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_scene_result_aggregation.py +0 -0
  345. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_scoring.py +0 -0
  346. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sdk_internals.py +0 -0
  347. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_sdk_lockdown.py +0 -0
  348. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_self_gen_cli.py +0 -0
  349. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_self_gen_export_error_channel.py +0 -0
  350. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_self_gen_export_failures.py +0 -0
  351. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_self_gen_orchestration.py +0 -0
  352. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_session_request_permission_dispatch.py +0 -0
  353. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skill_eval.py +0 -0
  354. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skill_eval_dryrun.py +0 -0
  355. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skill_eval_integration.py +0 -0
  356. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skill_eval_sweep.py +0 -0
  357. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skill_eval_traversal.py +0 -0
  358. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skill_invocation_artifacts.py +0 -0
  359. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skill_policy.py +0 -0
  360. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skills.py +0 -0
  361. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skills_dir_agent_home_link.py +0 -0
  362. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skillsbench_harbor_parity.py +0 -0
  363. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_skillsbench_harbor_run_suite.py +0 -0
  364. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_smoke.py +0 -0
  365. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_subscription_auth.py +0 -0
  366. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_task_check_eval_consistency.py +0 -0
  367. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_task_config.py +0 -0
  368. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_task_download.py +0 -0
  369. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_tasks.py +0 -0
  370. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_token_usage_normalization.py +0 -0
  371. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_trace_import_cli.py +0 -0
  372. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_trace_task_gen_traversal.py +0 -0
  373. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_trace_to_task_evidence.py +0 -0
  374. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_traces_huggingface.py +0 -0
  375. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_traces_parsers.py +0 -0
  376. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_traces_task_gen.py +0 -0
  377. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_train_mode_artifact_emission.py +0 -0
  378. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_trajectory_integration.py +0 -0
  379. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_trajectory_streaming.py +0 -0
  380. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_trial_agent_timeout_verify.py +0 -0
  381. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_trial_install_agent_timeout.py +0 -0
  382. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_trial_litellm_runtime.py +0 -0
  383. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_usage_litellm.py +0 -0
  384. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_usage_required.py +0 -0
  385. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_usage_tracking.py +0 -0
  386. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_user.py +0 -0
  387. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_verifier_multi_container.py +0 -0
  388. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_verifier_output.py +0 -0
  389. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_verifier_output_freshness.py +0 -0
  390. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_verify.py +0 -0
  391. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_workflow_action_pinning.py +0 -0
  392. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/test_yaml_config.py +0 -0
  393. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/trajectories/__init__.py +0 -0
  394. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/trajectories/test_export.py +0 -0
  395. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/trajectories/test_export_nan_handling.py +0 -0
  396. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/trajectories/test_redaction.py +0 -0
  397. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/trajectories/test_step_granularity.py +0 -0
  398. {benchflow-0.5.3.dev894 → benchflow-0.5.3.dev899}/tests/trajectories/test_tree.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchflow
3
- Version: 0.5.3.dev894
3
+ Version: 0.5.3.dev899
4
4
  Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
5
5
  Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
6
6
  Project-URL: Repository, https://github.com/benchflow-ai/benchflow
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchflow"
3
- version = "0.5.3.dev894"
3
+ version = "0.5.3.dev899"
4
4
  description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -106,6 +106,9 @@ def _apt_install(*packages: str) -> str:
106
106
  _BENCHFLOW_NODE_PREFIX = "/opt/benchflow/node"
107
107
  _BENCHFLOW_JS_AGENT_PREFIX = "/opt/benchflow/js-agents"
108
108
  _BENCHFLOW_BIN_PREFIX = "/opt/benchflow/bin"
109
+ _OPENHANDS_CLI_VERSION = "1.16.0"
110
+ _OPENHANDS_SDK_VERSION = "1.22.1"
111
+ _OPENHANDS_TOOLS_VERSION = "1.22.1"
109
112
  _JS_AGENT_PATH = (
110
113
  f"{_BENCHFLOW_BIN_PREFIX}:{_BENCHFLOW_JS_AGENT_PREFIX}/bin:"
111
114
  f"{_BENCHFLOW_NODE_PREFIX}/bin:$PATH"
@@ -543,15 +546,15 @@ AGENTS: dict[str, AgentConfig] = {
543
546
  install_cmd=(
544
547
  "export DEBIAN_FRONTEND=noninteractive && "
545
548
  'export PATH="$HOME/.local/bin:$PATH" && '
546
- "( command -v curl >/dev/null 2>&1 && command -v git >/dev/null 2>&1 || "
549
+ "( command -v curl >/dev/null 2>&1 || "
547
550
  " if command -v apt-get >/dev/null 2>&1; then "
548
- f" {_apt_install('curl', 'ca-certificates', 'git')}; "
551
+ f" {_apt_install('curl', 'ca-certificates')}; "
549
552
  " elif command -v dnf >/dev/null 2>&1; then "
550
- " dnf -y --allowerasing install curl ca-certificates git >/dev/null 2>&1; "
553
+ " dnf -y --allowerasing install curl ca-certificates >/dev/null 2>&1; "
551
554
  " elif command -v apk >/dev/null 2>&1; then "
552
- " apk add --no-cache curl ca-certificates git >/dev/null 2>&1; "
555
+ " apk add --no-cache curl ca-certificates >/dev/null 2>&1; "
553
556
  " else "
554
- " echo 'OpenHands GitHub install requires curl and git' >&2; "
557
+ " echo 'OpenHands install requires curl' >&2; "
555
558
  " exit 127; "
556
559
  " fi ) && "
557
560
  "( UV_OK=0; "
@@ -566,9 +569,18 @@ AGENTS: dict[str, AgentConfig] = {
566
569
  " curl -LsSf https://astral.sh/uv/install.sh | sh >/dev/null 2>&1 && "
567
570
  ' export PATH="$HOME/.local/bin:$PATH"; '
568
571
  " fi && "
572
+ # OpenHands CLI 1.16.0 pins openhands-sdk/tools 1.21.0. That one
573
+ # SDK release makes the synthetic `security_risk` tool field
574
+ # required whenever LLMSecurityAnalyzer is attached; the ACP path
575
+ # attaches it even under --always-approve, so Claude Opus can loop
576
+ # on validation errors until timeout. 1.22.x restores the intended
577
+ # default-to-UNKNOWN behavior without the API drift seen in 1.26.x.
578
+ f"printf 'openhands-sdk=={_OPENHANDS_SDK_VERSION}\\n"
579
+ f"openhands-tools=={_OPENHANDS_TOOLS_VERSION}\\n' "
580
+ "> /tmp/oh-sdk-overrides.txt && "
569
581
  "uv tool install --force --refresh "
570
- "--from 'git+https://github.com/OpenHands/OpenHands-CLI.git@main' "
571
- "openhands --python 3.12 && "
582
+ "--overrides /tmp/oh-sdk-overrides.txt "
583
+ f"openhands=={_OPENHANDS_CLI_VERSION} --python 3.12 && "
572
584
  " uv tool list | grep -q '^openhands\\b' ) && "
573
585
  # Let sandbox user traverse to uv-managed Python interpreter path.
574
586
  "chmod o+x /root /root/.local /root/.local/share "
@@ -99,20 +99,29 @@ class TestOpenHandsConfig:
99
99
  assert "$HOME/.agents/skills" in cfg.skill_paths
100
100
  assert "$WORKSPACE/.agents/skills" in cfg.skill_paths
101
101
 
102
- def test_openhands_install_cmd_forces_github_main(self):
102
+ def test_openhands_install_cmd_pins_stable_pypi_release(self):
103
103
  cfg = AGENTS["openhands"]
104
104
  assert (
105
- "apt-get -o Acquire::Retries=3 install -y -qq curl ca-certificates git"
105
+ "apt-get -o Acquire::Retries=3 install -y -qq curl ca-certificates"
106
106
  in cfg.install_cmd
107
107
  )
108
108
  assert (
109
109
  "uv tool install --force --refresh "
110
- "--from 'git+https://github.com/OpenHands/OpenHands-CLI.git@main' "
111
- "openhands --python 3.12" in cfg.install_cmd
110
+ "--overrides /tmp/oh-sdk-overrides.txt "
111
+ "openhands==1.16.0 --python 3.12" in cfg.install_cmd
112
112
  )
113
- assert "command -v git" in cfg.install_cmd
113
+ assert "OpenHands/OpenHands-CLI.git@main" not in cfg.install_cmd
114
114
  assert "install.openhands.dev/install.sh" not in cfg.install_cmd
115
115
 
116
+ def test_openhands_install_cmd_overrides_buggy_sdk_pin(self):
117
+ """Guards PR #644 against Opus timeouts from OpenHands SDK 1.21.0."""
118
+ cfg = AGENTS["openhands"]
119
+
120
+ assert "openhands-sdk==1.22.1" in cfg.install_cmd
121
+ assert "openhands-tools==1.22.1" in cfg.install_cmd
122
+ assert "openhands-sdk>=1.22.0" not in cfg.install_cmd
123
+ assert "--overrides /tmp/oh-sdk-overrides.txt" in cfg.install_cmd
124
+
116
125
  def test_openhands_install_cmd_does_not_deploy_bedrock_shim(self):
117
126
  """Guards the LiteLLM runtime refactor: Bedrock patches live with LiteLLM."""
118
127
  cfg = AGENTS["openhands"]
@@ -637,8 +637,8 @@ async def test_install_agent_writes_command_stdout_and_stderr_on_failure(
637
637
  assert log_text.startswith("$ ")
638
638
  assert (
639
639
  "uv tool install --force --refresh "
640
- "--from 'git+https://github.com/OpenHands/OpenHands-CLI.git@main' "
641
- "openhands --python 3.12" in log_text
640
+ "--overrides /tmp/oh-sdk-overrides.txt "
641
+ "openhands==1.16.0 --python 3.12" in log_text
642
642
  )
643
643
  assert "=== stderr ===" in log_text
644
644
  assert "uv: command not found" in log_text