benchflow 0.5.3.dev906__tar.gz → 0.5.3.dev908__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (412)
  1. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/CHANGELOG.md +10 -0
  2. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/PKG-INFO +1 -1
  3. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/pyproject.toml +1 -1
  4. benchflow-0.5.3.dev908/src/benchflow/cli/continue_cmd.py +236 -0
  5. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/cli/main.py +4 -0
  6. benchflow-0.5.3.dev908/src/benchflow/continue_run/__init__.py +35 -0
  7. benchflow-0.5.3.dev908/src/benchflow/continue_run/batch.py +125 -0
  8. benchflow-0.5.3.dev908/src/benchflow/continue_run/orchestrator.py +743 -0
  9. benchflow-0.5.3.dev908/src/benchflow/continue_run/replay_proxy.py +409 -0
  10. benchflow-0.5.3.dev908/src/benchflow/continue_run/run_folder.py +211 -0
  11. benchflow-0.5.3.dev908/src/benchflow/continue_run/sandbox_proxy.py +472 -0
  12. benchflow-0.5.3.dev908/tests/continue_run/_helpers.py +100 -0
  13. benchflow-0.5.3.dev908/tests/continue_run/test_batch.py +111 -0
  14. benchflow-0.5.3.dev908/tests/continue_run/test_orchestrator.py +302 -0
  15. benchflow-0.5.3.dev908/tests/continue_run/test_replay_proxy.py +176 -0
  16. benchflow-0.5.3.dev908/tests/continue_run/test_run_folder.py +97 -0
  17. benchflow-0.5.3.dev908/tests/trajectories/__init__.py +0 -0
  18. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/.gitignore +0 -0
  19. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/LICENSE +0 -0
  20. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/README.md +0 -0
  21. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/__init__.py +0 -0
  22. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_dotenv.py +0 -0
  23. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_paths.py +0 -0
  24. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_run.py +0 -0
  25. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_types.py +0 -0
  26. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/__init__.py +0 -0
  27. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/benchmark_repos.py +0 -0
  28. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/config.py +0 -0
  29. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/evaluation_results.py +0 -0
  30. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/json_safe.py +0 -0
  31. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/learner_memory.py +0 -0
  32. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/result_metadata.py +0 -0
  33. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/reward_events.py +0 -0
  34. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/scoring.py +0 -0
  35. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/source_provenance.py +0 -0
  36. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/task_authoring.py +0 -0
  37. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/_utils/yaml_loader.py +0 -0
  38. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/__init__.py +0 -0
  39. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/client.py +0 -0
  40. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/container_transport.py +0 -0
  41. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/runtime.py +0 -0
  42. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/session.py +0 -0
  43. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/transport.py +0 -0
  44. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/acp/types.py +0 -0
  45. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/__init__.py +0 -0
  46. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/harbor.py +0 -0
  47. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/inbound.py +0 -0
  48. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/inspect_ai.py +0 -0
  49. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/ors.py +0 -0
  50. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/adapters/terminal_bench.py +0 -0
  51. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/__init__.py +0 -0
  52. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/codex_config.py +0 -0
  53. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/credentials.py +0 -0
  54. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/env.py +0 -0
  55. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/errors.py +0 -0
  56. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
  57. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/install.py +0 -0
  58. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
  59. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/pi_acp_launcher.py +0 -0
  60. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/protocol.py +0 -0
  61. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/providers.py +0 -0
  62. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/agents/registry.py +0 -0
  63. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/branch.py +0 -0
  64. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/cli/__init__.py +0 -0
  65. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/cli/trace_import.py +0 -0
  66. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/compat/__init__.py +0 -0
  67. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/compat/harbor_registry.py +0 -0
  68. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/contracts/__init__.py +0 -0
  69. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/contracts/planes.py +0 -0
  70. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/contracts/user.py +0 -0
  71. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/demo_task/environment/Dockerfile +0 -0
  72. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/demo_task/instruction.md +0 -0
  73. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/demo_task/task.toml +0 -0
  74. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/demo_task/tests/test.sh +0 -0
  75. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/diagnostics.py +0 -0
  76. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/__init__.py +0 -0
  77. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/manifest.py +0 -0
  78. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/manifest_env.py +0 -0
  79. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/protocol.py +0 -0
  80. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/environment/readiness.py +0 -0
  81. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/eval_sharding.py +0 -0
  82. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/eval_worker.py +0 -0
  83. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/evaluation.py +0 -0
  84. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/experimental/__init__.py +0 -0
  85. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/experimental/mcp/__init__.py +0 -0
  86. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/experimental/mcp/hooks.py +0 -0
  87. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
  88. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/hosted_env.py +0 -0
  89. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/learner_skills.py +0 -0
  90. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/learner_store.py +0 -0
  91. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/metrics.py +0 -0
  92. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/models.py +0 -0
  93. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/monitor.py +0 -0
  94. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/__init__.py +0 -0
  95. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/litellm_bedrock_patch.py +0 -0
  96. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/litellm_config.py +0 -0
  97. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/litellm_logging.py +0 -0
  98. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/litellm_runtime.py +0 -0
  99. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/providers/runtime.py +0 -0
  100. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/py.typed +0 -0
  101. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/README.md +0 -0
  102. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/__init__.py +0 -0
  103. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/builtins.py +0 -0
  104. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/events.py +0 -0
  105. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/file_readers.py +0 -0
  106. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/llm.py +0 -0
  107. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/memory_scorer.py +0 -0
  108. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/node.py +0 -0
  109. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/protocol.py +0 -0
  110. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/rubric.py +0 -0
  111. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/rubric_config.py +0 -0
  112. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rewards/validation.py +0 -0
  113. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rollout.py +0 -0
  114. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rollout_branch.py +0 -0
  115. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/rollout_planes.py +0 -0
  116. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/runtime.py +0 -0
  117. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/__init__.py +0 -0
  118. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_base.py +0 -0
  119. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose.py +0 -0
  120. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +0 -0
  121. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
  122. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
  123. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
  124. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/_sdk_ops.py +0 -0
  125. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/daytona.py +0 -0
  126. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/docker.py +0 -0
  127. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/lockdown.py +0 -0
  128. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/metadata.py +0 -0
  129. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/modal_impl.py +0 -0
  130. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/process.py +0 -0
  131. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/protocol.py +0 -0
  132. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/services.py +0 -0
  133. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/setup.py +0 -0
  134. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/snapshot.py +0 -0
  135. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sandbox/user.py +0 -0
  136. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/scenes.py +0 -0
  137. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/sdk.py +0 -0
  138. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/self_gen.py +0 -0
  139. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_eval/__init__.py +0 -0
  140. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_eval/_core.py +0 -0
  141. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_eval/gepa_export.py +0 -0
  142. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_eval/schema.py +0 -0
  143. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skill_policy.py +0 -0
  144. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/skills.py +0 -0
  145. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/__init__.py +0 -0
  146. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/config.py +0 -0
  147. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/env.py +0 -0
  148. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/paths.py +0 -0
  149. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/task.py +0 -0
  150. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/task/verifier.py +0 -0
  151. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/templates/__init__.py +0 -0
  152. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/templates/judge.py.tmpl +0 -0
  153. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/templates/test.sh.tmpl +0 -0
  154. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/__init__.py +0 -0
  155. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/huggingface.py +0 -0
  156. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/local.py +0 -0
  157. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/models.py +0 -0
  158. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/parsers.py +0 -0
  159. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/traces/task_gen.py +0 -0
  160. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/__init__.py +0 -0
  161. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/_capture.py +0 -0
  162. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/export.py +0 -0
  163. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/metrics.py +0 -0
  164. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/otel.py +0 -0
  165. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/tree.py +0 -0
  166. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/types.py +0 -0
  167. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/trajectories/viewer.py +0 -0
  168. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/src/benchflow/usage_tracking.py +0 -0
  169. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/__init__.py +0 -0
  170. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/agents/__init__.py +0 -0
  171. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/agents/test_protocol.py +0 -0
  172. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/README.md +0 -0
  173. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
  174. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/environment/docker-compose.yaml +0 -0
  175. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +0 -0
  176. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/instruction.md +0 -0
  177. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
  178. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/task.toml +0 -0
  179. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/acp_smoke/tests/test.sh +0 -0
  180. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/conformance-results.json +0 -0
  181. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/proof_multi_agent.py +0 -0
  182. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/proof_snapshot.py +0 -0
  183. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/run_conformance.py +0 -0
  184. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +0 -0
  185. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/conftest.py +0 -0
  186. {benchflow-0.5.3.dev906/tests/environment → benchflow-0.5.3.dev908/tests/continue_run}/__init__.py +0 -0
  187. {benchflow-0.5.3.dev906/tests/trajectories → benchflow-0.5.3.dev908/tests/environment}/__init__.py +0 -0
  188. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_chibench_manifest.py +0 -0
  189. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_clawsbench_manifest.py +0 -0
  190. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_manifest.py +0 -0
  191. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_manifest_env.py +0 -0
  192. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_protocol.py +0 -0
  193. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/environment/test_readiness.py +0 -0
  194. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
  195. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/instruction.md +0 -0
  196. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/solution/solve.sh +0 -0
  197. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/task.toml +0 -0
  198. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/hello-world-task/tests/test.sh +0 -0
  199. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +0 -0
  200. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/instruction.md +0 -0
  201. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/solution/solve.sh +0 -0
  202. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/task.toml +0 -0
  203. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/tests/test.sh +0 -0
  204. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/terminal-bench-smoke-task/tests/test_state.py +0 -0
  205. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_claude.sh +0 -0
  206. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_codex.sh +0 -0
  207. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_codex_custom_provider.sh +0 -0
  208. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_gemini.sh +0 -0
  209. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/test_openclaw.sh +0 -0
  210. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/traces/minimal-claude.jsonl +0 -0
  211. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/examples/traces/minimal-opentraces.jsonl +0 -0
  212. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/fixtures/mock_acp_agent.py +0 -0
  213. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
  214. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
  215. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/fixtures/mock_openai_responses_server.py +0 -0
  216. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_adapter_evidence.py +0 -0
  217. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_hosted_env_evidence.py +0 -0
  218. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_results.py +0 -0
  219. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_skillsbench_harbor_parity.py +0 -0
  220. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/check_trace_to_task_evidence.py +0 -0
  221. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/claude-agent-acp.yaml +0 -0
  222. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/codex-acp.yaml +0 -0
  223. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/gemini.yaml +0 -0
  224. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
  225. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/openclaw.yaml +0 -0
  226. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/opencode.yaml +0 -0
  227. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/openhands.yaml +0 -0
  228. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/configs/pi-acp.yaml +0 -0
  229. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/run.sh +0 -0
  230. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/run_suite.py +0 -0
  231. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/integration/suites/release.yaml +0 -0
  232. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp.py +0 -0
  233. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp_capability_advertising.py +0 -0
  234. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp_model_config_dispatch.py +0 -0
  235. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp_pinned_protocol_guard.py +0 -0
  236. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_acp_setup_failure_propagation.py +0 -0
  237. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_adapter_scripts.py +0 -0
  238. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_adapters.py +0 -0
  239. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_cli.py +0 -0
  240. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_env_resolution.py +0 -0
  241. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_gemini_defaults.py +0 -0
  242. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_idle_timeout_cli.py +0 -0
  243. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_model_decouple.py +0 -0
  244. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_registry.py +0 -0
  245. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_setup.py +0 -0
  246. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_agent_spec.py +0 -0
  247. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_base_install_imports.py +0 -0
  248. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_bedrock_thinking.py +0 -0
  249. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_branch.py +0 -0
  250. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_capture_trajectory.py +0 -0
  251. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_clawsbench_slice.py +0 -0
  252. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_cli_daytona.py +0 -0
  253. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_cli_docs_drift.py +0 -0
  254. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_cli_misc.py +0 -0
  255. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_compat_harbor_registry.py +0 -0
  256. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_config_redaction.py +0 -0
  257. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_connect_as_env.py +0 -0
  258. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_continuallearningbench_adapter.py +0 -0
  259. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_credential_env_scrub.py +0 -0
  260. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_daytona_key.py +0 -0
  261. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_no_host_paths.py +0 -0
  262. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_release_evidence.py +0 -0
  263. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_roadmap.py +0 -0
  264. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_symlink_ingestion.py +0 -0
  265. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_dashboard_sync.py +0 -0
  266. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_daytona_command_polling.py +0 -0
  267. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_daytona_litellm_runtime.py +0 -0
  268. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_daytona_status.py +0 -0
  269. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_docker_prune_scoping.py +0 -0
  270. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_docker_uploads.py +0 -0
  271. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_docs_examples.py +0 -0
  272. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eng50_capabilities.py +0 -0
  273. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_env_setup.py +0 -0
  274. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_environment_manifest_controls.py +0 -0
  275. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_filters_applied.py +0 -0
  276. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_sharding.py +0 -0
  277. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_single_task_summary.py +0 -0
  278. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_source_provenance.py +0 -0
  279. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_worker_retry.py +0 -0
  280. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_eval_zero_task_guard.py +0 -0
  281. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_evaluation_environment_manifest.py +0 -0
  282. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_exclude_tasks.py +0 -0
  283. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_experiments_status.py +0 -0
  284. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_harvey_lab_shim.py +0 -0
  285. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_hf_scores.py +0 -0
  286. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_hilbench_adapter.py +0 -0
  287. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_hosted_env.py +0 -0
  288. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_hosted_env_rollout_contract.py +0 -0
  289. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_inbound_adapter_manifest.py +0 -0
  290. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_inbound_adapters.py +0 -0
  291. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_integration_check_results.py +0 -0
  292. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_integration_run_suite.py +0 -0
  293. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_internet_policy.py +0 -0
  294. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_job.py +0 -0
  295. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_job_sequential_shared.py +0 -0
  296. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_job_sequential_shared_resume.py +0 -0
  297. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_judge_symlink_ingestion.py +0 -0
  298. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_learner_skills.py +0 -0
  299. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_learner_skills_traversal.py +0 -0
  300. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_learner_store.py +0 -0
  301. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_learner_store_persistence.py +0 -0
  302. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_config.py +0 -0
  303. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_hardening.py +0 -0
  304. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_logging.py +0 -0
  305. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_runtime.py +0 -0
  306. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_litellm_smoke.py +0 -0
  307. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_llm_judge.py +0 -0
  308. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_llm_judge_event_tags.py +0 -0
  309. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_llm_judge_verifier.py +0 -0
  310. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_memory_scorer.py +0 -0
  311. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_metrics.py +0 -0
  312. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_mock_openai_responses_server.py +0 -0
  313. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_monitor_scaffold.py +0 -0
  314. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_native_acp_usage.py +0 -0
  315. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_no_cross_provider_fallback.py +0 -0
  316. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_notification_order_real.py +0 -0
  317. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_oracle.py +0 -0
  318. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_oracle_chokepoint.py +0 -0
  319. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_paths_safe.py +0 -0
  320. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_paths_symlink_helpers.py +0 -0
  321. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_pi_acp_launcher.py +0 -0
  322. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_process.py +0 -0
  323. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_provider_auth_detection.py +0 -0
  324. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_providers.py +0 -0
  325. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_reexport.py +0 -0
  326. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_registry_invariants.py +0 -0
  327. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_release_version.py +0 -0
  328. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_resolve_env_helpers.py +0 -0
  329. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_reward_node.py +0 -0
  330. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_reward_unified_contract.py +0 -0
  331. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rewards.py +0 -0
  332. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rewards_jsonl.py +0 -0
  333. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_architecture.py +0 -0
  334. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_branch.py +0 -0
  335. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_config_path_coercion.py +0 -0
  336. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_environment.py +0 -0
  337. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_import_no_side_effects.py +0 -0
  338. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_on_ask_user_wiring.py +0 -0
  339. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_probe_sandbox_health.py +0 -0
  340. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rollout_upload.py +0 -0
  341. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_rubric_config.py +0 -0
  342. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_runtime.py +0 -0
  343. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_runtime_config_wired.py +0 -0
  344. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_runtime_live_sandbox.py +0 -0
  345. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox.py +0 -0
  346. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_exec_secret_handling.py +0 -0
  347. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_hardening.py +0 -0
  348. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_isolation_copy_traversal.py +0 -0
  349. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_multi_service.py +0 -0
  350. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_protocol.py +0 -0
  351. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_setup.py +0 -0
  352. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_snapshot_contract.py +0 -0
  353. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_upload_symlink.py +0 -0
  354. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sandbox_verifier_workspace.py +0 -0
  355. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scene.py +0 -0
  356. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scene_outbox_trial.py +0 -0
  357. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scene_parallel_group.py +0 -0
  358. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scene_result_aggregation.py +0 -0
  359. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_scoring.py +0 -0
  360. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sdk_internals.py +0 -0
  361. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_sdk_lockdown.py +0 -0
  362. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_self_gen_cli.py +0 -0
  363. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_self_gen_export_error_channel.py +0 -0
  364. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_self_gen_export_failures.py +0 -0
  365. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_self_gen_orchestration.py +0 -0
  366. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_session_request_permission_dispatch.py +0 -0
  367. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval.py +0 -0
  368. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval_dryrun.py +0 -0
  369. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval_integration.py +0 -0
  370. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval_sweep.py +0 -0
  371. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_eval_traversal.py +0 -0
  372. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_invocation_artifacts.py +0 -0
  373. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skill_policy.py +0 -0
  374. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skills.py +0 -0
  375. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skills_dir_agent_home_link.py +0 -0
  376. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skillsbench_harbor_parity.py +0 -0
  377. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skillsbench_harbor_run_suite.py +0 -0
  378. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_skillsbench_publish_scrub.py +0 -0
  379. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_smoke.py +0 -0
  380. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_subscription_auth.py +0 -0
  381. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_task_check_eval_consistency.py +0 -0
  382. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_task_config.py +0 -0
  383. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_task_download.py +0 -0
  384. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_tasks.py +0 -0
  385. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_token_usage_normalization.py +0 -0
  386. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trace_import_cli.py +0 -0
  387. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trace_task_gen_traversal.py +0 -0
  388. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trace_to_task_evidence.py +0 -0
  389. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_traces_huggingface.py +0 -0
  390. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_traces_parsers.py +0 -0
  391. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_traces_task_gen.py +0 -0
  392. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_train_mode_artifact_emission.py +0 -0
  393. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trajectory_integration.py +0 -0
  394. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trajectory_streaming.py +0 -0
  395. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trial_agent_timeout_verify.py +0 -0
  396. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trial_install_agent_timeout.py +0 -0
  397. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_trial_litellm_runtime.py +0 -0
  398. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_usage_litellm.py +0 -0
  399. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_usage_required.py +0 -0
  400. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_usage_tracking.py +0 -0
  401. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_user.py +0 -0
  402. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_verifier_multi_container.py +0 -0
  403. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_verifier_output.py +0 -0
  404. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_verifier_output_freshness.py +0 -0
  405. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_verify.py +0 -0
  406. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_workflow_action_pinning.py +0 -0
  407. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/test_yaml_config.py +0 -0
  408. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_export.py +0 -0
  409. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_export_nan_handling.py +0 -0
  410. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_redaction.py +0 -0
  411. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_step_granularity.py +0 -0
  412. {benchflow-0.5.3.dev906 → benchflow-0.5.3.dev908}/tests/trajectories/test_tree.py +0 -0
@@ -2,6 +2,16 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ### Added
6
+
7
+ - **`benchflow continue <run-folder>`** — resume a previous, unfinished
8
+ (timed-out) `openhands` run to completion. A standalone tool (it does not
9
+ touch the normal run path) that reconstructs the run's exact workspace and
10
+ agent memory from the recorded `llm_trajectory.jsonl` via record-replay,
11
+ then continues with the live model — no injected prompt — and writes a new
12
+ HF-compatible folder with `continued_from` provenance. See
13
+ [`docs/continue-runs.md`](docs/continue-runs.md).
14
+
5
15
  ### Changed
6
16
 
7
17
  - Document the public vs internal preview install/upgrade command matrix,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchflow
3
- Version: 0.5.3.dev906
3
+ Version: 0.5.3.dev908
4
4
  Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
5
5
  Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
6
6
  Project-URL: Repository, https://github.com/benchflow-ai/benchflow
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchflow"
3
- version = "0.5.3.dev906"
3
+ version = "0.5.3.dev908"
4
4
  description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -0,0 +1,236 @@
1
+ """``benchflow continue`` — resume a timed-out run to completion.
2
+
3
+ Standalone command (does not touch the normal eval/run path): reconstruct a
4
+ previous unfinished ``openhands`` run's exact env + memory from its recorded
5
+ trajectory via record-replay, continue it live, and write a new HF-compatible
6
+ folder linked to the parent. See :mod:`benchflow.continue_run`.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ import logging
13
+ import os
14
+ from pathlib import Path
15
+ from typing import Annotated
16
+
17
+ import typer
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def _load_env_defaults() -> None:
    """Fill ``os.environ`` with dotenv values for variables not already set.

    Every key/value pair returned by ``load_dotenv_env()`` is applied as a
    default only: a variable that is already present in the process
    environment keeps its current value.
    """
    # Imported lazily so importing this CLI module does not pull in the loader.
    from benchflow._dotenv import load_dotenv_env

    dotenv_values = load_dotenv_env()
    for name, fallback in dotenv_values.items():
        if name not in os.environ:
            os.environ[name] = fallback
27
+
28
+
29
def register_continue(app: typer.Typer) -> None:
    """Attach the ``continue`` and ``continue-batch`` commands to ``app``.

    Both commands are defined as closures and registered through
    ``app.command``. Heavy ``benchflow.continue_run`` imports happen inside the
    command bodies, so registering the commands does not import the
    continuation machinery.

    Args:
        app: The Typer application the two commands are registered on.
    """

    @app.command("continue")
    def continue_cmd(
        folder: Annotated[
            Path,
            typer.Argument(
                help="Original run output folder (contains config.json + "
                "trajectory/llm_trajectory.jsonl)."
            ),
        ],
        tasks_dir: Annotated[
            Path | None,
            typer.Option(
                "--tasks-dir",
                help="Directory holding the task source (instruction + verifier). "
                "Required unless the recorded task_path still exists on disk.",
            ),
        ] = None,
        model: Annotated[
            str | None,
            typer.Option(
                "--model",
                help="Override the live-continuation model (default: the "
                "original run's model). Tests use gemini-3.1-flash-lite-preview.",
            ),
        ] = None,
        timeout: Annotated[
            int | None,
            typer.Option(
                "--timeout",
                help="Wall-clock budget for the continuation, in seconds "
                "(default: the original run's timeout).",
            ),
        ] = None,
        output: Annotated[
            Path | None,
            typer.Option(
                "--output",
                help="Output jobs dir for the new run (default: "
                "<orig-parent>/continued).",
            ),
        ] = None,
        require_timeout: Annotated[
            bool,
            typer.Option(
                "--require-timeout/--no-require-timeout",
                help="Refuse runs whose recorded status is not a timeout.",
            ),
        ] = False,
        strict_divergence: Annotated[
            bool,
            typer.Option(
                "--strict-divergence/--no-strict-divergence",
                help="Abort if replay leaves the original rails (message-count "
                "mismatch) instead of warning.",
            ),
        ] = False,
        replay_only: Annotated[
            bool,
            typer.Option(
                "--replay-only/--no-replay-only",
                help="Rebuild the env via replay and stop at the cut-point "
                "(no live model needed) — useful for testing.",
            ),
        ] = False,
        proxy_mode: Annotated[
            str,
            typer.Option(
                "--proxy-mode",
                help=(
                    "Replay proxy placement: auto, host, or sandbox. Auto uses "
                    "sandbox-local replay for Daytona/Modal and host replay for Docker."
                ),
            ),
        ] = "auto",
    ) -> None:
        """Resume a previous unfinished (timed-out) openhands run to completion."""
        from benchflow.continue_run.orchestrator import continue_run
        from benchflow.continue_run.run_folder import RunFolderError

        _load_env_defaults()

        try:
            result = asyncio.run(
                continue_run(
                    folder,
                    tasks_dir=tasks_dir,
                    model=model,
                    timeout=timeout,
                    output_dir=output,
                    require_timeout=require_timeout,
                    strict_divergence=strict_divergence,
                    replay_only=replay_only,
                    proxy_mode=proxy_mode,
                )
            )
        except RunFolderError as exc:
            # An unusable run folder is a user-facing error: print it on
            # stderr and exit 1 instead of showing a traceback.
            typer.secho(f"benchflow continue: {exc}", fg=typer.colors.RED, err=True)
            raise typer.Exit(1) from exc

        typer.secho(
            f"\n✓ continued run written to {result.rollout_dir}", fg=typer.colors.GREEN
        )
        typer.echo(
            f" replayed {result.n_recorded} recorded turn(s); "
            f"{result.n_live} live turn(s); {result.divergences} divergence(s)"
        )
        if result.rewards is not None:
            typer.echo(f" rewards: {result.rewards}")
        if result.error:
            typer.secho(f" agent error: {result.error}", fg=typer.colors.YELLOW)

    @app.command("continue-batch")
    def continue_batch_cmd(
        root: Annotated[
            Path,
            typer.Argument(
                help=(
                    "Run folder or directory tree containing timeout run folders "
                    "(config.json + trajectory/llm_trajectory.jsonl)."
                )
            ),
        ],
        tasks_dir: Annotated[
            Path | None,
            typer.Option(
                "--tasks-dir",
                help="Directory holding task sources; required unless recorded task_path exists.",
            ),
        ] = None,
        model: Annotated[
            str | None,
            typer.Option("--model", help="Override live-continuation model."),
        ] = None,
        timeout: Annotated[
            int | None,
            typer.Option("--timeout", help="Wall-clock budget per continuation."),
        ] = None,
        output: Annotated[
            Path | None,
            typer.Option("--output", help="Output jobs dir for continued runs."),
        ] = None,
        concurrency: Annotated[
            int,
            typer.Option(
                "--concurrency",
                # continue_batch raises ValueError for values below 1; reject
                # them at parse time so the user gets a usage error instead of
                # a traceback.
                min=1,
                help="Maximum number of continuation runs in flight.",
            ),
        ] = 100,
        limit: Annotated[
            int | None,
            typer.Option(
                "--limit",
                # A non-positive limit is meaningless here; reject it rather
                # than let it be read as "continue one run".
                min=1,
                help="Limit discovered timeout folders.",
            ),
        ] = None,
        strict_divergence: Annotated[
            bool,
            typer.Option(
                "--strict-divergence/--no-strict-divergence",
                help="Abort a run if replay leaves the original rails.",
            ),
        ] = False,
        proxy_mode: Annotated[
            str,
            typer.Option(
                "--proxy-mode",
                help=(
                    "Replay proxy placement: auto, host, or sandbox. For PR5 "
                    "Daytona runs, use the default auto or sandbox."
                ),
            ),
        ] = "auto",
    ) -> None:
        """Continue all timed-out OpenHands runs under a directory tree."""
        import json

        from benchflow.continue_run.batch import (
            continue_batch,
            discover_timeout_run_folders,
            summarize_batch,
        )

        _load_env_defaults()
        folders = discover_timeout_run_folders(root, limit=limit)
        if not folders:
            # Nothing to do is not treated as a failure: warn and exit 0.
            typer.secho("No timeout run folders found.", fg=typer.colors.YELLOW)
            return

        typer.echo(
            f"Continuing {len(folders)} timeout run(s) with concurrency={concurrency}"
        )
        results = asyncio.run(
            continue_batch(
                folders,
                concurrency=concurrency,
                tasks_dir=tasks_dir,
                model=model,
                timeout=timeout,
                output_dir=output,
                # Batch mode always restricts itself to timed-out runs.
                require_timeout=True,
                strict_divergence=strict_divergence,
                proxy_mode=proxy_mode,
            )
        )
        summary = summarize_batch(results)
        typer.echo(json.dumps(summary, indent=2))
        # Exit non-zero when at least one continuation failed.
        if summary["failed"]:
            raise typer.Exit(1)
@@ -21,6 +21,7 @@ from benchflow._utils.config import (
21
21
  normalize_sandbox_user,
22
22
  )
23
23
  from benchflow.agents.registry import parse_agent_spec
24
+ from benchflow.cli.continue_cmd import register_continue
24
25
  from benchflow.cli.trace_import import register_tasks_generate
25
26
  from benchflow.evaluation import DEFAULT_AGENT, effective_model
26
27
  from benchflow.skill_policy import SKILL_MODE_NO_SKILL
@@ -40,6 +41,9 @@ app = typer.Typer(
40
41
  no_args_is_help=True,
41
42
  )
42
43
 
44
+ # Standalone `benchflow continue <orig-run-folder>` — resume a timed-out run.
45
+ register_continue(app)
46
+
43
47
 
44
48
  def _version_callback(value: bool) -> None:
45
49
  if value:
@@ -0,0 +1,35 @@
1
+ """Resume a previous, unfinished (timed-out) agent run to completion.
2
+
3
+ ``benchflow continue <orig-run-output-folder>`` is a *standalone* tool that does
4
+ **not** touch benchflow's normal run path. It reconstructs a timed-out run's
5
+ exact workspace and agent memory from the recorded trajectory (record-replay),
6
+ then lets the agent continue as if the timeout had simply been larger, and
7
+ writes a new HF-compatible result folder linked to the parent.
8
+
9
+ The mechanism (agreed design):
10
+
11
+ 1. Load the original run folder (``config.json`` + ``trajectory/llm_trajectory.jsonl``).
12
+ 2. Boot a fresh *pristine* sandbox from the same base image.
13
+ 3. Stand up a :class:`~benchflow.continue_run.replay_proxy.ReplayProxy` that
14
+ OpenHands talks to via ``LLM_BASE_URL``. It serves the recorded LLM
15
+ responses **in order**, so the agent re-executes its own past decisions for
16
+ real — rebuilding the byte-exact workspace and its exact internal state.
17
+ 4. When the recorded responses run out (the timeout cut-point), the proxy flips
18
+ to the **live** model and the agent continues — no injected prompt.
19
+ 5. Re-verify and write a new folder with ``continued_from`` provenance.
20
+
21
+ Only the ``openhands`` agent is supported for now (the LLM-proxy seam this relies
22
+ on is wired for openhands via ``LLM_BASE_URL``).
23
+ """
24
+
25
+ from benchflow.continue_run.run_folder import (
26
+ RunFolder,
27
+ RunFolderError,
28
+ load_run_folder,
29
+ )
30
+
31
+ __all__ = [
32
+ "RunFolder",
33
+ "RunFolderError",
34
+ "load_run_folder",
35
+ ]
@@ -0,0 +1,125 @@
1
+ """Batch orchestration for continuing many timed-out runs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from collections.abc import Awaitable, Callable
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from benchflow.continue_run.orchestrator import ContinueResult, continue_run
12
+ from benchflow.continue_run.run_folder import RunFolderError, load_run_folder
13
+
14
+ ContinueRunner = Callable[..., Awaitable[ContinueResult]]
15
+
16
+
17
@dataclass(frozen=True)
class BatchContinueResult:
    """Immutable outcome of continuing a single source folder within a batch."""

    # Source run folder this outcome belongs to.
    folder: Path
    # True only when the continuation finished without a reported error.
    ok: bool
    # Result object from the continuation runner, when one was produced.
    continued: ContinueResult | None = None
    # Failure description; left as None when nothing went wrong.
    error: str | None = None
25
+
26
+
27
def discover_timeout_run_folders(
    root: str | Path, *, limit: int | None = None
) -> list[Path]:
    """Find OpenHands timeout run folders below ``root``.

    Discovery is intentionally artifact-based: every directory that holds a
    ``config.json`` (``root`` itself included) is a candidate, and a candidate
    is kept only if ``load_run_folder(folder, require_timeout=True)`` accepts
    it. Candidates for which it raises ``RunFolderError`` are skipped silently.

    Args:
        root: A run folder or a directory tree to search; ``~`` is expanded.
        limit: Maximum number of folders to return. ``None`` means no limit;
            zero or a negative value yields an empty list.

    Returns:
        Accepted folders in sorted path order, each physical directory at most
        once (duplicates are detected on the resolved path).
    """
    # Checking the limit only after an append would hand back one folder for
    # ``limit=0``; settle the "nothing requested" case before any filesystem work.
    if limit is not None and limit <= 0:
        return []

    root_path = Path(root).expanduser()
    candidates = [root_path] if (root_path / "config.json").is_file() else []
    candidates.extend(path.parent for path in root_path.rglob("config.json"))

    folders: list[Path] = []
    seen: set[Path] = set()
    for folder in sorted(candidates):
        # Deduplicate on the resolved path so the same directory reached via
        # different spellings (or a symlink) is validated only once.
        resolved = folder.resolve()
        if resolved in seen:
            continue
        seen.add(resolved)
        try:
            load_run_folder(folder, require_timeout=True)
        except RunFolderError:
            continue
        folders.append(folder)
        if limit is not None and len(folders) >= limit:
            break
    return folders
55
+
56
+
57
async def continue_batch(
    folders: list[Path],
    *,
    concurrency: int,
    tasks_dir: str | Path | None,
    model: str | None,
    timeout: int | None,
    output_dir: str | Path | None,
    require_timeout: bool = True,
    strict_divergence: bool = False,
    proxy_mode: str = "auto",
    runner: ContinueRunner = continue_run,
) -> list[BatchContinueResult]:
    """Run ``benchflow continue`` over folders with rolling concurrency.

    One task is created per folder; a semaphore keeps at most ``concurrency``
    of them inside ``runner`` at any time. A failure in one folder never aborts
    the batch: it is recorded on that folder's result instead.

    Args:
        folders: Source run folders to continue.
        concurrency: Maximum number of continuations in flight (must be >= 1).
        tasks_dir: Forwarded to ``runner`` unchanged.
        model: Forwarded to ``runner`` unchanged.
        timeout: Forwarded to ``runner`` unchanged.
        output_dir: Forwarded to ``runner`` unchanged.
        require_timeout: Forwarded to ``runner`` unchanged.
        strict_divergence: Forwarded to ``runner`` unchanged.
        proxy_mode: Forwarded to ``runner`` unchanged.
        runner: Coroutine function that performs a single continuation;
            replaceable (e.g. with a stub) for testing.

    Returns:
        One ``BatchContinueResult`` per input folder, in input order.

    Raises:
        ValueError: If ``concurrency`` is less than 1.
    """
    if concurrency < 1:
        raise ValueError("concurrency must be >= 1")
    semaphore = asyncio.Semaphore(concurrency)

    async def _one(folder: Path) -> BatchContinueResult:
        """Continue one folder and convert any failure into a result object."""
        async with semaphore:
            try:
                result = await runner(
                    folder,
                    tasks_dir=tasks_dir,
                    model=model,
                    timeout=timeout,
                    output_dir=output_dir,
                    require_timeout=require_timeout,
                    strict_divergence=strict_divergence,
                    proxy_mode=proxy_mode,
                )
            except Exception as exc:
                # Deliberately broad: one broken run must not take down the
                # rest of the batch. Message-less exceptions (for example a
                # bare TimeoutError()) stringify to "", so fall back to repr()
                # to keep the recorded error informative.
                return BatchContinueResult(
                    folder=folder, ok=False, error=str(exc) or repr(exc)
                )
            if result.error:
                # The runner completed but reported an error: keep its result
                # so callers can still locate the output, and mark it failed.
                return BatchContinueResult(
                    folder=folder,
                    ok=False,
                    continued=result,
                    error=result.error,
                )
            return BatchContinueResult(folder=folder, ok=True, continued=result)

    return list(await asyncio.gather(*(_one(folder) for folder in folders)))
100
+
101
+
102
def summarize_batch(results: list[BatchContinueResult]) -> dict[str, Any]:
    """Condense batch results into a JSON-serializable summary.

    The returned mapping holds the ``total`` / ``succeeded`` / ``failed``
    counts, ``outputs`` (the rollout directory of every successful result that
    carries a continuation), and ``errors`` (one ``folder`` / ``output`` /
    ``error`` record per failed result, with ``output`` set to ``None`` when
    no continuation was produced).
    """
    outputs: list[str] = []
    errors: list[dict[str, Any]] = []
    succeeded = 0

    for entry in results:
        rollout = (
            None if entry.continued is None else str(entry.continued.rollout_dir)
        )
        if entry.ok:
            succeeded += 1
            if rollout is not None:
                outputs.append(rollout)
        else:
            errors.append(
                {
                    "folder": str(entry.folder),
                    "output": rollout,
                    "error": entry.error,
                }
            )

    return {
        "total": len(results),
        "succeeded": succeeded,
        "failed": len(errors),
        "outputs": outputs,
        "errors": errors,
    }