benchflow 0.3.4__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. {benchflow-0.3.4 → benchflow-0.4.0}/.gitignore +1 -0
  2. {benchflow-0.3.4 → benchflow-0.4.0}/PKG-INFO +10 -5
  3. {benchflow-0.3.4 → benchflow-0.4.0}/README.md +2 -2
  4. {benchflow-0.3.4 → benchflow-0.4.0}/pyproject.toml +29 -3
  5. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/__init__.py +39 -77
  6. benchflow-0.4.0/src/benchflow/_dotenv.py +47 -0
  7. benchflow-0.4.0/src/benchflow/_utils/__init__.py +11 -0
  8. benchflow-0.3.4/src/benchflow/task_download.py → benchflow-0.4.0/src/benchflow/_utils/benchmark_repos.py +5 -1
  9. benchflow-0.4.0/src/benchflow/_utils/config.py +22 -0
  10. benchflow-0.3.4/src/benchflow/trial_yaml.py → benchflow-0.4.0/src/benchflow/_utils/yaml_loader.py +18 -18
  11. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/container_transport.py +1 -1
  12. benchflow-0.3.4/src/benchflow/_acp_run.py → benchflow-0.4.0/src/benchflow/acp/runtime.py +9 -9
  13. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/transport.py +1 -1
  14. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/adapters/ors.py +28 -8
  15. benchflow-0.3.4/src/benchflow/_credentials.py → benchflow-0.4.0/src/benchflow/agents/credentials.py +31 -6
  16. benchflow-0.3.4/src/benchflow/_agent_env.py → benchflow-0.4.0/src/benchflow/agents/env.py +125 -42
  17. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/harvey_lab_acp_shim.py +144 -2
  18. benchflow-0.3.4/src/benchflow/_agent_setup.py → benchflow-0.4.0/src/benchflow/agents/install.py +3 -3
  19. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/registry.py +59 -23
  20. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/cli/main.py +401 -102
  21. benchflow-0.4.0/src/benchflow/cli/trace_import.py +383 -0
  22. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/evaluation.py +18 -17
  23. benchflow-0.4.0/src/benchflow/experimental/__init__.py +1 -0
  24. {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/experimental}/mcp/hooks.py +1 -1
  25. {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/experimental}/mcp/reviewer_server.py +1 -1
  26. benchflow-0.4.0/src/benchflow/hosted_env.py +408 -0
  27. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/metrics.py +1 -1
  28. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/models.py +3 -3
  29. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/builtins.py +6 -10
  30. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/file_readers.py +1 -3
  31. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/llm.py +2 -6
  32. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/rubric_config.py +3 -3
  33. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rollout.py +192 -113
  34. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/runtime.py +39 -32
  35. benchflow-0.4.0/src/benchflow/sandbox/__init__.py +37 -0
  36. benchflow-0.4.0/src/benchflow/sandbox/_base.py +189 -0
  37. benchflow-0.4.0/src/benchflow/sandbox/_compose.py +9 -0
  38. benchflow-0.4.0/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +11 -0
  39. benchflow-0.4.0/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +6 -0
  40. benchflow-0.4.0/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +3 -0
  41. benchflow-0.4.0/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +4 -0
  42. benchflow-0.4.0/src/benchflow/sandbox/daytona.py +1077 -0
  43. benchflow-0.4.0/src/benchflow/sandbox/docker.py +465 -0
  44. benchflow-0.3.4/src/benchflow/_sandbox.py → benchflow-0.4.0/src/benchflow/sandbox/lockdown.py +1 -1
  45. benchflow-0.4.0/src/benchflow/sandbox/modal_impl.py +368 -0
  46. {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/sandbox}/process.py +5 -5
  47. benchflow-0.3.4/src/benchflow/_env_setup.py → benchflow-0.4.0/src/benchflow/sandbox/setup.py +72 -50
  48. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/sdk.py +17 -14
  49. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/self_gen.py +11 -11
  50. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/skill_eval.py +97 -33
  51. benchflow-0.4.0/src/benchflow/task/__init__.py +69 -0
  52. benchflow-0.4.0/src/benchflow/task/config.py +235 -0
  53. benchflow-0.4.0/src/benchflow/task/env.py +41 -0
  54. benchflow-0.4.0/src/benchflow/task/paths.py +171 -0
  55. benchflow-0.4.0/src/benchflow/task/task.py +49 -0
  56. benchflow-0.4.0/src/benchflow/task/verifier.py +166 -0
  57. benchflow-0.4.0/src/benchflow/traces/__init__.py +28 -0
  58. benchflow-0.4.0/src/benchflow/traces/huggingface.py +540 -0
  59. benchflow-0.4.0/src/benchflow/traces/local.py +117 -0
  60. benchflow-0.4.0/src/benchflow/traces/models.py +108 -0
  61. benchflow-0.4.0/src/benchflow/traces/parsers.py +485 -0
  62. benchflow-0.4.0/src/benchflow/traces/task_gen.py +562 -0
  63. {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/trajectories}/viewer.py +18 -16
  64. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/proof_multi_agent.py +22 -20
  65. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/proof_snapshot.py +6 -7
  66. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/run_conformance.py +6 -6
  67. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conftest.py +18 -12
  68. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_codex.sh +16 -7
  69. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_codex_custom_provider.sh +1 -1
  70. benchflow-0.4.0/tests/integration/check_adapter_evidence.py +373 -0
  71. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/check_results.py +26 -6
  72. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/run.sh +15 -8
  73. benchflow-0.4.0/tests/integration/run_suite.py +439 -0
  74. benchflow-0.4.0/tests/integration/suites/release.yaml +262 -0
  75. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_acp.py +21 -21
  76. benchflow-0.4.0/tests/test_adapter_scripts.py +33 -0
  77. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_adapters.py +19 -0
  78. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_agent_registry.py +1 -1
  79. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_agent_setup.py +1 -1
  80. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_agent_spec.py +9 -8
  81. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_capture_trajectory.py +1 -1
  82. benchflow-0.4.0/tests/test_cli_daytona.py +89 -0
  83. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_connect_as_env.py +51 -8
  84. benchflow-0.4.0/tests/test_docs_examples.py +58 -0
  85. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_eng50_capabilities.py +50 -48
  86. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_env_setup.py +39 -9
  87. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_exclude_tasks.py +9 -9
  88. benchflow-0.4.0/tests/test_harvey_lab_shim.py +46 -0
  89. benchflow-0.4.0/tests/test_hosted_env.py +257 -0
  90. benchflow-0.4.0/tests/test_integration_check_results.py +110 -0
  91. benchflow-0.4.0/tests/test_integration_run_suite.py +261 -0
  92. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_internet_policy.py +19 -19
  93. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_job.py +17 -15
  94. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_llm_judge.py +7 -21
  95. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_notification_order_real.py +1 -1
  96. benchflow-0.4.0/tests/test_oracle_chokepoint.py +469 -0
  97. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_process.py +3 -3
  98. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_provider_runtime.py +2 -2
  99. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_reexport.py +12 -14
  100. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_resolve_env_helpers.py +67 -3
  101. benchflow-0.4.0/tests/test_rollout_upload.py +70 -0
  102. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_runtime.py +23 -4
  103. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sandbox_hardening.py +92 -68
  104. benchflow-0.4.0/tests/test_sandbox_protocol.py +225 -0
  105. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sandbox_setup.py +1 -1
  106. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sandbox_verifier_workspace.py +10 -10
  107. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_scene.py +15 -15
  108. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_scene_outbox_trial.py +52 -15
  109. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_scoring.py +2 -2
  110. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sdk_internals.py +120 -22
  111. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sdk_lockdown.py +10 -10
  112. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_self_gen_orchestration.py +15 -15
  113. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_skill_eval.py +155 -1
  114. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_skill_eval_dryrun.py +37 -6
  115. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_skill_eval_integration.py +3 -0
  116. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_smoke.py +5 -5
  117. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_subscription_auth.py +164 -2
  118. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_task_download.py +10 -9
  119. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_tasks.py +1 -1
  120. benchflow-0.4.0/tests/test_trace_import_cli.py +61 -0
  121. benchflow-0.4.0/tests/test_traces_parsers.py +474 -0
  122. benchflow-0.4.0/tests/test_traces_task_gen.py +514 -0
  123. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_trajectory_integration.py +4 -4
  124. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_trial_agent_timeout_verify.py +7 -7
  125. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_trial_bedrock_proxy.py +8 -8
  126. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_trial_install_agent_timeout.py +15 -10
  127. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_user.py +23 -20
  128. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_verify.py +36 -14
  129. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_yaml_config.py +60 -24
  130. benchflow-0.3.4/src/benchflow/job.py +0 -29
  131. benchflow-0.3.4/src/benchflow/sandbox/__init__.py +0 -9
  132. benchflow-0.3.4/src/benchflow/sandbox/daytona.py +0 -74
  133. benchflow-0.3.4/src/benchflow/sandbox/docker.py +0 -74
  134. benchflow-0.3.4/src/benchflow/trial.py +0 -39
  135. benchflow-0.3.4/tests/test_oracle_chokepoint.py +0 -224
  136. benchflow-0.3.4/tests/test_sandbox_protocol.py +0 -250
  137. {benchflow-0.3.4 → benchflow-0.4.0}/CHANGELOG.md +0 -0
  138. {benchflow-0.3.4 → benchflow-0.4.0}/LICENSE +0 -0
  139. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/_run.py +0 -0
  140. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/_types.py +0 -0
  141. /benchflow-0.3.4/src/benchflow/_scoring.py → /benchflow-0.4.0/src/benchflow/_utils/scoring.py +0 -0
  142. /benchflow-0.3.4/src/benchflow/tasks.py → /benchflow-0.4.0/src/benchflow/_utils/task_authoring.py +0 -0
  143. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/__init__.py +0 -0
  144. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/client.py +0 -0
  145. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/session.py +0 -0
  146. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/acp/types.py +0 -0
  147. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/adapters/__init__.py +0 -0
  148. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/adapters/inspect_ai.py +0 -0
  149. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/__init__.py +0 -0
  150. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
  151. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/pi_acp_launcher.py +0 -0
  152. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/agents/providers.py +0 -0
  153. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/cli/__init__.py +0 -0
  154. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/demo_task/environment/Dockerfile +0 -0
  155. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/demo_task/instruction.md +0 -0
  156. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/demo_task/task.toml +0 -0
  157. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/demo_task/tests/test.sh +0 -0
  158. {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/experimental}/mcp/__init__.py +0 -0
  159. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/providers/__init__.py +0 -0
  160. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/providers/bedrock_proxy.py +0 -0
  161. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/providers/bedrock_runtime.py +0 -0
  162. /benchflow-0.3.4/src/benchflow/_provider_runtime.py → /benchflow-0.4.0/src/benchflow/providers/runtime.py +0 -0
  163. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/py.typed +0 -0
  164. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/README.md +0 -0
  165. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/__init__.py +0 -0
  166. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/events.py +0 -0
  167. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/protocol.py +0 -0
  168. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/rewards/rubric.py +0 -0
  169. /benchflow-0.3.4/src/benchflow/_daytona_patches.py → /benchflow-0.4.0/src/benchflow/sandbox/_sdk_ops.py +0 -0
  170. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/sandbox/protocol.py +0 -0
  171. /benchflow-0.3.4/src/benchflow/environments.py → /benchflow-0.4.0/src/benchflow/sandbox/services.py +0 -0
  172. /benchflow-0.3.4/src/benchflow/_snapshot.py → /benchflow-0.4.0/src/benchflow/sandbox/snapshot.py +0 -0
  173. {benchflow-0.3.4/src/benchflow → benchflow-0.4.0/src/benchflow/sandbox}/user.py +0 -0
  174. /benchflow-0.3.4/src/benchflow/_scene.py → /benchflow-0.4.0/src/benchflow/scenes.py +0 -0
  175. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/skills.py +0 -0
  176. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/templates/__init__.py +0 -0
  177. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/templates/judge.py.tmpl +0 -0
  178. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/templates/test.sh.tmpl +0 -0
  179. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/trajectories/__init__.py +0 -0
  180. /benchflow-0.3.4/src/benchflow/_trajectory.py → /benchflow-0.4.0/src/benchflow/trajectories/_capture.py +0 -0
  181. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/trajectories/otel.py +0 -0
  182. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/trajectories/proxy.py +0 -0
  183. {benchflow-0.3.4 → benchflow-0.4.0}/src/benchflow/trajectories/types.py +0 -0
  184. {benchflow-0.3.4 → benchflow-0.4.0}/tests/__init__.py +0 -0
  185. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/README.md +0 -0
  186. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
  187. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/instruction.md +0 -0
  188. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
  189. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/task.toml +0 -0
  190. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/acp_smoke/tests/test.sh +0 -0
  191. {benchflow-0.3.4 → benchflow-0.4.0}/tests/conformance/conformance-results.json +0 -0
  192. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
  193. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/instruction.md +0 -0
  194. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/solution/solve.sh +0 -0
  195. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/task.toml +0 -0
  196. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/hello-world-task/tests/test.sh +0 -0
  197. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_claude.sh +0 -0
  198. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_gemini.sh +0 -0
  199. {benchflow-0.3.4 → benchflow-0.4.0}/tests/examples/test_openclaw.sh +0 -0
  200. {benchflow-0.3.4 → benchflow-0.4.0}/tests/fixtures/mock_acp_agent.py +0 -0
  201. {benchflow-0.3.4 → benchflow-0.4.0}/tests/fixtures/mock_acp_agent_interleaved.py +0 -0
  202. {benchflow-0.3.4 → benchflow-0.4.0}/tests/fixtures/mock_acp_agent_multi_turn.py +0 -0
  203. {benchflow-0.3.4 → benchflow-0.4.0}/tests/fixtures/mock_openai_responses_server.py +0 -0
  204. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/claude-agent-acp.yaml +0 -0
  205. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/codex-acp.yaml +0 -0
  206. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/gemini.yaml +0 -0
  207. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/harvey-lab-harness.yaml +0 -0
  208. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/openclaw.yaml +0 -0
  209. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/opencode.yaml +0 -0
  210. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/openhands.yaml +0 -0
  211. {benchflow-0.3.4 → benchflow-0.4.0}/tests/integration/configs/pi-acp.yaml +0 -0
  212. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_agent_model_decouple.py +0 -0
  213. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_atif_trajectory.py +0 -0
  214. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_bedrock_proxy.py +0 -0
  215. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_bedrock_runtime.py +0 -0
  216. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_metrics.py +0 -0
  217. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_mock_openai_responses_server.py +0 -0
  218. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_oracle.py +0 -0
  219. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_pi_acp_launcher.py +0 -0
  220. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_providers.py +0 -0
  221. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_registry_invariants.py +0 -0
  222. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_rewards.py +0 -0
  223. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_rewards_jsonl.py +0 -0
  224. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_rubric_config.py +0 -0
  225. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_sandbox.py +0 -0
  226. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_self_gen_cli.py +0 -0
  227. {benchflow-0.3.4 → benchflow-0.4.0}/tests/test_skills.py +0 -0
@@ -184,3 +184,4 @@ tmp/
184
184
  tests/.smoke-jobs/
185
185
  context/
186
186
  tutorials/
187
+ .playwright-mcp/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchflow
3
- Version: 0.3.4
3
+ Version: 0.4.0
4
4
  Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
5
5
  Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
6
6
  Project-URL: Repository, https://github.com/benchflow-ai/benchflow
@@ -11,7 +11,7 @@ Author-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@g
11
11
  Maintainer-email: Xiangyi Li <xiangyi@benchflow.ai>, Kyoung Whan Choe <choe.kyoung@gmail.com>
12
12
  License: Apache-2.0
13
13
  License-File: LICENSE
14
- Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench,terminal-bench
14
+ Keywords: acp,agent-evaluation,benchmark,llm-agents,multi-turn,skillsbench
15
15
  Classifier: License :: OSI Approved :: Apache Software License
16
16
  Classifier: Operating System :: OS Independent
17
17
  Classifier: Programming Language :: Python :: 3
@@ -19,7 +19,6 @@ Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Programming Language :: Python :: 3.13
20
20
  Requires-Python: >=3.12
21
21
  Requires-Dist: anyio>=4.0
22
- Requires-Dist: harbor==0.3.0
23
22
  Requires-Dist: httpx>=0.27.0
24
23
  Requires-Dist: pydantic>=2.0
25
24
  Requires-Dist: pyyaml>=6.0
@@ -33,6 +32,12 @@ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
33
32
  Requires-Dist: pytest>=9.0.3; extra == 'dev'
34
33
  Requires-Dist: ruff>=0.7.0; extra == 'dev'
35
34
  Requires-Dist: ty>=0.0.1a1; extra == 'dev'
35
+ Provides-Extra: sandbox-daytona
36
+ Requires-Dist: daytona>=0.153.0; extra == 'sandbox-daytona'
37
+ Requires-Dist: tenacity>=8.0; extra == 'sandbox-daytona'
38
+ Provides-Extra: sandbox-modal
39
+ Requires-Dist: modal>=0.73; extra == 'sandbox-modal'
40
+ Requires-Dist: tenacity>=8.0; extra == 'sandbox-modal'
36
41
  Description-Content-Type: text/markdown
37
42
 
38
43
  <div align="center">
@@ -116,7 +121,7 @@ SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
116
121
 
117
122
  ## Featured
118
123
 
119
- - **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). Also benchflow's [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) parity answer for the no-second-LLM case. See [Progressive disclosure](./docs/progressive-disclosure.md).
124
+ - **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). See [Progressive disclosure](./docs/progressive-disclosure.md).
120
125
 
121
126
  ## Research artifacts
122
127
 
@@ -130,7 +135,7 @@ Two runnable labs validate the security story:
130
135
  - **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
131
136
  - **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
132
137
  - **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
133
- - **Existing Harbor users migrating** → [Use cases — migration section](./docs/use-cases.md#migration-from-harbor) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
138
+ - **External benchmark adapters** → [Task authoring](./docs/task-authoring.md) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
134
139
 
135
140
  ## Contributing
136
141
 
@@ -79,7 +79,7 @@ SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
79
79
 
80
80
  ## Featured
81
81
 
82
- - **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). Also benchflow's [Harbor #1316](https://github.com/harbor-ai/harbor/issues/1316) parity answer for the no-second-LLM case. See [Progressive disclosure](./docs/progressive-disclosure.md).
82
+ - **Progressive disclosure on SWE-bench Pro** — the `BaseUser` abstraction drives a multi-round rollout: terse round-0 prompt → failing-test hints → full spec. 5/5 oracle on Daytona, runnable demo at [`docs/examples/swebench_pro_progressive_disclosure.ipynb`](./docs/examples/swebench_pro_progressive_disclosure.ipynb). See [Progressive disclosure](./docs/progressive-disclosure.md).
83
83
 
84
84
  ## Research artifacts
85
85
 
@@ -93,7 +93,7 @@ Two runnable labs validate the security story:
93
93
  - **Eval researchers / paper writers** → [Getting started](./docs/getting-started.md) → [Concepts](./docs/concepts.md) → [Use cases](./docs/use-cases.md)
94
94
  - **Task authors** → [Task authoring](./docs/task-authoring.md) → [Sandbox hardening](./docs/sandbox-hardening.md)
95
95
  - **Agent builders integrating with benchflow** → [Concepts](./docs/concepts.md) → [Python API reference](./docs/reference/python-api.md) → [`benchflow.agents.registry`](./src/benchflow/agents/registry.py)
96
- - **Existing Harbor users migrating** → [Use cases — migration section](./docs/use-cases.md#migration-from-harbor) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
96
+ - **External benchmark adapters** → [Task authoring](./docs/task-authoring.md) → [Progressive disclosure](./docs/progressive-disclosure.md#comparison-with-multi-agent-simulated-user)
97
97
 
98
98
  ## Contributing
99
99
 
@@ -1,12 +1,11 @@
1
1
  [project]
2
2
  name = "benchflow"
3
- version = "0.3.4"
3
+ version = "0.4.0"
4
4
  description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
7
- keywords = ["benchmark", "llm-agents", "acp", "agent-evaluation", "multi-turn", "terminal-bench", "skillsbench"]
7
+ keywords = ["benchmark", "llm-agents", "acp", "agent-evaluation", "multi-turn", "skillsbench"]
8
8
  dependencies = [
9
- "harbor==0.3.0",
10
9
  "httpx>=0.27.0",
11
10
  "anyio>=4.0",
12
11
  "pydantic>=2.0",
@@ -42,6 +41,14 @@ dev = [
42
41
  "ruff>=0.7.0",
43
42
  "ty>=0.0.1a1",
44
43
  ]
44
+ sandbox-daytona = [
45
+ "daytona>=0.153.0",
46
+ "tenacity>=8.0",
47
+ ]
48
+ sandbox-modal = [
49
+ "modal>=0.73",
50
+ "tenacity>=8.0",
51
+ ]
45
52
  bedrock = [
46
53
  "boto3>=1.40",
47
54
  ]
@@ -114,5 +121,24 @@ ignore = [
114
121
  [tool.ty.environment]
115
122
  python-version = "3.12"
116
123
 
124
+ [tool.ty.rules]
125
+ # Many modules lazily import optional deps (daytona, modal, openai, toml, …).
126
+ # These are guarded by try/except at runtime but ty can't resolve them in CI.
127
+ unresolved-import = "ignore"
128
+
117
129
  [tool.ty.src]
118
130
  include = ["src"]
131
+ # Modules that heavily use optional-dep types (daytona, modal, openai, boto3, …)
132
+ # produce cascading type errors when those packages aren't installed.
133
+ exclude = [
134
+ "src/benchflow/sandbox/daytona.py",
135
+ "src/benchflow/sandbox/modal_impl.py",
136
+ "src/benchflow/sandbox/docker.py",
137
+ "src/benchflow/sandbox/_base.py",
138
+ "src/benchflow/_env_setup.py",
139
+ "src/benchflow/rewards/llm.py",
140
+ "src/benchflow/rewards/file_readers.py",
141
+ "src/benchflow/rewards/rubric_config.py",
142
+ "src/benchflow/providers/bedrock_runtime.py",
143
+ "src/benchflow/experimental/mcp/reviewer_server.py",
144
+ ]
@@ -1,10 +1,12 @@
1
1
  """benchflow — ACP-native agent benchmarking framework.
2
2
 
3
- Re-exports environment APIs and adds:
3
+ Public API surface:
4
+ - Sandbox protocol for isolated execution environments
4
5
  - ACP client for multi-turn agent communication
5
6
  - Trajectory capture (HTTP proxy, OTel collector, ACP native)
6
7
  - Rollout lifecycle for single-task execution
7
8
  - Evaluation orchestration with retries and concurrency
9
+ - Rewards protocol (composable Rubric + RewardFunc)
8
10
  - Metrics collection and aggregation
9
11
  """
10
12
 
@@ -12,23 +14,9 @@ from importlib.metadata import version as _version
12
14
 
13
15
  __version__ = _version("benchflow")
14
16
 
15
- # Re-export Harbor's core types for downstream task authors
16
- from harbor import (
17
- BaseAgent,
18
- BaseEnvironment,
19
- ExecResult,
20
- Task,
21
- TaskConfig,
22
- Verifier,
23
- VerifierResult,
24
- )
25
-
26
- # benchflow's additions
27
- from benchflow._env_setup import stage_dockerfile_deps
28
- from benchflow._scene import MailboxTransport, Message, MessageTransport, SceneRole
29
- from benchflow._scene import Scene as SceneRuntime
30
- from benchflow._snapshot import list_snapshots, restore, snapshot
17
+ # Core types
31
18
  from benchflow._types import Role, Scene, Turn
19
+ from benchflow._utils.yaml_loader import rollout_config_from_yaml
32
20
  from benchflow.acp.client import ACPClient
33
21
  from benchflow.acp.session import ACPSession
34
22
  from benchflow.adapters import (
@@ -45,12 +33,6 @@ from benchflow.agents.registry import (
45
33
  list_agents,
46
34
  register_agent,
47
35
  )
48
- from benchflow.environments import (
49
- SERVICES,
50
- build_service_hooks,
51
- detect_services_from_dockerfile,
52
- register_service,
53
- )
54
36
  from benchflow.evaluation import (
55
37
  Evaluation,
56
38
  EvaluationConfig,
@@ -85,32 +67,39 @@ from benchflow.runtime import (
85
67
  RuntimeResult,
86
68
  run,
87
69
  ) # bf.run() — supports Agent, RolloutConfig, and str calling conventions
70
+ from benchflow.sandbox import (
71
+ SERVICES,
72
+ ImageBuilder,
73
+ ImageConfig,
74
+ ImageRef,
75
+ Sandbox,
76
+ build_service_hooks,
77
+ detect_services_from_dockerfile,
78
+ register_service,
79
+ )
88
80
 
89
- # Sandbox protocol (v0.4 — parallel types, Harbor not yet removed)
81
+ # Sandbox protocol (v0.4)
90
82
  from benchflow.sandbox import ExecResult as SandboxExecResult
91
- from benchflow.sandbox import ImageBuilder, ImageConfig, ImageRef, Sandbox
83
+ from benchflow.sandbox.protocol import ExecResult
84
+ from benchflow.sandbox.setup import stage_dockerfile_deps
85
+ from benchflow.sandbox.snapshot import list_snapshots, restore, snapshot
86
+ from benchflow.sandbox.user import BaseUser, FunctionUser, PassthroughUser, RoundResult
87
+ from benchflow.scenes import MailboxTransport, Message, MessageTransport, SceneRole
88
+ from benchflow.scenes import Scene as SceneRuntime
92
89
  from benchflow.sdk import SDK
93
90
  from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
91
+ from benchflow.task import (
92
+ Task,
93
+ TaskConfig,
94
+ Verifier,
95
+ VerifierResult,
96
+ )
94
97
  from benchflow.trajectories.otel import OTelCollector
95
98
  from benchflow.trajectories.proxy import TrajectoryProxy
96
99
  from benchflow.trajectories.types import Trajectory
97
- from benchflow.trial_yaml import trial_config_from_yaml
98
- from benchflow.user import BaseUser, FunctionUser, PassthroughUser, RoundResult
99
-
100
- # Backward-compat aliases
101
- Trial = Rollout
102
- TrialConfig = RolloutConfig
103
- TrialRole = Role
104
- TrialScene = Scene
105
- RunResult = RolloutResult
106
- Job = Evaluation
107
- JobConfig = EvaluationConfig
108
- JobResult = EvaluationResult
109
100
 
110
101
  # Public API surface. Anything not in this list is implementation detail and
111
- # may change without notice. Names are grouped by source module to match the
112
- # imports above and to make it obvious to a future agent which module owns
113
- # what.
102
+ # may change without notice.
114
103
  __all__ = [
115
104
  "__version__",
116
105
  # Rewards protocol (v0.4)
@@ -122,21 +111,17 @@ __all__ = [
122
111
  "LLMJudgeRewardFunc",
123
112
  "StringMatchRewardFunc",
124
113
  "CodeExecRewardFunc",
125
- # Rubric config (ENG-55)
126
114
  "Criterion",
127
115
  "JudgeConfig",
128
116
  "RubricConfig",
129
117
  "ScoringConfig",
130
118
  "load_rubric_toml",
131
- # Sandbox protocol (v0.4)
119
+ # Sandbox protocol
132
120
  "Sandbox",
133
121
  "SandboxExecResult",
134
122
  "ImageBuilder",
135
123
  "ImageConfig",
136
124
  "ImageRef",
137
- # Harbor re-exports
138
- "BaseAgent",
139
- "BaseEnvironment",
140
125
  "ExecResult",
141
126
  "Task",
142
127
  "TaskConfig",
@@ -152,15 +137,11 @@ __all__ = [
152
137
  "is_vertex_model",
153
138
  "list_agents",
154
139
  "register_agent",
155
- # Evaluation orchestration (new names)
140
+ # Evaluation orchestration
156
141
  "Evaluation",
157
142
  "EvaluationConfig",
158
143
  "EvaluationResult",
159
144
  "RetryConfig",
160
- # Backward-compat aliases for Job
161
- "Job",
162
- "JobConfig",
163
- "JobResult",
164
145
  # Metrics
165
146
  "BenchmarkMetrics",
166
147
  "collect_metrics",
@@ -168,8 +149,7 @@ __all__ = [
168
149
  "AgentInstallError",
169
150
  "AgentTimeoutError",
170
151
  "RolloutResult",
171
- "RunResult",
172
- # Runtime (0.3 compat)
152
+ # Runtime
173
153
  "Agent",
174
154
  "Environment",
175
155
  "Runtime",
@@ -177,7 +157,7 @@ __all__ = [
177
157
  "RuntimeResult",
178
158
  # Single entry point
179
159
  "run",
180
- # Canonical declarative types (_types.py — ENG-47)
160
+ # Declarative types
181
161
  "Role",
182
162
  "Scene",
183
163
  "Turn",
@@ -191,23 +171,18 @@ __all__ = [
191
171
  "snapshot",
192
172
  "restore",
193
173
  "list_snapshots",
194
- # Rollout (single execution path — ENG-46)
174
+ # Rollout
195
175
  "Rollout",
196
176
  "RolloutConfig",
197
- # Backward-compat aliases for Trial
198
- "Trial",
199
- "TrialConfig",
200
- "TrialRole",
201
- "TrialScene",
202
- "trial_config_from_yaml",
177
+ "rollout_config_from_yaml",
203
178
  # User abstraction (progressive disclosure)
204
179
  "BaseUser",
205
180
  "FunctionUser",
206
181
  "PassthroughUser",
207
182
  "RoundResult",
208
- # SDK (backwards compat)
183
+ # SDK
209
184
  "SDK",
210
- # Environments / dep staging
185
+ # Sandbox services
211
186
  "SERVICES",
212
187
  "build_service_hooks",
213
188
  "detect_services_from_dockerfile",
@@ -222,7 +197,7 @@ __all__ = [
222
197
  "OTelCollector",
223
198
  "TrajectoryProxy",
224
199
  "Trajectory",
225
- # External adapters (ENG-51)
200
+ # External adapters
226
201
  "InspectAdapter",
227
202
  "ORSAdapter",
228
203
  "to_inspect_task",
@@ -231,8 +206,7 @@ __all__ = [
231
206
 
232
207
 
233
208
  def __getattr__(name: str):
234
- """Fall through to harbor for names not explicitly re-exported."""
235
- # Let Python's normal submodule resolution handle subpackages first.
209
+ """Lazy submodule resolution."""
236
210
  import importlib
237
211
 
238
212
  try:
@@ -240,16 +214,4 @@ def __getattr__(name: str):
240
214
  except ModuleNotFoundError as e:
241
215
  if e.name != f"benchflow.{name}":
242
216
  raise
243
-
244
- import harbor
245
-
246
- if hasattr(harbor, name):
247
- import warnings
248
-
249
- warnings.warn(
250
- f"'{name}' is not directly re-exported by benchflow. Use 'from harbor import {name}' instead.",
251
- ImportWarning,
252
- stacklevel=2,
253
- )
254
- return getattr(harbor, name)
255
217
  raise AttributeError(f"module 'benchflow' has no attribute {name!r}")
@@ -0,0 +1,47 @@
1
+ """Small `.env` reader shared by CLI/runtime env resolution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ _DEFAULT_DOTENV_PATH = Path(".env")
9
+ _DOTENV_PATH_ENV = "BENCHFLOW_DOTENV_PATH"
10
+
11
+
12
def _unquote_dotenv_value(value: str) -> str:
    """Strip surrounding quotes or a trailing inline comment from *value*.

    *value* is expected to be already whitespace-stripped. Handling order:

    1. A value wrapped in matching single or double quotes has the quotes
       removed; anything inside (including ``#``) is kept verbatim.
    2. A quoted value followed by whitespace and a ``#`` comment
       (``"abc" # note``) yields the quoted content only.
    3. Otherwise everything from the first `` #`` onwards is dropped.
    """
    quote = value[:1]
    # Require at least two characters so a lone quote is not mistaken for an
    # empty quoted string.
    if quote in ("'", '"') and len(value) >= 2:
        if value.endswith(quote):
            return value[1:-1]
        closing = value.find(quote, 1)
        if closing != -1:
            remainder = value[closing + 1 :]
            # Only treat the tail as a comment when it is separated from the
            # closing quote by whitespace and starts with '#'.
            if remainder[:1].isspace() and remainder.lstrip().startswith("#"):
                return value[1:closing]
    if " #" in value:
        return value.split(" #", 1)[0].rstrip()
    return value


def load_dotenv_env(path: str | Path | None = None) -> dict[str, str]:
    """Read a local `.env` file into a plain dict.

    Missing files (or paths that are not regular files) are treated as empty
    input. `BENCHFLOW_DOTENV_PATH` lets tests or callers override the implicit
    `.env` lookup without changing cwd.

    Parsing rules:
      * blank lines and lines starting with ``#`` are skipped;
      * a leading ``export `` is ignored;
      * lines without ``=`` or with an empty key are skipped;
      * values may be wrapped in single or double quotes; unquoted values
        (and quoted values followed by whitespace) may carry a trailing
        `` # comment`` which is removed;
      * later assignments to the same key override earlier ones.

    The file is decoded as UTF-8 (a leading BOM is tolerated) so results do
    not depend on the process locale.

    Args:
        path: Explicit file to read. When ``None``, the path comes from
            ``BENCHFLOW_DOTENV_PATH`` or defaults to ``.env``.

    Returns:
        Mapping of variable names to their string values.
    """
    if path is not None:
        dotenv_path = Path(path)
    else:
        dotenv_path = Path(os.environ.get(_DOTENV_PATH_ENV, _DEFAULT_DOTENV_PATH))
    # is_file() is already False for non-existent paths.
    if not dotenv_path.is_file():
        return {}

    parsed: dict[str, str] = {}
    # "utf-8-sig" decodes plain UTF-8 and silently drops a BOM if present,
    # which would otherwise end up glued to the first key.
    for raw_line in dotenv_path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("export "):
            line = line[len("export ") :].lstrip()
        if "=" not in line:
            continue

        key, value = line.split("=", 1)
        key = key.strip()
        if not key:
            continue

        parsed[key] = _unquote_dotenv_value(value.strip())
    return parsed
@@ -0,0 +1,11 @@
1
+ """benchflow._utils — small periphery I/O glue, private.
2
+
3
+ Holds small (<200 LOC) periphery modules that translate between external
4
+ artifacts (YAML files, git repos, scaffolded task dirs) and benchflow
5
+ shapes.
6
+
7
+ Members:
8
+ yaml_loader — YAML → RolloutConfig/EvaluationConfig
9
+ benchmark_repos — clone benchmark repos
10
+ task_authoring — init_task / check_task scaffolding
+ config — shared configuration normalization helpers
11
+ """
@@ -138,7 +138,11 @@ def resolve_source(repo: str, path: str | None = None, ref: str | None = None) -
138
138
  # Format: (org/repo, ref, subpath)
139
139
  TASK_ALIASES: dict[str, tuple[str, str | None, str | None]] = {
140
140
  "skillsbench": ("benchflow-ai/skillsbench", "main", "tasks"),
141
- "programbench": ("benchflow-ai/benchmarks", "main", "datasets/programbench/tasks"),
141
+ "programbench": (
142
+ "facebookresearch/programbench",
143
+ "main",
144
+ "src/programbench/data/tasks",
145
+ ),
142
146
  "harvey-lab": ("benchflow-ai/benchmarks", "main", "datasets/harvey-lab/tasks"),
143
147
  }
144
148
 
@@ -0,0 +1,22 @@
1
+ """Shared configuration normalization helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from benchflow.agents.registry import parse_agent_spec
6
+
7
+
8
def normalize_agent_name(agent: str) -> str:
    """Resolve an ACP agent alias to its canonical registry name.

    The spec is parsed with ``parse_agent_spec``; when the reported protocol
    is ``"acp"`` the canonical name from the parse is returned, otherwise the
    input string is handed back untouched.
    """
    protocol, canonical = parse_agent_spec(agent)
    return canonical if protocol == "acp" else agent
14
+
15
+
16
def normalize_sandbox_user(sandbox_user: str | None) -> str | None:
    """Collapse textual "no user" sentinels into ``None``.

    ``None`` stays ``None``; the strings ``"none"`` and ``"null"`` (compared
    case-insensitively) also become ``None``. Any other string is returned
    exactly as given.
    """
    if sandbox_user is not None and sandbox_user.lower() not in ("none", "null"):
        return sandbox_user
    return None
@@ -1,6 +1,6 @@
1
- """YAML trial config loader.
1
+ """YAML rollout config loader.
2
2
 
3
- Parses trial YAML files into TrialConfig with Scene support.
3
+ Parses rollout YAML files into RolloutConfig with Scene support.
4
4
  Handles both new scene-based format and legacy flat format.
5
5
 
6
6
  New format::
@@ -44,13 +44,13 @@ from typing import Any
44
44
  import yaml
45
45
 
46
46
  from benchflow._types import Role, Scene, Turn
47
- from benchflow.trial import TrialConfig
47
+ from benchflow.rollout import RolloutConfig
48
48
 
49
49
  logger = logging.getLogger(__name__)
50
50
 
51
51
 
52
- def load_trial_yaml(path: str | Path) -> dict:
53
- """Load and normalize a trial YAML file."""
52
+ def load_rollout_yaml(path: str | Path) -> dict:
53
+ """Load and normalize a rollout YAML file."""
54
54
  with open(path) as f:
55
55
  raw = yaml.safe_load(f)
56
56
  if not isinstance(raw, dict):
@@ -58,23 +58,23 @@ def load_trial_yaml(path: str | Path) -> dict:
58
58
  return raw
59
59
 
60
60
 
61
- def trial_config_from_yaml(
61
+ def rollout_config_from_yaml(
62
62
  path: str | Path,
63
63
  task_path: Path | None = None,
64
- ) -> TrialConfig:
65
- """Parse a YAML file into a TrialConfig.
64
+ ) -> RolloutConfig:
65
+ """Parse a YAML file into a RolloutConfig.
66
66
 
67
67
  If task_path is provided, it overrides task_dir from the YAML.
68
68
  """
69
- raw = load_trial_yaml(path)
70
- return trial_config_from_dict(raw, task_path=task_path)
69
+ raw = load_rollout_yaml(path)
70
+ return rollout_config_from_dict(raw, task_path=task_path)
71
71
 
72
72
 
73
- def trial_config_from_dict(
73
+ def rollout_config_from_dict(
74
74
  raw: dict[str, Any],
75
75
  task_path: Path | None = None,
76
- ) -> TrialConfig:
77
- """Convert a raw dict (from YAML or programmatic) into a TrialConfig."""
76
+ ) -> RolloutConfig:
77
+ """Convert a raw dict (from YAML or programmatic) into a RolloutConfig."""
78
78
  tp = task_path or Path(raw.get("task_dir", raw.get("task_path", ".")))
79
79
 
80
80
  # Scene-based format
@@ -105,7 +105,7 @@ def trial_config_from_dict(
105
105
  else:
106
106
  raise ValueError("YAML must have either 'scenes' or 'agent' at top level")
107
107
 
108
- return TrialConfig(
108
+ return RolloutConfig(
109
109
  task_path=tp,
110
110
  scenes=scenes,
111
111
  environment=raw.get("environment", "docker"),
@@ -113,7 +113,7 @@ def trial_config_from_dict(
113
113
  sandbox_locked_paths=raw.get("sandbox_locked_paths"),
114
114
  sandbox_setup_timeout=raw.get("sandbox_setup_timeout", 120),
115
115
  job_name=raw.get("job_name"),
116
- trial_name=raw.get("trial_name"),
116
+ rollout_name=raw.get("rollout_name"),
117
117
  jobs_dir=raw.get("jobs_dir", "jobs"),
118
118
  context_root=raw.get("context_root"),
119
119
  agent=raw.get("agent", "claude-agent-acp"),
@@ -165,9 +165,9 @@ def job_config_from_yaml(path: str | Path) -> dict:
165
165
  """Parse a YAML file and return both job-level and rollout-level config.
166
166
 
167
167
  Returns a dict with keys: task_dir, concurrency, max_retries,
168
- trial_config (TrialConfig), and any other job-level fields.
168
+ trial_config (RolloutConfig), and any other job-level fields.
169
169
  """
170
- raw = load_trial_yaml(path)
170
+ raw = load_rollout_yaml(path)
171
171
  task_dir = Path(raw.get("task_dir", raw.get("tasks_dir", ".")))
172
172
  concurrency = raw.get("concurrency", 4)
173
173
  max_retries = raw.get("max_retries", 2)
@@ -176,6 +176,6 @@ def job_config_from_yaml(path: str | Path) -> dict:
176
176
  "task_dir": task_dir,
177
177
  "concurrency": concurrency,
178
178
  "max_retries": max_retries,
179
- "trial_config": trial_config_from_dict(raw, task_path=task_dir),
179
+ "trial_config": rollout_config_from_dict(raw, task_path=task_dir),
180
180
  "raw": raw,
181
181
  }
@@ -5,7 +5,7 @@ import logging
5
5
  from pathlib import Path
6
6
  from typing import Any
7
7
 
8
- from benchflow.process import LiveProcess
8
+ from benchflow.sandbox.process import LiveProcess
9
9
 
10
10
  from .transport import Transport, decode_json_rpc_message
11
11
 
@@ -7,7 +7,7 @@ Owns the live agent-side of a run:
7
7
  ACP-native trajectory, and report tool-call counts
8
8
 
9
9
  The one allowed horizontal phase import in this refactor lives here:
10
- ``from benchflow._sandbox import build_priv_drop_cmd``. connect_acp wraps
10
+ ``from benchflow.sandbox.lockdown import build_priv_drop_cmd``. connect_acp wraps
11
11
  the agent launch command in the sandbox user's privilege-drop prefix
12
12
  before handing it to the transport. It is a single pure-function call
13
13
  with no shared state — not a coupling of concerns.
@@ -22,13 +22,13 @@ import contextlib
22
22
  import logging
23
23
  from pathlib import Path
24
24
 
25
- from benchflow._sandbox import build_priv_drop_cmd
26
- from benchflow._trajectory import _capture_session_trajectory
27
25
  from benchflow.acp.client import ACPClient
28
26
  from benchflow.acp.container_transport import ContainerTransport
29
27
  from benchflow.agents.providers import find_provider, strip_provider_prefix
30
28
  from benchflow.agents.registry import AGENTS
31
- from benchflow.process import DaytonaProcess, DaytonaPtyProcess, DockerProcess
29
+ from benchflow.sandbox.lockdown import build_priv_drop_cmd
30
+ from benchflow.sandbox.process import DaytonaProcess, DaytonaPtyProcess, DockerProcess
31
+ from benchflow.trajectories._capture import _capture_session_trajectory
32
32
 
33
33
  logger = logging.getLogger(__name__)
34
34
 
@@ -144,7 +144,7 @@ async def connect_acp(
144
144
  agent_env: dict,
145
145
  sandbox_user: str | None,
146
146
  model: str | None,
147
- trial_dir: Path,
147
+ rollout_dir: Path,
148
148
  environment: str,
149
149
  agent_cwd: str,
150
150
  ) -> tuple[ACPClient, object, str]:
@@ -181,18 +181,18 @@ async def connect_acp(
181
181
 
182
182
  try:
183
183
  if environment == "docker":
184
- live_proc = DockerProcess.from_harbor_env(env)
184
+ live_proc = DockerProcess.from_sandbox_env(env)
185
185
  else:
186
186
  is_dind = hasattr(env, "_strategy") and hasattr(
187
187
  env._strategy, "_compose_cmd"
188
188
  )
189
189
  if is_dind:
190
- live_proc = await DaytonaPtyProcess.from_harbor_env(env)
190
+ live_proc = await DaytonaPtyProcess.from_sandbox_env(env)
191
191
  logger.info("Using PTY transport for DinD compose task")
192
192
  else:
193
- live_proc = await DaytonaProcess.from_harbor_env(env)
193
+ live_proc = await DaytonaProcess.from_sandbox_env(env)
194
194
 
195
- agent_log = trial_dir / "agent" / f"{agent.replace('-', '_')}.txt"
195
+ agent_log = rollout_dir / "agent" / f"{agent.replace('-', '_')}.txt"
196
196
  transport = ContainerTransport(
197
197
  container_process=live_proc,
198
198
  command=agent_launch,
@@ -6,7 +6,7 @@ import logging
6
6
  from abc import ABC, abstractmethod
7
7
  from typing import Any
8
8
 
9
- from benchflow.process import drain_oversized_line
9
+ from benchflow.sandbox.process import drain_oversized_line
10
10
 
11
11
  logger = logging.getLogger(__name__)
12
12