benchflow 0.4.0__tar.gz → 0.5.1.dev869__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (458)
  1. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/.gitignore +1 -0
  2. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/CHANGELOG.md +10 -0
  3. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/PKG-INFO +26 -16
  4. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/README.md +15 -13
  5. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/pyproject.toml +27 -6
  6. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/__init__.py +55 -17
  7. benchflow-0.5.1.dev869/src/benchflow/_paths.py +218 -0
  8. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_run.py +2 -2
  9. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_types.py +13 -5
  10. benchflow-0.5.1.dev869/src/benchflow/_utils/benchmark_repos.py +516 -0
  11. benchflow-0.5.1.dev869/src/benchflow/_utils/config.py +66 -0
  12. benchflow-0.5.1.dev869/src/benchflow/_utils/evaluation_results.py +228 -0
  13. benchflow-0.5.1.dev869/src/benchflow/_utils/json_safe.py +44 -0
  14. benchflow-0.5.1.dev869/src/benchflow/_utils/learner_memory.py +162 -0
  15. benchflow-0.5.1.dev869/src/benchflow/_utils/result_metadata.py +75 -0
  16. benchflow-0.5.1.dev869/src/benchflow/_utils/reward_events.py +95 -0
  17. benchflow-0.5.1.dev869/src/benchflow/_utils/scoring.py +264 -0
  18. benchflow-0.5.1.dev869/src/benchflow/_utils/source_provenance.py +129 -0
  19. benchflow-0.5.1.dev869/src/benchflow/_utils/task_authoring.py +236 -0
  20. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/yaml_loader.py +11 -2
  21. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/client.py +164 -26
  22. benchflow-0.5.1.dev869/src/benchflow/acp/runtime.py +646 -0
  23. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/session.py +171 -3
  24. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/transport.py +13 -1
  25. benchflow-0.5.1.dev869/src/benchflow/acp/types.py +328 -0
  26. benchflow-0.5.1.dev869/src/benchflow/adapters/__init__.py +51 -0
  27. benchflow-0.5.1.dev869/src/benchflow/adapters/harbor.py +137 -0
  28. benchflow-0.5.1.dev869/src/benchflow/adapters/inbound.py +219 -0
  29. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/ors.py +11 -0
  30. benchflow-0.5.1.dev869/src/benchflow/adapters/terminal_bench.py +353 -0
  31. benchflow-0.5.1.dev869/src/benchflow/agents/codex_config.py +68 -0
  32. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/credentials.py +10 -0
  33. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/env.py +342 -19
  34. benchflow-0.5.1.dev869/src/benchflow/agents/errors.py +8 -0
  35. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/install.py +56 -15
  36. benchflow-0.5.1.dev869/src/benchflow/agents/protocol.py +242 -0
  37. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/providers.py +145 -1
  38. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/registry.py +194 -20
  39. benchflow-0.5.1.dev869/src/benchflow/branch.py +74 -0
  40. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/main.py +623 -359
  41. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/trace_import.py +3 -5
  42. benchflow-0.5.1.dev869/src/benchflow/compat/__init__.py +19 -0
  43. benchflow-0.5.1.dev869/src/benchflow/compat/harbor_registry.py +289 -0
  44. benchflow-0.5.1.dev869/src/benchflow/contracts/__init__.py +68 -0
  45. benchflow-0.5.1.dev869/src/benchflow/contracts/planes.py +96 -0
  46. benchflow-0.5.1.dev869/src/benchflow/contracts/user.py +74 -0
  47. benchflow-0.5.1.dev869/src/benchflow/diagnostics.py +393 -0
  48. benchflow-0.5.1.dev869/src/benchflow/environment/__init__.py +42 -0
  49. benchflow-0.5.1.dev869/src/benchflow/environment/manifest.py +230 -0
  50. benchflow-0.5.1.dev869/src/benchflow/environment/manifest_env.py +308 -0
  51. benchflow-0.5.1.dev869/src/benchflow/environment/protocol.py +77 -0
  52. benchflow-0.5.1.dev869/src/benchflow/environment/readiness.py +74 -0
  53. benchflow-0.5.1.dev869/src/benchflow/eval_sharding.py +349 -0
  54. benchflow-0.5.1.dev869/src/benchflow/eval_worker.py +116 -0
  55. benchflow-0.5.1.dev869/src/benchflow/evaluation.py +1458 -0
  56. benchflow-0.5.1.dev869/src/benchflow/hosted_env.py +849 -0
  57. benchflow-0.5.1.dev869/src/benchflow/learner_skills.py +142 -0
  58. benchflow-0.5.1.dev869/src/benchflow/learner_store.py +277 -0
  59. benchflow-0.5.1.dev869/src/benchflow/metrics.py +419 -0
  60. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/models.py +84 -9
  61. benchflow-0.5.1.dev869/src/benchflow/monitor.py +209 -0
  62. benchflow-0.5.1.dev869/src/benchflow/providers/__init__.py +18 -0
  63. benchflow-0.5.1.dev869/src/benchflow/providers/litellm_bedrock_patch.py +93 -0
  64. benchflow-0.5.1.dev869/src/benchflow/providers/litellm_config.py +335 -0
  65. benchflow-0.5.1.dev869/src/benchflow/providers/litellm_logging.py +385 -0
  66. benchflow-0.5.1.dev869/src/benchflow/providers/litellm_runtime.py +1109 -0
  67. benchflow-0.5.1.dev869/src/benchflow/providers/runtime.py +70 -0
  68. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/__init__.py +16 -1
  69. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/builtins.py +79 -9
  70. benchflow-0.5.1.dev869/src/benchflow/rewards/events.py +44 -0
  71. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/file_readers.py +7 -3
  72. benchflow-0.5.1.dev869/src/benchflow/rewards/llm.py +297 -0
  73. benchflow-0.5.1.dev869/src/benchflow/rewards/memory_scorer.py +146 -0
  74. benchflow-0.5.1.dev869/src/benchflow/rewards/node.py +164 -0
  75. benchflow-0.5.1.dev869/src/benchflow/rewards/protocol.py +89 -0
  76. benchflow-0.5.1.dev869/src/benchflow/rewards/rubric_config.py +220 -0
  77. benchflow-0.5.1.dev869/src/benchflow/rewards/validation.py +67 -0
  78. benchflow-0.5.1.dev869/src/benchflow/rollout.py +2920 -0
  79. benchflow-0.5.1.dev869/src/benchflow/rollout_branch.py +222 -0
  80. benchflow-0.5.1.dev869/src/benchflow/rollout_planes.py +195 -0
  81. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/runtime.py +97 -13
  82. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/__init__.py +4 -0
  83. benchflow-0.5.1.dev869/src/benchflow/sandbox/_base.py +362 -0
  84. benchflow-0.5.1.dev869/src/benchflow/sandbox/_compose.py +28 -0
  85. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-base.yaml +6 -0
  86. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/daytona.py +596 -67
  87. benchflow-0.5.1.dev869/src/benchflow/sandbox/docker.py +853 -0
  88. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/lockdown.py +145 -15
  89. benchflow-0.5.1.dev869/src/benchflow/sandbox/metadata.py +34 -0
  90. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/modal_impl.py +57 -12
  91. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/process.py +393 -116
  92. benchflow-0.5.1.dev869/src/benchflow/sandbox/protocol.py +181 -0
  93. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/setup.py +234 -42
  94. benchflow-0.5.1.dev869/src/benchflow/sandbox/snapshot.py +107 -0
  95. benchflow-0.5.1.dev869/src/benchflow/sandbox/user.py +10 -0
  96. benchflow-0.5.1.dev869/src/benchflow/scenes.py +99 -0
  97. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sdk.py +24 -3
  98. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/self_gen.py +34 -15
  99. benchflow-0.5.1.dev869/src/benchflow/skill_eval/__init__.py +45 -0
  100. benchflow-0.4.0/src/benchflow/skill_eval.py → benchflow-0.5.1.dev869/src/benchflow/skill_eval/_core.py +145 -164
  101. benchflow-0.5.1.dev869/src/benchflow/skill_eval/gepa_export.py +195 -0
  102. benchflow-0.5.1.dev869/src/benchflow/skill_eval/schema.py +153 -0
  103. benchflow-0.5.1.dev869/src/benchflow/skill_policy.py +191 -0
  104. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/__init__.py +4 -0
  105. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/config.py +114 -5
  106. benchflow-0.5.1.dev869/src/benchflow/task/verifier.py +457 -0
  107. benchflow-0.5.1.dev869/src/benchflow/templates/test.sh.tmpl +20 -0
  108. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/huggingface.py +89 -20
  109. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/parsers.py +8 -0
  110. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/task_gen.py +353 -67
  111. benchflow-0.5.1.dev869/src/benchflow/trajectories/__init__.py +36 -0
  112. benchflow-0.5.1.dev869/src/benchflow/trajectories/_capture.py +252 -0
  113. benchflow-0.5.1.dev869/src/benchflow/trajectories/export.py +304 -0
  114. benchflow-0.5.1.dev869/src/benchflow/trajectories/metrics.py +161 -0
  115. benchflow-0.5.1.dev869/src/benchflow/trajectories/tree.py +144 -0
  116. benchflow-0.5.1.dev869/src/benchflow/trajectories/types.py +388 -0
  117. benchflow-0.5.1.dev869/src/benchflow/usage_tracking.py +163 -0
  118. benchflow-0.5.1.dev869/tests/agents/__init__.py +1 -0
  119. benchflow-0.5.1.dev869/tests/agents/test_protocol.py +191 -0
  120. benchflow-0.5.1.dev869/tests/conformance/acp_smoke/environment/docker-compose.yaml +3 -0
  121. benchflow-0.5.1.dev869/tests/conformance/acp_smoke/environment/skills/conformance-writer/SKILL.md +16 -0
  122. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/task.toml +2 -1
  123. benchflow-0.5.1.dev869/tests/conformance/proof_multi_agent.py +87 -0
  124. benchflow-0.5.1.dev869/tests/conformance/self_gen_smoke_skills/skill-creator/SKILL.md +22 -0
  125. benchflow-0.5.1.dev869/tests/environment/__init__.py +0 -0
  126. benchflow-0.5.1.dev869/tests/environment/test_chibench_manifest.py +65 -0
  127. benchflow-0.5.1.dev869/tests/environment/test_clawsbench_manifest.py +41 -0
  128. benchflow-0.5.1.dev869/tests/environment/test_manifest.py +171 -0
  129. benchflow-0.5.1.dev869/tests/environment/test_manifest_env.py +410 -0
  130. benchflow-0.5.1.dev869/tests/environment/test_protocol.py +57 -0
  131. benchflow-0.5.1.dev869/tests/environment/test_readiness.py +76 -0
  132. benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/environment/Dockerfile +5 -0
  133. benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/instruction.md +13 -0
  134. benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/solution/solve.sh +11 -0
  135. benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/task.toml +18 -0
  136. benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/tests/test.sh +8 -0
  137. benchflow-0.5.1.dev869/tests/examples/terminal-bench-smoke-task/tests/test_state.py +27 -0
  138. benchflow-0.5.1.dev869/tests/examples/traces/minimal-claude.jsonl +3 -0
  139. benchflow-0.5.1.dev869/tests/examples/traces/minimal-opentraces.jsonl +1 -0
  140. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent.py +21 -1
  141. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent_interleaved.py +2 -1
  142. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_acp_agent_multi_turn.py +2 -1
  143. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/check_adapter_evidence.py +73 -15
  144. benchflow-0.5.1.dev869/tests/integration/check_hosted_env_evidence.py +209 -0
  145. benchflow-0.5.1.dev869/tests/integration/check_results.py +1079 -0
  146. benchflow-0.5.1.dev869/tests/integration/check_skillsbench_harbor_parity.py +509 -0
  147. benchflow-0.5.1.dev869/tests/integration/check_trace_to_task_evidence.py +353 -0
  148. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/claude-agent-acp.yaml +1 -1
  149. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/codex-acp.yaml +1 -1
  150. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/gemini.yaml +1 -1
  151. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/harvey-lab-harness.yaml +1 -1
  152. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/openclaw.yaml +1 -1
  153. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/opencode.yaml +1 -1
  154. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/openhands.yaml +1 -1
  155. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/configs/pi-acp.yaml +1 -1
  156. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/integration/run.sh +69 -44
  157. benchflow-0.5.1.dev869/tests/integration/run_suite.py +879 -0
  158. benchflow-0.5.1.dev869/tests/integration/suites/release.yaml +526 -0
  159. benchflow-0.5.1.dev869/tests/test_acp.py +1659 -0
  160. benchflow-0.5.1.dev869/tests/test_acp_capability_advertising.py +184 -0
  161. benchflow-0.5.1.dev869/tests/test_acp_model_config_dispatch.py +134 -0
  162. benchflow-0.5.1.dev869/tests/test_acp_pinned_protocol_guard.py +93 -0
  163. benchflow-0.5.1.dev869/tests/test_acp_setup_failure_propagation.py +208 -0
  164. benchflow-0.5.1.dev869/tests/test_adapter_scripts.py +227 -0
  165. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_adapters.py +71 -0
  166. benchflow-0.5.1.dev869/tests/test_agent_cli.py +29 -0
  167. benchflow-0.5.1.dev869/tests/test_agent_env_resolution.py +132 -0
  168. benchflow-0.5.1.dev869/tests/test_agent_gemini_defaults.py +82 -0
  169. benchflow-0.5.1.dev869/tests/test_agent_idle_timeout_cli.py +188 -0
  170. benchflow-0.5.1.dev869/tests/test_agent_registry.py +246 -0
  171. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_agent_setup.py +277 -22
  172. benchflow-0.5.1.dev869/tests/test_agent_spec.py +213 -0
  173. benchflow-0.5.1.dev869/tests/test_base_install_imports.py +106 -0
  174. benchflow-0.5.1.dev869/tests/test_bedrock_thinking.py +62 -0
  175. benchflow-0.5.1.dev869/tests/test_branch.py +89 -0
  176. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_capture_trajectory.py +5 -1
  177. benchflow-0.5.1.dev869/tests/test_clawsbench_slice.py +52 -0
  178. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_cli_daytona.py +11 -3
  179. benchflow-0.5.1.dev869/tests/test_cli_docs_drift.py +115 -0
  180. benchflow-0.5.1.dev869/tests/test_cli_misc.py +176 -0
  181. benchflow-0.5.1.dev869/tests/test_compat_harbor_registry.py +328 -0
  182. benchflow-0.5.1.dev869/tests/test_config_redaction.py +160 -0
  183. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_connect_as_env.py +41 -58
  184. benchflow-0.5.1.dev869/tests/test_continuallearningbench_adapter.py +90 -0
  185. benchflow-0.5.1.dev869/tests/test_dashboard_credential_env_scrub.py +115 -0
  186. benchflow-0.5.1.dev869/tests/test_dashboard_daytona_key.py +129 -0
  187. benchflow-0.5.1.dev869/tests/test_dashboard_no_host_paths.py +169 -0
  188. benchflow-0.5.1.dev869/tests/test_dashboard_release_evidence.py +297 -0
  189. benchflow-0.5.1.dev869/tests/test_dashboard_roadmap.py +818 -0
  190. benchflow-0.5.1.dev869/tests/test_dashboard_symlink_ingestion.py +170 -0
  191. benchflow-0.5.1.dev869/tests/test_dashboard_sync.py +1699 -0
  192. benchflow-0.5.1.dev869/tests/test_daytona_command_polling.py +51 -0
  193. benchflow-0.5.1.dev869/tests/test_daytona_litellm_runtime.py +121 -0
  194. benchflow-0.5.1.dev869/tests/test_daytona_status.py +90 -0
  195. benchflow-0.5.1.dev869/tests/test_docker_prune_scoping.py +153 -0
  196. benchflow-0.5.1.dev869/tests/test_docker_uploads.py +41 -0
  197. benchflow-0.5.1.dev869/tests/test_docs_examples.py +121 -0
  198. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_eng50_capabilities.py +29 -111
  199. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_env_setup.py +44 -0
  200. benchflow-0.5.1.dev869/tests/test_environment_manifest_controls.py +299 -0
  201. benchflow-0.5.1.dev869/tests/test_eval_filters_applied.py +158 -0
  202. benchflow-0.5.1.dev869/tests/test_eval_sharding.py +38 -0
  203. benchflow-0.5.1.dev869/tests/test_eval_single_task_summary.py +159 -0
  204. benchflow-0.5.1.dev869/tests/test_eval_source_provenance.py +681 -0
  205. benchflow-0.5.1.dev869/tests/test_eval_worker_retry.py +57 -0
  206. benchflow-0.5.1.dev869/tests/test_eval_zero_task_guard.py +116 -0
  207. benchflow-0.5.1.dev869/tests/test_evaluation_environment_manifest.py +213 -0
  208. benchflow-0.5.1.dev869/tests/test_experiments_status.py +181 -0
  209. benchflow-0.5.1.dev869/tests/test_hilbench_adapter.py +104 -0
  210. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_hosted_env.py +14 -1
  211. benchflow-0.5.1.dev869/tests/test_hosted_env_rollout_contract.py +207 -0
  212. benchflow-0.5.1.dev869/tests/test_inbound_adapter_manifest.py +265 -0
  213. benchflow-0.5.1.dev869/tests/test_inbound_adapters.py +440 -0
  214. benchflow-0.5.1.dev869/tests/test_integration_check_results.py +2345 -0
  215. benchflow-0.5.1.dev869/tests/test_integration_run_suite.py +894 -0
  216. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_internet_policy.py +68 -30
  217. benchflow-0.5.1.dev869/tests/test_job.py +781 -0
  218. benchflow-0.5.1.dev869/tests/test_job_sequential_shared.py +595 -0
  219. benchflow-0.5.1.dev869/tests/test_job_sequential_shared_resume.py +314 -0
  220. benchflow-0.5.1.dev869/tests/test_judge_symlink_ingestion.py +64 -0
  221. benchflow-0.5.1.dev869/tests/test_learner_skills.py +137 -0
  222. benchflow-0.5.1.dev869/tests/test_learner_skills_traversal.py +95 -0
  223. benchflow-0.5.1.dev869/tests/test_learner_store.py +257 -0
  224. benchflow-0.5.1.dev869/tests/test_learner_store_persistence.py +138 -0
  225. benchflow-0.5.1.dev869/tests/test_litellm_config.py +99 -0
  226. benchflow-0.5.1.dev869/tests/test_litellm_hardening.py +659 -0
  227. benchflow-0.5.1.dev869/tests/test_litellm_logging.py +121 -0
  228. benchflow-0.5.1.dev869/tests/test_litellm_runtime.py +429 -0
  229. benchflow-0.5.1.dev869/tests/test_litellm_smoke.py +160 -0
  230. benchflow-0.5.1.dev869/tests/test_llm_judge.py +852 -0
  231. benchflow-0.5.1.dev869/tests/test_llm_judge_event_tags.py +217 -0
  232. benchflow-0.5.1.dev869/tests/test_llm_judge_verifier.py +734 -0
  233. benchflow-0.5.1.dev869/tests/test_memory_scorer.py +394 -0
  234. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_metrics.py +155 -0
  235. benchflow-0.5.1.dev869/tests/test_monitor_scaffold.py +133 -0
  236. benchflow-0.5.1.dev869/tests/test_native_acp_usage.py +243 -0
  237. benchflow-0.5.1.dev869/tests/test_no_cross_provider_fallback.py +84 -0
  238. benchflow-0.5.1.dev869/tests/test_oracle_chokepoint.py +1011 -0
  239. benchflow-0.5.1.dev869/tests/test_paths_safe.py +106 -0
  240. benchflow-0.5.1.dev869/tests/test_paths_symlink_helpers.py +95 -0
  241. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_pi_acp_launcher.py +52 -0
  242. benchflow-0.5.1.dev869/tests/test_process.py +760 -0
  243. benchflow-0.5.1.dev869/tests/test_provider_auth_detection.py +308 -0
  244. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_providers.py +136 -0
  245. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_reexport.py +6 -2
  246. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_registry_invariants.py +117 -6
  247. benchflow-0.5.1.dev869/tests/test_release_version.py +132 -0
  248. benchflow-0.5.1.dev869/tests/test_resolve_env_helpers.py +1089 -0
  249. benchflow-0.5.1.dev869/tests/test_reward_node.py +145 -0
  250. benchflow-0.5.1.dev869/tests/test_reward_unified_contract.py +175 -0
  251. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_rewards.py +40 -41
  252. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_rewards_jsonl.py +66 -0
  253. benchflow-0.5.1.dev869/tests/test_rollout_architecture.py +54 -0
  254. benchflow-0.5.1.dev869/tests/test_rollout_branch.py +451 -0
  255. benchflow-0.5.1.dev869/tests/test_rollout_config_path_coercion.py +96 -0
  256. benchflow-0.5.1.dev869/tests/test_rollout_environment.py +24 -0
  257. benchflow-0.5.1.dev869/tests/test_rollout_import_no_side_effects.py +137 -0
  258. benchflow-0.5.1.dev869/tests/test_rollout_on_ask_user_wiring.py +299 -0
  259. benchflow-0.5.1.dev869/tests/test_rollout_probe_sandbox_health.py +101 -0
  260. benchflow-0.5.1.dev869/tests/test_rollout_upload.py +633 -0
  261. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_rubric_config.py +106 -1
  262. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_runtime.py +15 -0
  263. benchflow-0.5.1.dev869/tests/test_runtime_config_wired.py +199 -0
  264. benchflow-0.5.1.dev869/tests/test_runtime_live_sandbox.py +257 -0
  265. benchflow-0.5.1.dev869/tests/test_sandbox.py +320 -0
  266. benchflow-0.5.1.dev869/tests/test_sandbox_exec_secret_handling.py +231 -0
  267. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_hardening.py +156 -14
  268. benchflow-0.5.1.dev869/tests/test_sandbox_isolation_copy_traversal.py +102 -0
  269. benchflow-0.5.1.dev869/tests/test_sandbox_multi_service.py +792 -0
  270. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_setup.py +89 -0
  271. benchflow-0.5.1.dev869/tests/test_sandbox_snapshot_contract.py +259 -0
  272. benchflow-0.5.1.dev869/tests/test_sandbox_upload_symlink.py +227 -0
  273. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_verifier_workspace.py +7 -7
  274. benchflow-0.5.1.dev869/tests/test_scene.py +102 -0
  275. benchflow-0.5.1.dev869/tests/test_scene_outbox_trial.py +397 -0
  276. benchflow-0.5.1.dev869/tests/test_scene_parallel_group.py +40 -0
  277. benchflow-0.5.1.dev869/tests/test_scene_result_aggregation.py +189 -0
  278. benchflow-0.5.1.dev869/tests/test_scoring.py +231 -0
  279. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sdk_internals.py +245 -6
  280. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sdk_lockdown.py +17 -12
  281. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_cli.py +17 -24
  282. benchflow-0.5.1.dev869/tests/test_self_gen_export_error_channel.py +202 -0
  283. benchflow-0.5.1.dev869/tests/test_self_gen_export_failures.py +175 -0
  284. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_self_gen_orchestration.py +85 -7
  285. benchflow-0.5.1.dev869/tests/test_session_request_permission_dispatch.py +289 -0
  286. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval.py +74 -2
  287. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_dryrun.py +7 -10
  288. benchflow-0.5.1.dev869/tests/test_skill_eval_sweep.py +472 -0
  289. benchflow-0.5.1.dev869/tests/test_skill_eval_traversal.py +141 -0
  290. benchflow-0.5.1.dev869/tests/test_skill_invocation_artifacts.py +188 -0
  291. benchflow-0.5.1.dev869/tests/test_skill_policy.py +209 -0
  292. benchflow-0.5.1.dev869/tests/test_skills_dir_agent_home_link.py +146 -0
  293. benchflow-0.5.1.dev869/tests/test_skillsbench_harbor_parity.py +180 -0
  294. benchflow-0.5.1.dev869/tests/test_skillsbench_harbor_run_suite.py +87 -0
  295. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_smoke.py +1 -1
  296. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_subscription_auth.py +162 -0
  297. benchflow-0.5.1.dev869/tests/test_task_check_eval_consistency.py +126 -0
  298. benchflow-0.5.1.dev869/tests/test_task_config.py +49 -0
  299. benchflow-0.5.1.dev869/tests/test_task_download.py +660 -0
  300. benchflow-0.5.1.dev869/tests/test_tasks.py +298 -0
  301. benchflow-0.5.1.dev869/tests/test_token_usage_normalization.py +158 -0
  302. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_trace_import_cli.py +49 -0
  303. benchflow-0.5.1.dev869/tests/test_trace_task_gen_traversal.py +123 -0
  304. benchflow-0.5.1.dev869/tests/test_trace_to_task_evidence.py +164 -0
  305. benchflow-0.5.1.dev869/tests/test_traces_huggingface.py +130 -0
  306. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_traces_parsers.py +5 -1
  307. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_traces_task_gen.py +226 -7
  308. benchflow-0.5.1.dev869/tests/test_train_mode_artifact_emission.py +328 -0
  309. benchflow-0.5.1.dev869/tests/test_trajectory_streaming.py +450 -0
  310. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_trial_agent_timeout_verify.py +25 -10
  311. benchflow-0.5.1.dev869/tests/test_trial_install_agent_timeout.py +182 -0
  312. benchflow-0.5.1.dev869/tests/test_trial_litellm_runtime.py +115 -0
  313. benchflow-0.5.1.dev869/tests/test_usage_litellm.py +315 -0
  314. benchflow-0.5.1.dev869/tests/test_usage_required.py +70 -0
  315. benchflow-0.5.1.dev869/tests/test_usage_tracking.py +231 -0
  316. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_user.py +56 -3
  317. benchflow-0.5.1.dev869/tests/test_verifier_multi_container.py +313 -0
  318. benchflow-0.5.1.dev869/tests/test_verifier_output.py +214 -0
  319. benchflow-0.5.1.dev869/tests/test_verifier_output_freshness.py +114 -0
  320. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_verify.py +374 -45
  321. benchflow-0.5.1.dev869/tests/test_workflow_action_pinning.py +121 -0
  322. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_yaml_config.py +289 -2
  323. benchflow-0.5.1.dev869/tests/trajectories/__init__.py +0 -0
  324. benchflow-0.5.1.dev869/tests/trajectories/test_export.py +157 -0
  325. benchflow-0.5.1.dev869/tests/trajectories/test_export_nan_handling.py +101 -0
  326. benchflow-0.5.1.dev869/tests/trajectories/test_redaction.py +357 -0
  327. benchflow-0.5.1.dev869/tests/trajectories/test_step_granularity.py +201 -0
  328. benchflow-0.5.1.dev869/tests/trajectories/test_tree.py +159 -0
  329. benchflow-0.4.0/src/benchflow/_utils/benchmark_repos.py +0 -165
  330. benchflow-0.4.0/src/benchflow/_utils/config.py +0 -22
  331. benchflow-0.4.0/src/benchflow/_utils/scoring.py +0 -56
  332. benchflow-0.4.0/src/benchflow/_utils/task_authoring.py +0 -154
  333. benchflow-0.4.0/src/benchflow/acp/runtime.py +0 -358
  334. benchflow-0.4.0/src/benchflow/acp/types.py +0 -321
  335. benchflow-0.4.0/src/benchflow/adapters/__init__.py +0 -25
  336. benchflow-0.4.0/src/benchflow/evaluation.py +0 -680
  337. benchflow-0.4.0/src/benchflow/hosted_env.py +0 -408
  338. benchflow-0.4.0/src/benchflow/metrics.py +0 -240
  339. benchflow-0.4.0/src/benchflow/providers/__init__.py +0 -25
  340. benchflow-0.4.0/src/benchflow/providers/bedrock_proxy.py +0 -534
  341. benchflow-0.4.0/src/benchflow/providers/bedrock_runtime.py +0 -665
  342. benchflow-0.4.0/src/benchflow/providers/runtime.py +0 -172
  343. benchflow-0.4.0/src/benchflow/rewards/events.py +0 -26
  344. benchflow-0.4.0/src/benchflow/rewards/llm.py +0 -182
  345. benchflow-0.4.0/src/benchflow/rewards/protocol.py +0 -33
  346. benchflow-0.4.0/src/benchflow/rewards/rubric_config.py +0 -127
  347. benchflow-0.4.0/src/benchflow/rollout.py +0 -1845
  348. benchflow-0.4.0/src/benchflow/sandbox/_base.py +0 -189
  349. benchflow-0.4.0/src/benchflow/sandbox/_compose.py +0 -9
  350. benchflow-0.4.0/src/benchflow/sandbox/docker.py +0 -465
  351. benchflow-0.4.0/src/benchflow/sandbox/protocol.py +0 -74
  352. benchflow-0.4.0/src/benchflow/sandbox/snapshot.py +0 -85
  353. benchflow-0.4.0/src/benchflow/sandbox/user.py +0 -101
  354. benchflow-0.4.0/src/benchflow/scenes.py +0 -315
  355. benchflow-0.4.0/src/benchflow/task/verifier.py +0 -166
  356. benchflow-0.4.0/src/benchflow/templates/test.sh.tmpl +0 -12
  357. benchflow-0.4.0/src/benchflow/trajectories/__init__.py +0 -37
  358. benchflow-0.4.0/src/benchflow/trajectories/_capture.py +0 -113
  359. benchflow-0.4.0/src/benchflow/trajectories/proxy.py +0 -425
  360. benchflow-0.4.0/src/benchflow/trajectories/types.py +0 -107
  361. benchflow-0.4.0/tests/conformance/proof_multi_agent.py +0 -167
  362. benchflow-0.4.0/tests/integration/check_results.py +0 -199
  363. benchflow-0.4.0/tests/integration/run_suite.py +0 -439
  364. benchflow-0.4.0/tests/integration/suites/release.yaml +0 -262
  365. benchflow-0.4.0/tests/test_acp.py +0 -551
  366. benchflow-0.4.0/tests/test_adapter_scripts.py +0 -33
  367. benchflow-0.4.0/tests/test_agent_registry.py +0 -104
  368. benchflow-0.4.0/tests/test_agent_spec.py +0 -81
  369. benchflow-0.4.0/tests/test_atif_trajectory.py +0 -299
  370. benchflow-0.4.0/tests/test_bedrock_proxy.py +0 -375
  371. benchflow-0.4.0/tests/test_bedrock_runtime.py +0 -405
  372. benchflow-0.4.0/tests/test_docs_examples.py +0 -58
  373. benchflow-0.4.0/tests/test_integration_check_results.py +0 -110
  374. benchflow-0.4.0/tests/test_integration_run_suite.py +0 -261
  375. benchflow-0.4.0/tests/test_job.py +0 -298
  376. benchflow-0.4.0/tests/test_llm_judge.py +0 -488
  377. benchflow-0.4.0/tests/test_oracle_chokepoint.py +0 -469
  378. benchflow-0.4.0/tests/test_process.py +0 -296
  379. benchflow-0.4.0/tests/test_provider_runtime.py +0 -224
  380. benchflow-0.4.0/tests/test_resolve_env_helpers.py +0 -481
  381. benchflow-0.4.0/tests/test_rollout_upload.py +0 -70
  382. benchflow-0.4.0/tests/test_sandbox.py +0 -97
  383. benchflow-0.4.0/tests/test_scene.py +0 -198
  384. benchflow-0.4.0/tests/test_scene_outbox_trial.py +0 -503
  385. benchflow-0.4.0/tests/test_scoring.py +0 -102
  386. benchflow-0.4.0/tests/test_task_download.py +0 -186
  387. benchflow-0.4.0/tests/test_tasks.py +0 -159
  388. benchflow-0.4.0/tests/test_trial_bedrock_proxy.py +0 -129
  389. benchflow-0.4.0/tests/test_trial_install_agent_timeout.py +0 -129
  390. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/LICENSE +0 -0
  391. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_dotenv.py +0 -0
  392. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/_utils/__init__.py +0 -0
  393. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/__init__.py +0 -0
  394. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/acp/container_transport.py +0 -0
  395. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/adapters/inspect_ai.py +0 -0
  396. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/__init__.py +0 -0
  397. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/harvey_lab_acp_shim.py +0 -0
  398. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/openclaw_acp_shim.py +0 -0
  399. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/agents/pi_acp_launcher.py +0 -0
  400. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/cli/__init__.py +0 -0
  401. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/environment/Dockerfile +0 -0
  402. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/instruction.md +0 -0
  403. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/task.toml +0 -0
  404. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/demo_task/tests/test.sh +0 -0
  405. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/__init__.py +0 -0
  406. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/__init__.py +0 -0
  407. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/hooks.py +0 -0
  408. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/experimental/mcp/reviewer_server.py +0 -0
  409. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/py.typed +0 -0
  410. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/README.md +0 -0
  411. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/rewards/rubric.py +0 -0
  412. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-build.yaml +0 -0
  413. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-no-network.yaml +0 -0
  414. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_compose_files/docker-compose-prebuilt.yaml +0 -0
  415. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/_sdk_ops.py +0 -0
  416. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/sandbox/services.py +0 -0
  417. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/skills.py +0 -0
  418. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/env.py +0 -0
  419. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/paths.py +0 -0
  420. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/task/task.py +0 -0
  421. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/__init__.py +0 -0
  422. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/templates/judge.py.tmpl +0 -0
  423. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/__init__.py +0 -0
  424. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/local.py +0 -0
  425. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/traces/models.py +0 -0
  426. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/otel.py +0 -0
  427. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/src/benchflow/trajectories/viewer.py +0 -0
  428. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/__init__.py +0 -0
  429. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/README.md +0 -0
  430. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/environment/Dockerfile +0 -0
  431. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/instruction.md +0 -0
  432. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/solution/solve.sh +0 -0
  433. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/acp_smoke/tests/test.sh +0 -0
  434. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/conformance-results.json +0 -0
  435. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/proof_snapshot.py +0 -0
  436. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conformance/run_conformance.py +0 -0
  437. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/conftest.py +0 -0
  438. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/environment/Dockerfile +0 -0
  439. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/instruction.md +0 -0
  440. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/solution/solve.sh +0 -0
  441. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/task.toml +0 -0
  442. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/hello-world-task/tests/test.sh +0 -0
  443. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_claude.sh +0 -0
  444. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_codex.sh +0 -0
  445. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_codex_custom_provider.sh +0 -0
  446. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_gemini.sh +0 -0
  447. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/examples/test_openclaw.sh +0 -0
  448. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/fixtures/mock_openai_responses_server.py +0 -0
  449. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_agent_model_decouple.py +0 -0
  450. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_exclude_tasks.py +0 -0
  451. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_harvey_lab_shim.py +0 -0
  452. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_mock_openai_responses_server.py +0 -0
  453. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_notification_order_real.py +0 -0
  454. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_oracle.py +0 -0
  455. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_sandbox_protocol.py +0 -0
  456. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_skill_eval_integration.py +0 -0
  457. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_skills.py +0 -0
  458. {benchflow-0.4.0 → benchflow-0.5.1.dev869}/tests/test_trajectory_integration.py +0 -0
@@ -185,3 +185,4 @@ tests/.smoke-jobs/
185
185
  context/
186
186
  tutorials/
187
187
  .playwright-mcp/
188
+ /.claude/handoffs
@@ -2,6 +2,16 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ### Added
6
+
7
+ - **Daytona usage telemetry by default** — Daytona runs now start a sandbox-local provider usage proxy so token/cost telemetry works without an external tunnel; use `--usage-tracking off` to bypass proxying when needed.
8
+ - **Azure AI Foundry providers** — new `azure-foundry-openai/` and `azure-foundry-anthropic/` prefixes routing through Foundry's unified resource. Export `AZURE_API_KEY` plus `AZURE_API_ENDPOINT` (e.g. `https://<resource>.openai.azure.com/`); benchflow derives the resource name from the endpoint host, builds the per-surface base URL, and maps the key onto the agent-native auth env automatically. Missing/unrecognized endpoints and unsupported agent/provider protocol pairings fail fast with clear errors instead of falling through to the wrong endpoint.
9
+ - **Azure Foundry auth guidance** — agent discovery output and docs now call out that provider-prefixed models can use provider-specific credentials instead of the agent's native/default API key.
10
+
11
+ ### Fixed
12
+
13
+ - Inherit `BENCHFLOW_PROVIDER_BASE_URL` / `BENCHFLOW_PROVIDER_API_KEY` from the host environment so self-hosted / OpenAI-compatible endpoints route correctly instead of falling back to `api.openai.com`; empty or whitespace-only host values are skipped so they cannot shadow the resolved provider URL (benchflow-ai/skillsbench#817).
14
+
5
15
  ## 0.3.3 — 2026-05-15
6
16
 
7
17
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchflow
3
- Version: 0.4.0
3
+ Version: 0.5.1.dev869
4
4
  Summary: Multi-turn agent benchmarking with ACP — run any agent, any model, any provider.
5
5
  Project-URL: Homepage, https://github.com/benchflow-ai/benchflow
6
6
  Project-URL: Repository, https://github.com/benchflow-ai/benchflow
@@ -18,22 +18,30 @@ Classifier: Programming Language :: Python :: 3
18
18
  Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Programming Language :: Python :: 3.13
20
20
  Requires-Python: >=3.12
21
+ Requires-Dist: agent-client-protocol>=0.10
21
22
  Requires-Dist: anyio>=4.0
22
23
  Requires-Dist: httpx>=0.27.0
23
- Requires-Dist: pydantic>=2.0
24
+ Requires-Dist: litellm[proxy]==1.88.0rc1
25
+ Requires-Dist: pydantic>=2.7
24
26
  Requires-Dist: pyyaml>=6.0
25
27
  Requires-Dist: rich>=13.0
28
+ Requires-Dist: tomli-w>=1.0
26
29
  Requires-Dist: typer>=0.9
27
30
  Provides-Extra: bedrock
28
31
  Requires-Dist: boto3>=1.40; extra == 'bedrock'
29
32
  Provides-Extra: dev
33
+ Requires-Dist: packaging>=24; extra == 'dev'
30
34
  Requires-Dist: pre-commit>=3.7; extra == 'dev'
31
35
  Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
32
36
  Requires-Dist: pytest>=9.0.3; extra == 'dev'
33
37
  Requires-Dist: ruff>=0.7.0; extra == 'dev'
34
38
  Requires-Dist: ty>=0.0.1a1; extra == 'dev'
39
+ Provides-Extra: judge
40
+ Requires-Dist: anthropic>=0.40; extra == 'judge'
41
+ Requires-Dist: google-genai>=1.0; extra == 'judge'
42
+ Requires-Dist: openai>=1.40; extra == 'judge'
35
43
  Provides-Extra: sandbox-daytona
36
- Requires-Dist: daytona>=0.153.0; extra == 'sandbox-daytona'
44
+ Requires-Dist: daytona>=0.184.0; extra == 'sandbox-daytona'
37
45
  Requires-Dist: tenacity>=8.0; extra == 'sandbox-daytona'
38
46
  Provides-Extra: sandbox-modal
39
47
  Requires-Dist: modal>=0.73; extra == 'sandbox-modal'
@@ -66,7 +74,7 @@ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. Sing
66
74
  uv tool install benchflow
67
75
  ```
68
76
 
69
- Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth.
77
+ Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth. Provider-prefixed models may use provider-specific credentials; Azure Foundry models use `AZURE_API_KEY` plus `AZURE_API_ENDPOINT`.
70
78
 
71
79
  ## Documentation
72
80
 
@@ -81,6 +89,7 @@ Start with [Getting started](./docs/getting-started.md), then [Concepts](./docs/
81
89
  | Multi-round single-agent (progressive disclosure, oracle access) | [Progressive disclosure](./docs/progressive-disclosure.md) |
82
90
  | Skill evaluation (when the artifact is a skill, not a workspace) | [Skill eval](./docs/skill-eval.md) |
83
91
  | Understand the security model | [Sandbox hardening](./docs/sandbox-hardening.md) |
92
+ | Use public vs internal preview SDK releases | [Release channels](./docs/release.md) |
84
93
  | CLI flags + commands | [CLI reference](./docs/reference/cli.md) |
85
94
  | Python API surface | [Python API reference](./docs/reference/python-api.md) |
86
95
 
@@ -91,20 +100,20 @@ Notebooks and runnable example scripts live under [`docs/examples/`](./docs/exam
91
100
  Benchmark datasets live in external Git repos and are referenced with two fields:
92
101
 
93
102
  ```yaml
94
- # benchmarks/skillsbench-claude-glm51.yaml
103
+ # benchmarks/harvey-lab/harvey-lab-gemini-flash-lite.yaml
95
104
  source:
96
- repo: benchflow-ai/skillsbench # GitHub org/repo
97
- path: tasks # optional subpath within repo
105
+ repo: benchflow-ai/benchmarks # GitHub org/repo
106
+ path: datasets/harvey-lab/tasks # optional subpath within repo
98
107
  ref: main # optional branch/tag
99
- agent: claude-agent-acp
100
- model: claude-sonnet-4-6
108
+ agent: gemini
109
+ model: gemini/gemini-3.1-flash-lite-preview
101
110
  ```
102
111
 
103
112
  Run any benchmark via the CLI:
104
113
 
105
114
  ```bash
106
- # From a YAML config
107
- bench eval create --config benchmarks/skillsbench-claude-glm51.yaml
115
+ # From a YAML config (shipped with the repo)
116
+ bench eval create --config benchmarks/harvey-lab/harvey-lab-gemini-flash-lite.yaml
108
117
 
109
118
  # Inline — mirrors the YAML source fields
110
119
  bench eval create \
@@ -114,10 +123,9 @@ bench eval create \
114
123
 
115
124
  Repos are cloned and cached locally under `.cache/datasets/` on first use.
116
125
 
117
- SkillsBench itself sources BenchFlow from GitHub `main` in its
118
- [`pyproject.toml`](https://github.com/benchflow-ai/skillsbench/blob/main/pyproject.toml).
119
- After a BenchFlow change lands, run `uv lock --upgrade-package benchflow` in
120
- SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
126
+ Downstream projects should depend on the public PyPI release by default. For
127
+ internal validation before the next public release, install or lock the internal
128
+ preview channel with prereleases enabled; see [Release channels](./docs/release.md).
121
129
 
122
130
  ## Featured
123
131
 
@@ -141,7 +149,9 @@ Two runnable labs validate the security story:
141
149
 
142
150
  PRs welcome. Open against `main`. CI runs ruff + tests on every PR; please run `ruff check .` and `pytest tests/` locally first.
143
151
 
144
- For a release: bump `pyproject.toml` to the next stable version, tag `v<version>` on main, push the tag — CI publishes to PyPI. Then bump main to the next `.dev0`.
152
+ Release channels are documented in [Release channels](./docs/release.md). In
153
+ short: merges to `main` publish an internal preview after CI passes, while a
154
+ matching `v<version>` tag publishes the public release.
145
155
 
146
156
  ## License
147
157
 
@@ -24,7 +24,7 @@ BenchFlow runs AI agents against benchmark tasks in sandboxed environments. Sing
24
24
  uv tool install benchflow
25
25
  ```
26
26
 
27
- Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth.
27
+ Requires Python 3.12+ and [uv](https://docs.astral.sh/uv/). Set `DAYTONA_API_KEY` for Daytona runs or configure Modal auth for Modal runs; export the relevant agent API key (`GEMINI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) or run `claude login` / `codex --login` for subscription auth. Provider-prefixed models may use provider-specific credentials; Azure Foundry models use `AZURE_API_KEY` plus `AZURE_API_ENDPOINT`.
28
28
 
29
29
  ## Documentation
30
30
 
@@ -39,6 +39,7 @@ Start with [Getting started](./docs/getting-started.md), then [Concepts](./docs/
39
39
  | Multi-round single-agent (progressive disclosure, oracle access) | [Progressive disclosure](./docs/progressive-disclosure.md) |
40
40
  | Skill evaluation (when the artifact is a skill, not a workspace) | [Skill eval](./docs/skill-eval.md) |
41
41
  | Understand the security model | [Sandbox hardening](./docs/sandbox-hardening.md) |
42
+ | Use public vs internal preview SDK releases | [Release channels](./docs/release.md) |
42
43
  | CLI flags + commands | [CLI reference](./docs/reference/cli.md) |
43
44
  | Python API surface | [Python API reference](./docs/reference/python-api.md) |
44
45
 
@@ -49,20 +50,20 @@ Notebooks and runnable example scripts live under [`docs/examples/`](./docs/exam
49
50
  Benchmark datasets live in external Git repos and are referenced with two fields:
50
51
 
51
52
  ```yaml
52
- # benchmarks/skillsbench-claude-glm51.yaml
53
+ # benchmarks/harvey-lab/harvey-lab-gemini-flash-lite.yaml
53
54
  source:
54
- repo: benchflow-ai/skillsbench # GitHub org/repo
55
- path: tasks # optional subpath within repo
55
+ repo: benchflow-ai/benchmarks # GitHub org/repo
56
+ path: datasets/harvey-lab/tasks # optional subpath within repo
56
57
  ref: main # optional branch/tag
57
- agent: claude-agent-acp
58
- model: claude-sonnet-4-6
58
+ agent: gemini
59
+ model: gemini/gemini-3.1-flash-lite-preview
59
60
  ```
60
61
 
61
62
  Run any benchmark via the CLI:
62
63
 
63
64
  ```bash
64
- # From a YAML config
65
- bench eval create --config benchmarks/skillsbench-claude-glm51.yaml
65
+ # From a YAML config (shipped with the repo)
66
+ bench eval create --config benchmarks/harvey-lab/harvey-lab-gemini-flash-lite.yaml
66
67
 
67
68
  # Inline — mirrors the YAML source fields
68
69
  bench eval create \
@@ -72,10 +73,9 @@ bench eval create \
72
73
 
73
74
  Repos are cloned and cached locally under `.cache/datasets/` on first use.
74
75
 
75
- SkillsBench itself sources BenchFlow from GitHub `main` in its
76
- [`pyproject.toml`](https://github.com/benchflow-ai/skillsbench/blob/main/pyproject.toml).
77
- After a BenchFlow change lands, run `uv lock --upgrade-package benchflow` in
78
- SkillsBench when you need its lockfile to point at the newest BenchFlow commit.
76
+ Downstream projects should depend on the public PyPI release by default. For
77
+ internal validation before the next public release, install or lock the internal
78
+ preview channel with prereleases enabled; see [Release channels](./docs/release.md).
79
79
 
80
80
  ## Featured
81
81
 
@@ -99,7 +99,9 @@ Two runnable labs validate the security story:
99
99
 
100
100
  PRs welcome. Open against `main`. CI runs ruff + tests on every PR; please run `ruff check .` and `pytest tests/` locally first.
101
101
 
102
- For a release: bump `pyproject.toml` to the next stable version, tag `v<version>` on main, push the tag — CI publishes to PyPI. Then bump main to the next `.dev0`.
102
+ Release channels are documented in [Release channels](./docs/release.md). In
103
+ short: merges to `main` publish an internal preview after CI passes, while a
104
+ matching `v<version>` tag publishes the public release.
103
105
 
104
106
  ## License
105
107
 
@@ -1,16 +1,19 @@
1
1
  [project]
2
2
  name = "benchflow"
3
- version = "0.4.0"
3
+ version = "0.5.1.dev869"
4
4
  description = "Multi-turn agent benchmarking with ACP — run any agent, any model, any provider."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
7
7
  keywords = ["benchmark", "llm-agents", "acp", "agent-evaluation", "multi-turn", "skillsbench"]
8
8
  dependencies = [
9
+ "agent-client-protocol>=0.10",
9
10
  "httpx>=0.27.0",
10
11
  "anyio>=4.0",
11
- "pydantic>=2.0",
12
+ "pydantic>=2.7",
12
13
  "pyyaml>=6.0",
13
14
  "rich>=13.0",
15
+ "litellm[proxy]==1.88.0rc1",
16
+ "tomli-w>=1.0",
14
17
  "typer>=0.9",
15
18
  ]
16
19
  authors = [
@@ -35,6 +38,7 @@ classifiers = [
35
38
 
36
39
  [project.optional-dependencies]
37
40
  dev = [
41
+ "packaging>=24",
38
42
  "pre-commit>=3.7",
39
43
  "pytest>=9.0.3",
40
44
  "pytest-asyncio>=0.24.0",
@@ -42,7 +46,12 @@ dev = [
42
46
  "ty>=0.0.1a1",
43
47
  ]
44
48
  sandbox-daytona = [
45
- "daytona>=0.153.0",
49
+ # >=0.183: list() returns an auto-paginating Iterator[Sandbox] (the older
50
+ # paged list(page=, limit=) -> .items API was removed).
51
+ # >=0.184: the top-level sync `Daytona` export is present (0.176-0.183 only
52
+ # shipped `AsyncDaytona`); the dashboard's daytona_status.snapshot() uses the
53
+ # sync client, so this floor is required for that panel to import.
54
+ "daytona>=0.184.0",
46
55
  "tenacity>=8.0",
47
56
  ]
48
57
  sandbox-modal = [
@@ -52,6 +61,13 @@ sandbox-modal = [
52
61
  bedrock = [
53
62
  "boto3>=1.40",
54
63
  ]
64
+ # Provider SDKs for the llm-judge verifier (type = "llm-judge").
65
+ # llm.py routes judge calls across all three; install at least one.
66
+ judge = [
67
+ "anthropic>=0.40",
68
+ "openai>=1.40",
69
+ "google-genai>=1.0",
70
+ ]
55
71
 
56
72
  [project.scripts]
57
73
  benchflow = "benchflow.cli.main:app"
@@ -90,7 +106,13 @@ markers = [
90
106
 
91
107
  [tool.ruff]
92
108
  target-version = "py312"
93
- extend-exclude = [".claude/skills/skill-creator"]
109
+ # Vendored third-party service packages baked into task images (e.g. the
110
+ # smolclaws claw-* sources copied under a ClawsBench task's environment/) are
111
+ # not BenchFlow code — do not lint them.
112
+ extend-exclude = [
113
+ ".claude/skills/skill-creator",
114
+ "benchmarks/**/tasks/**/environment/claw-*",
115
+ ]
94
116
 
95
117
  [tool.ruff.lint]
96
118
  select = [
@@ -127,7 +149,7 @@ python-version = "3.12"
127
149
  unresolved-import = "ignore"
128
150
 
129
151
  [tool.ty.src]
130
- include = ["src"]
152
+ include = ["src", "tools"]
131
153
  # Modules that heavily use optional-dep types (daytona, modal, openai, boto3, …)
132
154
  # produce cascading type errors when those packages aren't installed.
133
155
  exclude = [
@@ -139,6 +161,5 @@ exclude = [
139
161
  "src/benchflow/rewards/llm.py",
140
162
  "src/benchflow/rewards/file_readers.py",
141
163
  "src/benchflow/rewards/rubric_config.py",
142
- "src/benchflow/providers/bedrock_runtime.py",
143
164
  "src/benchflow/experimental/mcp/reviewer_server.py",
144
165
  ]
@@ -3,16 +3,20 @@
3
3
  Public API surface:
4
4
  - Sandbox protocol for isolated execution environments
5
5
  - ACP client for multi-turn agent communication
6
- - Trajectory capture (HTTP proxy, OTel collector, ACP native)
6
+ - Trajectory capture (LiteLLM callbacks, OTel collector, ACP native)
7
7
  - Rollout lifecycle for single-task execution
8
8
  - Evaluation orchestration with retries and concurrency
9
9
  - Rewards protocol (composable Rubric + RewardFunc)
10
10
  - Metrics collection and aggregation
11
11
  """
12
12
 
13
+ from importlib.metadata import PackageNotFoundError
13
14
  from importlib.metadata import version as _version
14
15
 
15
- __version__ = _version("benchflow")
16
+ try:
17
+ __version__ = _version("benchflow")
18
+ except PackageNotFoundError:
19
+ __version__ = "0+unknown"
16
20
 
17
21
  # Core types
18
22
  from benchflow._types import Role, Scene, Turn
@@ -33,6 +37,12 @@ from benchflow.agents.registry import (
33
37
  list_agents,
34
38
  register_agent,
35
39
  )
40
+ from benchflow.contracts.user import (
41
+ BaseUser,
42
+ FunctionUser,
43
+ PassthroughUser,
44
+ RoundResult,
45
+ )
36
46
  from benchflow.evaluation import (
37
47
  Evaluation,
38
48
  EvaluationConfig,
@@ -41,13 +51,23 @@ from benchflow.evaluation import (
41
51
  )
42
52
  from benchflow.metrics import BenchmarkMetrics, collect_metrics
43
53
  from benchflow.models import AgentInstallError, AgentTimeoutError, RolloutResult
54
+ from benchflow.monitor import (
55
+ Monitor,
56
+ MonitorConfig,
57
+ MonitorNotImplementedError,
58
+ MonitorResult,
59
+ )
44
60
 
45
- # Rewards protocol (v0.4 composable Rubric + RewardFunc)
61
+ # Rewards plane. Reward is the canonical node-based contract
62
+ # (``score(node) -> VerifyResult``); RewardFunc is the legacy path-based shape
63
+ # (``score(rollout_dir) -> float``) adapted into Reward via PathReward.
46
64
  from benchflow.rewards import (
47
65
  CodeExecRewardFunc,
48
66
  Criterion,
49
67
  JudgeConfig,
50
68
  LLMJudgeRewardFunc,
69
+ PathReward,
70
+ Reward,
51
71
  RewardEvent,
52
72
  RewardFunc,
53
73
  Rubric,
@@ -56,6 +76,8 @@ from benchflow.rewards import (
56
76
  StringMatchRewardFunc,
57
77
  TestRewardFunc,
58
78
  VerifyResult,
79
+ load_rubric,
80
+ load_rubric_json,
59
81
  load_rubric_toml,
60
82
  )
61
83
  from benchflow.rollout import Rollout, RolloutConfig
@@ -73,6 +95,8 @@ from benchflow.sandbox import (
73
95
  ImageConfig,
74
96
  ImageRef,
75
97
  Sandbox,
98
+ SandboxImage,
99
+ SandboxSnapshotNotSupported,
76
100
  build_service_hooks,
77
101
  detect_services_from_dockerfile,
78
102
  register_service,
@@ -82,10 +106,15 @@ from benchflow.sandbox import (
82
106
  from benchflow.sandbox import ExecResult as SandboxExecResult
83
107
  from benchflow.sandbox.protocol import ExecResult
84
108
  from benchflow.sandbox.setup import stage_dockerfile_deps
85
- from benchflow.sandbox.snapshot import list_snapshots, restore, snapshot
86
- from benchflow.sandbox.user import BaseUser, FunctionUser, PassthroughUser, RoundResult
87
- from benchflow.scenes import MailboxTransport, Message, MessageTransport, SceneRole
88
- from benchflow.scenes import Scene as SceneRuntime
109
+ from benchflow.sandbox.snapshot import (
110
+ list_snapshots,
111
+ list_workspace_snapshots,
112
+ restore,
113
+ snapshot,
114
+ workspace_restore,
115
+ workspace_snapshot,
116
+ )
117
+ from benchflow.scenes import compile_scenes_to_steps
89
118
  from benchflow.sdk import SDK
90
119
  from benchflow.skills import SkillInfo, discover_skills, install_skill, parse_skill
91
120
  from benchflow.task import (
@@ -95,17 +124,18 @@ from benchflow.task import (
95
124
  VerifierResult,
96
125
  )
97
126
  from benchflow.trajectories.otel import OTelCollector
98
- from benchflow.trajectories.proxy import TrajectoryProxy
99
127
  from benchflow.trajectories.types import Trajectory
100
128
 
101
129
  # Public API surface. Anything not in this list is implementation detail and
102
130
  # may change without notice.
103
131
  __all__ = [
104
132
  "__version__",
105
- # Rewards protocol (v0.4)
133
+ # Rewards plane
134
+ "Reward",
106
135
  "Rubric",
107
136
  "RewardFunc",
108
137
  "RewardEvent",
138
+ "PathReward",
109
139
  "VerifyResult",
110
140
  "TestRewardFunc",
111
141
  "LLMJudgeRewardFunc",
@@ -115,10 +145,14 @@ __all__ = [
115
145
  "JudgeConfig",
116
146
  "RubricConfig",
117
147
  "ScoringConfig",
148
+ "load_rubric",
149
+ "load_rubric_json",
118
150
  "load_rubric_toml",
119
151
  # Sandbox protocol
120
152
  "Sandbox",
121
153
  "SandboxExecResult",
154
+ "SandboxImage",
155
+ "SandboxSnapshotNotSupported",
122
156
  "ImageBuilder",
123
157
  "ImageConfig",
124
158
  "ImageRef",
@@ -149,6 +183,11 @@ __all__ = [
149
183
  "AgentInstallError",
150
184
  "AgentTimeoutError",
151
185
  "RolloutResult",
186
+ # Monitor mode — scaffolded API surface (#386)
187
+ "Monitor",
188
+ "MonitorConfig",
189
+ "MonitorResult",
190
+ "MonitorNotImplementedError",
152
191
  # Runtime
153
192
  "Agent",
154
193
  "Environment",
@@ -161,13 +200,13 @@ __all__ = [
161
200
  "Role",
162
201
  "Scene",
163
202
  "Turn",
164
- # Multi-agent scene runtime
165
- "SceneRole",
166
- "SceneRuntime",
167
- "Message",
168
- "MessageTransport",
169
- "MailboxTransport",
170
- # Env snapshots
203
+ # Scene authoring desugaring
204
+ "compile_scenes_to_steps",
205
+ # Workspace snapshots (filesystem helper — NOT the Sandbox primitive, #384)
206
+ "workspace_snapshot",
207
+ "workspace_restore",
208
+ "list_workspace_snapshots",
209
+ # Backward-compatible aliases for the above (pre-#384 names)
171
210
  "snapshot",
172
211
  "restore",
173
212
  "list_snapshots",
@@ -195,7 +234,6 @@ __all__ = [
195
234
  "parse_skill",
196
235
  # Trajectories
197
236
  "OTelCollector",
198
- "TrajectoryProxy",
199
237
  "Trajectory",
200
238
  # External adapters
201
239
  "InspectAdapter",
@@ -0,0 +1,218 @@
1
+ """Path safety helpers — reject unsafe inputs and refuse to follow symlinks.
2
+
3
+ Two independent helper sets live here:
4
+
5
+ 1. **Segment validation** (``safe_path_segment``, ``assert_within``):
6
+ Reject user-controlled strings (case ids, skill names) that would traverse
7
+ outside the intended tree.
8
+
9
+ 2. **Symlink defense** (``is_safe_regular_file``, ``iter_safe_tree``, etc.):
10
+ Walk directories we do not own without following symlinks, so an
11
+ attacker-placed link cannot pull host files into dashboard payloads,
12
+ judge prompts, or sandbox uploads.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import os
19
+ import stat
20
+ from collections.abc import Iterator
21
+ from pathlib import Path
22
+
23
+ __all__ = [
24
+ "safe_path_segment",
25
+ "assert_within",
26
+ "is_safe_regular_file",
27
+ "is_safe_regular_dir",
28
+ "iter_safe_children",
29
+ "iter_safe_tree",
30
+ "ignore_symlinks",
31
+ ]
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ # ── Segment validation ───────────────────────────────────────────────
37
+
38
+
39
+ def safe_path_segment(name: str, *, kind: str = "name") -> str:
40
+ """Return ``name`` unchanged if safe as a single path segment.
41
+
42
+ Raises :class:`ValueError` for inputs that cannot be used as a directory
43
+ or file name without risking path traversal or shell ambiguity.
44
+
45
+ Rejected forms:
46
+
47
+ * empty string
48
+ * ``.`` or ``..`` (current/parent directory references)
49
+ * any string containing ``/`` or ``\\`` (multi-segment paths)
50
+ * any string containing a NUL byte
51
+ * leading or trailing whitespace
52
+ * leading ``-`` (would be interpreted as a CLI flag by downstream tools)
53
+
54
+ All other Unicode is accepted; this is a security boundary, not a
55
+ cosmetic slugifier. Callers that want forgiving behaviour should slugify
56
+ *before* calling this function.
57
+
58
+ Args:
59
+ name: The candidate path segment.
60
+ kind: A human label used in the error message (e.g. ``"case id"``,
61
+ ``"skill name"``).
62
+
63
+ Returns:
64
+ The input ``name`` unchanged.
65
+
66
+ Raises:
67
+ ValueError: If ``name`` is not safe as a single path segment.
68
+ """
69
+ if not isinstance(name, str):
70
+ raise ValueError(f"{kind} must be a string, got {type(name).__name__}")
71
+ if name == "":
72
+ raise ValueError(f"{kind} must not be empty")
73
+ if name in (".", ".."):
74
+ raise ValueError(f"{kind} must not be '.' or '..' (got {name!r})")
75
+ if "/" in name or "\\" in name:
76
+ raise ValueError(f"{kind} must not contain path separators (got {name!r})")
77
+ if "\x00" in name:
78
+ raise ValueError(f"{kind} must not contain NUL bytes (got {name!r})")
79
+ if name != name.strip():
80
+ raise ValueError(
81
+ f"{kind} must not have leading or trailing whitespace (got {name!r})"
82
+ )
83
+ if name.startswith("-"):
84
+ raise ValueError(
85
+ f"{kind} must not start with '-' (got {name!r}); "
86
+ "would be misread as a CLI flag"
87
+ )
88
+ return name
89
+
90
+
91
+ def assert_within(child: Path, root: Path) -> Path:
92
+ """Resolve both paths and assert ``child`` is under ``root``.
93
+
94
+ Uses :meth:`Path.resolve` so symlinks are followed and ``..`` segments
95
+ collapsed before the containment check. Returns the resolved child.
96
+
97
+ Args:
98
+ child: A path that should be inside ``root``.
99
+ root: The directory ``child`` must not escape.
100
+
101
+ Returns:
102
+ The resolved ``child`` path.
103
+
104
+ Raises:
105
+ ValueError: If the resolved ``child`` is not under the resolved
106
+ ``root``.
107
+ """
108
+ resolved_root = root.resolve()
109
+ resolved_child = child.resolve()
110
+ try:
111
+ resolved_child.relative_to(resolved_root)
112
+ except ValueError as exc:
113
+ raise ValueError(
114
+ f"path {child} resolves to {resolved_child}, "
115
+ f"which is outside {resolved_root}"
116
+ ) from exc
117
+ return resolved_child
118
+
119
+
120
+ # ── Symlink defense ──────────────────────────────────────────────────
121
+
122
+
123
+ def is_safe_regular_file(path: Path) -> bool:
124
+ """True if *path* exists, is a regular file, and is not a symlink.
125
+
126
+ Uses ``os.lstat`` so symlinks, fifos, sockets, and device files all
127
+ return False. A non-existent path also returns False.
128
+ """
129
+ try:
130
+ st = os.lstat(path)
131
+ except OSError:
132
+ return False
133
+ return stat.S_ISREG(st.st_mode) and not stat.S_ISLNK(st.st_mode)
134
+
135
+
136
+ def is_safe_regular_dir(path: Path) -> bool:
137
+ """True if *path* is a directory and not a symlink to one."""
138
+ try:
139
+ st = os.lstat(path)
140
+ except OSError:
141
+ return False
142
+ return stat.S_ISDIR(st.st_mode) and not stat.S_ISLNK(st.st_mode)
143
+
144
+
145
+ def iter_safe_children(
146
+ directory: Path,
147
+ *,
148
+ context: str = "directory walk",
149
+ ) -> Iterator[Path]:
150
+ """Yield direct children of *directory*, skipping symlinks with a warning."""
151
+ try:
152
+ entries = sorted(directory.iterdir())
153
+ except (OSError, NotADirectoryError):
154
+ return
155
+ for child in entries:
156
+ if child.is_symlink():
157
+ logger.warning(
158
+ "%s: skipping symlink %s (refusing to follow)", context, child
159
+ )
160
+ continue
161
+ yield child
162
+
163
+
164
+ def iter_safe_tree(
165
+ root: Path,
166
+ *,
167
+ context: str = "tree walk",
168
+ ) -> Iterator[Path]:
169
+ """Recursively yield regular files under *root*, never following symlinks.
170
+
171
+ Uses ``os.walk(followlinks=False)`` so directory symlinks are also not
172
+ descended into.
173
+ """
174
+ if not is_safe_regular_dir(root):
175
+ if Path(root).is_symlink():
176
+ logger.warning(
177
+ "%s: refusing to descend into symlinked root %s", context, root
178
+ )
179
+ return
180
+ for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
181
+ base = Path(dirpath)
182
+ kept_dirs: list[str] = []
183
+ for name in dirnames:
184
+ child = base / name
185
+ if child.is_symlink():
186
+ logger.warning(
187
+ "%s: skipping symlinked directory %s (refusing to follow)",
188
+ context,
189
+ child,
190
+ )
191
+ continue
192
+ kept_dirs.append(name)
193
+ dirnames[:] = sorted(kept_dirs)
194
+ for name in sorted(filenames):
195
+ f = base / name
196
+ if not is_safe_regular_file(f):
197
+ logger.warning(
198
+ "%s: skipping non-regular path %s (symlink or special file)",
199
+ context,
200
+ f,
201
+ )
202
+ continue
203
+ yield f
204
+
205
+
206
+ def ignore_symlinks(directory: str, contents: list[str]) -> list[str]:
207
+ """``shutil.copytree`` ``ignore=`` callback that drops every symlink."""
208
+ skipped: list[str] = []
209
+ for name in contents:
210
+ if Path(directory, name).is_symlink():
211
+ skipped.append(name)
212
+ if skipped:
213
+ logger.warning(
214
+ "copytree: skipping symlinked entries under %s: %s",
215
+ directory,
216
+ ", ".join(sorted(skipped)),
217
+ )
218
+ return skipped