verifiers 0.1.13.dev7__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (296)
  1. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/.gitignore +1 -1
  2. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/PKG-INFO +77 -2
  3. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/README.md +73 -1
  4. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/pyproject.toml +26 -2
  5. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/conftest.py +8 -13
  6. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_cli_agent_env.py +175 -14
  7. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_composable_env.py +4 -4
  8. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_decorator_ranks.py +43 -4
  9. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_endpoint_registry.py +33 -65
  10. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_env_group.py +51 -52
  11. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_env_server.py +1 -1
  12. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_environment.py +86 -7
  13. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_environment_extra.py +2 -4
  14. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_envs.py +25 -0
  15. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_eval_cli.py +191 -16
  16. verifiers-0.1.14/tests/test_gepa_cli.py +251 -0
  17. verifiers-0.1.14/tests/test_gepa_utils.py +155 -0
  18. verifiers-0.1.14/tests/test_lean_task.py +344 -0
  19. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_math_rubric.py +89 -21
  20. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_multiturn_env.py +81 -2
  21. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_nemorl_client.py +146 -35
  22. verifiers-0.1.14/tests/test_openai_responses_client.py +338 -0
  23. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_opencode_rlm_env.py +7 -9
  24. verifiers-0.1.14/tests/test_per_turn_timing.py +68 -0
  25. verifiers-0.1.14/tests/test_renderer_client.py +600 -0
  26. verifiers-0.1.14/tests/test_renderer_e2e.py +417 -0
  27. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_rlm_composable_env.py +333 -40
  28. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_rlm_env.py +71 -71
  29. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_rubric.py +9 -59
  30. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_rubric_group.py +72 -43
  31. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_save_utils.py +11 -11
  32. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_setup_script.py +2 -2
  33. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_singleturn_env.py +7 -35
  34. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_stateful_tool_env.py +3 -5
  35. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_tool_env.py +4 -8
  36. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_tool_utils.py +31 -1
  37. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_trajectory_processing.py +0 -3
  38. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_tui_info_formatting.py +9 -1
  39. verifiers-0.1.14/tests/test_types.py +11 -0
  40. verifiers-0.1.14/tests/test_v1_bfcl.py +55 -0
  41. verifiers-0.1.14/tests/test_v1_config_extension.py +1599 -0
  42. verifiers-0.1.14/tests/test_v1_endpoint_protocols.py +222 -0
  43. verifiers-0.1.14/tests/test_v1_group_reward_env.py +39 -0
  44. verifiers-0.1.14/tests/test_v1_harbor_cli.py +178 -0
  45. verifiers-0.1.14/tests/test_v1_mini_swe_agent.py +63 -0
  46. verifiers-0.1.14/tests/test_v1_rlm_swe.py +70 -0
  47. verifiers-0.1.14/tests/test_v1_runtime_lifecycle.py +1731 -0
  48. verifiers-0.1.14/tests/test_v1_scoring_functions.py +152 -0
  49. verifiers-0.1.14/tests/test_wordle_env.py +22 -0
  50. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/__init__.py +88 -8
  51. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/__init__.py +29 -0
  52. verifiers-0.1.14/verifiers/clients/nemorl_chat_completions_client.py +117 -0
  53. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/openai_chat_completions_client.py +2 -0
  54. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/openai_chat_completions_token_client.py +17 -2
  55. verifiers-0.1.14/verifiers/clients/openai_responses_client.py +443 -0
  56. verifiers-0.1.14/verifiers/clients/renderer_client.py +603 -0
  57. verifiers-0.1.14/verifiers/decorators.py +296 -0
  58. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/AGENTS.md +2 -1
  59. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/env_group.py +192 -62
  60. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/environment.py +113 -76
  61. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/cli_agent_env.py +33 -38
  62. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/README.md +1 -1
  63. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/composable_env.py +101 -25
  64. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harness.py +32 -14
  65. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harnesses/opencode.py +20 -4
  66. verifiers-0.1.14/verifiers/envs/experimental/composable/harnesses/rlm.py +281 -0
  67. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/task.py +44 -22
  68. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -1
  69. verifiers-0.1.14/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +13 -0
  70. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +138 -27
  71. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -1
  72. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +3 -3
  73. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +3 -3
  74. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +3 -3
  75. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +11 -8
  76. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +2 -2
  77. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +2 -2
  78. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +2 -2
  79. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/harbor_env/env.py +4 -3
  80. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/mcp_env.py +9 -12
  81. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/opencode_env.py +2 -1
  82. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/opencode_rlm_env.py +2 -3
  83. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/rlm_env.py +3 -5
  84. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/sandbox_mixin.py +51 -1
  85. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/utils/git_checkout_cache.py +45 -2
  86. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/browser_env.py +3 -3
  87. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/modes/base.py +2 -2
  88. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +1 -3
  89. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +1 -2
  90. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/openenv_env.py +2 -3
  91. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/reasoninggym_env.py +1 -1
  92. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/textarena_env.py +7 -2
  93. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/multiturn_env.py +54 -12
  94. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/python_env.py +2 -3
  95. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/sandbox_env.py +2 -2
  96. verifiers-0.1.14/verifiers/gepa/gepa_utils.py +322 -0
  97. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/math_rubric.py +2 -1
  98. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/rubric.py +137 -34
  99. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/rubric_group.py +23 -1
  100. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/eval.py +21 -7
  101. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/gepa.py +274 -37
  102. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/setup.py +14 -11
  103. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/tui.py +34 -10
  104. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/types.py +176 -17
  105. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/async_utils.py +18 -0
  106. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/display_utils.py +90 -3
  107. verifiers-0.1.14/verifiers/utils/env_config_utils.py +45 -0
  108. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/env_utils.py +53 -2
  109. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/error_utils.py +33 -0
  110. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/eval_display.py +61 -50
  111. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/eval_utils.py +185 -109
  112. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/interception_utils.py +344 -7
  113. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/logging_utils.py +18 -0
  114. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/response_utils.py +2 -0
  115. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/save_utils.py +33 -13
  116. verifiers-0.1.14/verifiers/v1/README.md +1587 -0
  117. verifiers-0.1.14/verifiers/v1/RE_MIGRATION.md +804 -0
  118. verifiers-0.1.14/verifiers/v1/__init__.py +85 -0
  119. verifiers-0.1.14/verifiers/v1/config.py +455 -0
  120. verifiers-0.1.14/verifiers/v1/env.py +136 -0
  121. verifiers-0.1.14/verifiers/v1/harness.py +598 -0
  122. verifiers-0.1.14/verifiers/v1/packages/__init__.py +1 -0
  123. verifiers-0.1.14/verifiers/v1/packages/harnesses/__init__.py +7 -0
  124. verifiers-0.1.14/verifiers/v1/packages/harnesses/cli.py +121 -0
  125. verifiers-0.1.14/verifiers/v1/packages/harnesses/mini_swe_agent.py +247 -0
  126. verifiers-0.1.14/verifiers/v1/packages/harnesses/opencode.py +273 -0
  127. verifiers-0.1.14/verifiers/v1/packages/harnesses/pi.py +212 -0
  128. verifiers-0.1.14/verifiers/v1/packages/harnesses/rlm.py +264 -0
  129. verifiers-0.1.14/verifiers/v1/packages/tasksets/__init__.py +3 -0
  130. verifiers-0.1.14/verifiers/v1/packages/tasksets/harbor.py +405 -0
  131. verifiers-0.1.14/verifiers/v1/runtime.py +1931 -0
  132. verifiers-0.1.14/verifiers/v1/state.py +401 -0
  133. verifiers-0.1.14/verifiers/v1/task.py +177 -0
  134. verifiers-0.1.14/verifiers/v1/taskset.py +269 -0
  135. verifiers-0.1.14/verifiers/v1/toolset.py +352 -0
  136. verifiers-0.1.14/verifiers/v1/user.py +85 -0
  137. verifiers-0.1.14/verifiers/v1/utils/__init__.py +1 -0
  138. verifiers-0.1.14/verifiers/v1/utils/artifact_utils.py +31 -0
  139. verifiers-0.1.14/verifiers/v1/utils/endpoint_utils.py +669 -0
  140. verifiers-0.1.14/verifiers/v1/utils/json_utils.py +11 -0
  141. verifiers-0.1.14/verifiers/v1/utils/judge_utils.py +63 -0
  142. verifiers-0.1.14/verifiers/v1/utils/lifecycle_utils.py +96 -0
  143. verifiers-0.1.14/verifiers/v1/utils/mcp_proxy_utils.py +233 -0
  144. verifiers-0.1.14/verifiers/v1/utils/mcp_utils.py +148 -0
  145. verifiers-0.1.14/verifiers/v1/utils/program_utils.py +483 -0
  146. verifiers-0.1.14/verifiers/v1/utils/prompt_utils.py +136 -0
  147. verifiers-0.1.14/verifiers/v1/utils/sandbox_program_utils.py +650 -0
  148. verifiers-0.1.14/verifiers/v1/utils/sandbox_utils.py +753 -0
  149. verifiers-0.1.14/verifiers/v1/utils/scoring_utils.py +379 -0
  150. verifiers-0.1.14/verifiers/v1/utils/timing_utils.py +36 -0
  151. verifiers-0.1.14/verifiers/v1/utils/tool_utils.py +19 -0
  152. verifiers-0.1.14/verifiers/v1/utils/trajectory_utils.py +78 -0
  153. verifiers-0.1.13.dev7/tests/test_gepa_cli.py +0 -115
  154. verifiers-0.1.13.dev7/verifiers/clients/nemorl_chat_completions_client.py +0 -87
  155. verifiers-0.1.13.dev7/verifiers/decorators.py +0 -147
  156. verifiers-0.1.13.dev7/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -211
  157. verifiers-0.1.13.dev7/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -3
  158. verifiers-0.1.13.dev7/verifiers/gepa/gepa_utils.py +0 -116
  159. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/LICENSE +0 -0
  160. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/AGENTS.md +0 -0
  161. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/README.md +0 -0
  162. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/__init__.py +0 -0
  163. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_browser_env.py +0 -0
  164. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_build_script.py +0 -0
  165. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_client_auth_errors.py +0 -0
  166. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_client_config.py +0 -0
  167. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_client_multimodal_types.py +0 -0
  168. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_context_token_metrics.py +0 -0
  169. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_error_chain.py +0 -0
  170. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_eval_display.py +0 -0
  171. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_eval_utils.py +0 -0
  172. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_gym_env.py +0 -0
  173. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_harbor_env_mcp.py +0 -0
  174. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_imports.py +0 -0
  175. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_install_utils.py +0 -0
  176. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_interception_utils.py +0 -0
  177. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_logging.py +0 -0
  178. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_maybe_think_parser.py +0 -0
  179. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_message_utils.py +0 -0
  180. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_message_utils_multimodal.py +0 -0
  181. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_openai_chat_completions_token_client.py +0 -0
  182. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_opencode_harbor.py +0 -0
  183. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_parser.py +0 -0
  184. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_path_utils.py +0 -0
  185. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_prime_plugin.py +0 -0
  186. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_sandbox_env.py +0 -0
  187. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_sandbox_mixin.py +0 -0
  188. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_think_parser.py +0 -0
  189. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_xml_parser.py +0 -0
  190. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/AGENTS.md +0 -0
  191. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/__init__.py +0 -0
  192. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/__init__.py +0 -0
  193. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/build.py +0 -0
  194. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/eval.py +0 -0
  195. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/gepa.py +0 -0
  196. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/init.py +0 -0
  197. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/install.py +0 -0
  198. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/setup.py +0 -0
  199. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/plugins/__init__.py +0 -0
  200. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/plugins/prime.py +0 -0
  201. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/tui.py +0 -0
  202. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/anthropic_messages_client.py +0 -0
  203. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/client.py +0 -0
  204. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/openai_completions_client.py +0 -0
  205. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/__init__.py +0 -0
  206. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/README.md +0 -0
  207. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/__init__.py +0 -0
  208. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/__init__.py +0 -0
  209. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/_filter.py +0 -0
  210. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
  211. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
  212. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  213. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  214. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  215. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  216. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  217. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  218. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  219. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  220. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
  221. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  222. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  223. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
  224. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
  225. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/gym_env.py +0 -0
  226. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
  227. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
  228. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  229. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/utils/__init__.py +0 -0
  230. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/utils/file_locks.py +0 -0
  231. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/README.md +0 -0
  232. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/__init__.py +0 -0
  233. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/README.md +0 -0
  234. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  235. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  236. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/singleturn_env.py +0 -0
  237. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/stateful_tool_env.py +0 -0
  238. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/tool_env.py +0 -0
  239. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/errors.py +0 -0
  240. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/gepa/__init__.py +0 -0
  241. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/gepa/adapter.py +0 -0
  242. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/gepa/config.py +0 -0
  243. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/gepa/display.py +0 -0
  244. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/__init__.py +0 -0
  245. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/maybe_think_parser.py +0 -0
  246. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/parser.py +0 -0
  247. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/think_parser.py +0 -0
  248. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/xml_parser.py +0 -0
  249. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/README.md +0 -0
  250. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/__init__.py +0 -0
  251. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/inference/__init__.py +0 -0
  252. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/inference/client.py +0 -0
  253. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/inference/server.py +0 -0
  254. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/__init__.py +0 -0
  255. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/config.py +0 -0
  256. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/orchestrator.py +0 -0
  257. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/trainer.py +0 -0
  258. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/utils.py +0 -0
  259. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/__init__.py +0 -0
  260. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  261. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/judge_rubric.py +0 -0
  262. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/__init__.py +0 -0
  263. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/build.py +0 -0
  264. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/init.py +0 -0
  265. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/install.py +0 -0
  266. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/prime_rl.py +0 -0
  267. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/rl.py +0 -0
  268. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/train.py +0 -0
  269. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/vllm.py +0 -0
  270. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/__init__.py +0 -0
  271. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/client/env_client.py +0 -0
  272. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/client/zmq_env_client.py +0 -0
  273. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/__init__.py +0 -0
  274. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/env_router.py +0 -0
  275. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/env_server.py +0 -0
  276. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/env_worker.py +0 -0
  277. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/zmq_env_server.py +0 -0
  278. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/types.py +0 -0
  279. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/__init__.py +0 -0
  280. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/client_utils.py +0 -0
  281. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/config_utils.py +0 -0
  282. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/data_utils.py +0 -0
  283. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/heartbeat.py +0 -0
  284. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/import_utils.py +0 -0
  285. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/install_utils.py +0 -0
  286. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/message_utils.py +0 -0
  287. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/metric_utils.py +0 -0
  288. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/path_utils.py +0 -0
  289. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/process_utils.py +0 -0
  290. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/serve_utils.py +0 -0
  291. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/thread_utils.py +0 -0
  292. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/threaded_sandbox_client.py +0 -0
  293. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/tool_utils.py +0 -0
  294. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/tunnel_utils.py +0 -0
  295. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/usage_utils.py +0 -0
  296. {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/version_utils.py +0 -0
@@ -4,7 +4,6 @@ venv/
4
4
  env/
5
5
  .env
6
6
  .env.local
7
- uv.lock
8
7
  .claude/
9
8
  .cursorrules
10
9
  .ropeproject/
@@ -22,6 +21,7 @@ _build/
22
21
  docs/build/
23
22
  *.egg-info/
24
23
  __pycache__/
24
+ environments/**/uv.lock
25
25
 
26
26
  .pytest_cache/
27
27
  .ruff_cache/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.13.dev7
3
+ Version: 0.1.14
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -26,6 +26,7 @@ Requires-Dist: aiolimiter>=1.2.1
26
26
  Requires-Dist: anthropic>=0.78.0
27
27
  Requires-Dist: datasets<4.7.0,>=3.0.0
28
28
  Requires-Dist: gepa
29
+ Requires-Dist: httpx>=0.27.0
29
30
  Requires-Dist: jinja2>=3.1.6
30
31
  Requires-Dist: math-verify>=0.8.0
31
32
  Requires-Dist: mcp>=1.14.1
@@ -53,6 +54,8 @@ Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
53
54
  Requires-Dist: stagehand>=3.0.0; extra == 'browser'
54
55
  Provides-Extra: openenv
55
56
  Requires-Dist: openenv-core[core]==0.2.1; extra == 'openenv'
57
+ Provides-Extra: renderers
58
+ Requires-Dist: renderers>=0.1.6; extra == 'renderers'
56
59
  Provides-Extra: rg
57
60
  Requires-Dist: reasoning-gym; extra == 'rg'
58
61
  Provides-Extra: rl
@@ -197,11 +200,81 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
197
200
  async def correct_answer(completion, answer) -> float:
198
201
  completion_ans = completion[-1]['content']
199
202
  return 1.0 if completion_ans == answer else 0.0
200
- rubric = Rubric(funcs=[correct_answer])
203
+ rubric = vf.Rubric(funcs=[correct_answer])
201
204
  env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
202
205
  return env
203
206
  ```
204
207
 
208
+ For composable environments with reusable tasksets, toolsets, custom programs,
209
+ or custom harnesses, use the v1 BYO Harness path:
210
+ ```python
211
+ # my_env.py
212
+ import verifiers.v1 as vf
213
+
214
+ def source():
215
+ yield {
216
+ "prompt": [{"role": "user", "content": "Reverse abc."}],
217
+ "answer": "cba",
218
+ "max_turns": 1,
219
+ }
220
+
221
+ @vf.reward(weight=1.0)
222
+ async def contains_answer(task, state) -> float:
223
+ return float(task["answer"] in str(state.get("completion") or ""))
224
+
225
+ def load_taskset(config: vf.TasksetConfig | None = None):
226
+ return vf.Taskset(source=source, rewards=[contains_answer], config=config)
227
+
228
+ def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
229
+ config = config or vf.EnvConfig()
230
+ return vf.Env(taskset=load_taskset(config=config.taskset))
231
+ ```
232
+ If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
233
+ **[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
234
+ Reusable taskset and harness packages live under `verifiers.v1.packages` while
235
+ the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
236
+ For example, Harbor task directories can run through the bundled OpenCode CLI
237
+ harness with:
238
+
239
+ ```python
240
+ env = vf.Env(
241
+ taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
242
+ harness=vf.OpenCode(),
243
+ )
244
+ ```
245
+
246
+ The same environment package is the unit used by evals and `prime-rl`. The
247
+ trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
248
+ and harness options stay under `env.taskset` and `env.harness`:
249
+
250
+ ```toml
251
+ # configs/rl/my-v1-env.toml
252
+ model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
253
+ max_steps = 100
254
+ batch_size = 256
255
+ rollouts_per_example = 8
256
+
257
+ [sampling]
258
+ max_tokens = 4096
259
+
260
+ [[env]]
261
+ id = "my-env"
262
+
263
+ [env.args]
264
+ arg1 = "non-th-arg"
265
+
266
+ [env.harness]
267
+ max_turns = 1
268
+
269
+ [env.taskset.scoring.contains_answer]
270
+ weight = 1.0
271
+ ```
272
+
273
+ ```bash
274
+ prime env install my-env
275
+ uv run prime-rl configs/rl/my-v1-env.toml
276
+ ```
277
+
205
278
  To install the environment module into your project, do:
206
279
  ```bash
207
280
  prime env install my-env # installs from ./environments/my_env
@@ -237,6 +310,8 @@ prime eval run primeintellect/math-python
237
310
 
238
311
  **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
239
312
 
313
+ **[BYO Harness](docs/byo-harness.md)** — Build composable v1 taskset/harness environments with custom tools, sandboxes, users, and custom programs.
314
+
240
315
  **[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
241
316
 
242
317
  **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
@@ -124,11 +124,81 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
124
124
  async def correct_answer(completion, answer) -> float:
125
125
  completion_ans = completion[-1]['content']
126
126
  return 1.0 if completion_ans == answer else 0.0
127
- rubric = Rubric(funcs=[correct_answer])
127
+ rubric = vf.Rubric(funcs=[correct_answer])
128
128
  env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
129
129
  return env
130
130
  ```
131
131
 
132
+ For composable environments with reusable tasksets, toolsets, custom programs,
133
+ or custom harnesses, use the v1 BYO Harness path:
134
+ ```python
135
+ # my_env.py
136
+ import verifiers.v1 as vf
137
+
138
+ def source():
139
+ yield {
140
+ "prompt": [{"role": "user", "content": "Reverse abc."}],
141
+ "answer": "cba",
142
+ "max_turns": 1,
143
+ }
144
+
145
+ @vf.reward(weight=1.0)
146
+ async def contains_answer(task, state) -> float:
147
+ return float(task["answer"] in str(state.get("completion") or ""))
148
+
149
+ def load_taskset(config: vf.TasksetConfig | None = None):
150
+ return vf.Taskset(source=source, rewards=[contains_answer], config=config)
151
+
152
+ def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
153
+ config = config or vf.EnvConfig()
154
+ return vf.Env(taskset=load_taskset(config=config.taskset))
155
+ ```
156
+ If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
157
+ **[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
158
+ Reusable taskset and harness packages live under `verifiers.v1.packages` while
159
+ the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
160
+ For example, Harbor task directories can run through the bundled OpenCode CLI
161
+ harness with:
162
+
163
+ ```python
164
+ env = vf.Env(
165
+ taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
166
+ harness=vf.OpenCode(),
167
+ )
168
+ ```
169
+
170
+ The same environment package is the unit used by evals and `prime-rl`. The
171
+ trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
172
+ and harness options stay under `env.taskset` and `env.harness`:
173
+
174
+ ```toml
175
+ # configs/rl/my-v1-env.toml
176
+ model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
177
+ max_steps = 100
178
+ batch_size = 256
179
+ rollouts_per_example = 8
180
+
181
+ [sampling]
182
+ max_tokens = 4096
183
+
184
+ [[env]]
185
+ id = "my-env"
186
+
187
+ [env.args]
188
+ arg1 = "non-th-arg"
189
+
190
+ [env.harness]
191
+ max_turns = 1
192
+
193
+ [env.taskset.scoring.contains_answer]
194
+ weight = 1.0
195
+ ```
196
+
197
+ ```bash
198
+ prime env install my-env
199
+ uv run prime-rl configs/rl/my-v1-env.toml
200
+ ```
201
+
132
202
  To install the environment module into your project, do:
133
203
  ```bash
134
204
  prime env install my-env # installs from ./environments/my_env
@@ -164,6 +234,8 @@ prime eval run primeintellect/math-python
164
234
 
165
235
  **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
166
236
 
237
+ **[BYO Harness](docs/byo-harness.md)** — Build composable v1 taskset/harness environments with custom tools, sandboxes, users, and custom programs.
238
+
167
239
  **[Evaluation](docs/evaluation.md)** — Evaluate models using your environments.
168
240
 
169
241
  **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
@@ -52,7 +52,8 @@ dependencies = [
52
52
  "msgpack>=1.1.2",
53
53
  "aiolimiter>=1.2.1",
54
54
  "setproctitle>=1.3.0",
55
- "regex<2026.4.4", # 2026.4.4 missing cp312/cp313 wheels
55
+ "regex<2026.4.4",
56
+ "httpx>=0.27.0",
56
57
  ]
57
58
 
58
59
  [dependency-groups]
@@ -73,6 +74,7 @@ dev = [
73
74
  "aiohttp>=3.9.0",
74
75
  "python-dotenv>=1.0.0",
75
76
  "nltk",
77
+ "renderers>=0.1.6",
76
78
  ]
77
79
 
78
80
  [project.optional-dependencies]
@@ -91,6 +93,9 @@ browser = [
91
93
  "aiohttp>=3.9.0",
92
94
  "python-dotenv>=1.0.0",
93
95
  ]
96
+ renderers = [
97
+ "renderers>=0.1.6",
98
+ ]
94
99
  rl = [
95
100
  "torch>=2.8.0,<2.9.0",
96
101
  "transformers>=4.56.2",
@@ -108,6 +113,24 @@ rl = [
108
113
  preview = true
109
114
  required-version = ">=0.11.1"
110
115
 
116
+ [[tool.uv.index]]
117
+ name = "pypi"
118
+ url = "https://pypi.org/simple"
119
+ default = true
120
+ exclude-newer = "7 days"
121
+
122
+ [tool.uv.exclude-newer-package]
123
+ # PrimeIntellect-published on PyPI (trusted publisher)
124
+ prime-tunnel = false
125
+ prime-sandboxes = false
126
+ renderers = false
127
+
128
+ [tool.uv.sources]
129
+ # Pinned to renderers main until the next PyPI release lands; drop after.
130
+ # fe67f9f = renderers main: PR #4 squash-merge — construction-time
131
+ # preserve_*_thinking flags on create_renderer / create_renderer_pool.
132
+ renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "fe67f9f" }
133
+
111
134
  [tool.uv.extra-build-dependencies]
112
135
  flash-attn = [{ requirement = "torch", match-runtime = true }]
113
136
 
@@ -170,6 +193,7 @@ addopts = [
170
193
  markers = [
171
194
  "slow: marks tests as slow (deselect with '-m \"not slow\"')",
172
195
  "integration: marks tests as integration tests",
196
+ "prime_sandbox: marks tests that provision real Prime sandbox or tunnel resources",
173
197
  "unit: marks tests as unit tests",
174
198
  "asyncio: marks tests as async tests",
175
199
  "parsers: marks tests for parser components",
@@ -195,7 +219,7 @@ unknown-argument = "warn"
195
219
  redundant-cast = "ignore"
196
220
 
197
221
  [tool.ty.src]
198
- exclude = ["environments"]
222
+ exclude = ["environments", "verifiers/v1/sketch.py"]
199
223
 
200
224
  [[tool.ty.overrides]]
201
225
  include = ["verifiers/envs/experimental/composable/tasksets/**"]
@@ -425,10 +425,9 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
425
425
  super().__init__(tools=[offset_tool], **kwargs)
426
426
 
427
427
  async def setup_state(self, state, **kwargs):
428
- state = await super().setup_state(state, **kwargs)
428
+ await super().setup_state(state, **kwargs)
429
429
  state["offset"] = 3
430
430
  state["update_calls"] = 0
431
- return state
432
431
 
433
432
  def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
434
433
  state["update_calls"] += 1
@@ -458,13 +457,15 @@ def make_input() -> Callable[..., RolloutInput]:
458
457
 
459
458
  def _make_input(
460
459
  example_id: int = 0,
461
- task: str = "default",
462
460
  prompt: Messages = DEFAULT_PROMPT,
463
461
  info: Info = {},
464
462
  answer: str = "4",
465
463
  ) -> RolloutInput:
466
464
  return RolloutInput(
467
- example_id=example_id, task=task, prompt=prompt, answer=answer, info=info
465
+ example_id=example_id,
466
+ prompt=prompt,
467
+ answer=answer,
468
+ info=info,
468
469
  )
469
470
 
470
471
  return _make_input
@@ -476,7 +477,6 @@ def make_state() -> Callable[..., State]:
476
477
 
477
478
  def _make_state(
478
479
  example_id: int = 0,
479
- task: str = "default",
480
480
  prompt: Messages = DEFAULT_PROMPT,
481
481
  answer: str = "4",
482
482
  info: Info = {},
@@ -488,17 +488,12 @@ def make_state() -> Callable[..., State]:
488
488
  stop_condition: str | None = "max_turns_reached",
489
489
  tool_defs: list[Tool] | None = None,
490
490
  trajectory: list[TrajectoryStep] = [],
491
- timing=RolloutTiming(
492
- generation_ms=0.0,
493
- scoring_ms=0.0,
494
- total_ms=0.0,
495
- ),
491
+ timing=RolloutTiming(),
496
492
  foo: str = "bar", # custom field
497
493
  **kwargs,
498
494
  ) -> State:
499
495
  return State(
500
496
  example_id=example_id,
501
- task=task,
502
497
  prompt=prompt,
503
498
  answer=answer,
504
499
  info=info,
@@ -551,7 +546,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
551
546
  rollouts_per_example: int = 1,
552
547
  sampling_args: SamplingArgs = {},
553
548
  date: str = "1970-01-01",
554
- time_ms: float = 0.0,
549
+ time: float = 0.0,
555
550
  avg_reward: float = 0.0,
556
551
  avg_metrics: dict[str, float] = {},
557
552
  pass_at_k: dict[str, float] = {},
@@ -579,7 +574,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
579
574
  rollouts_per_example=rollouts_per_example,
580
575
  sampling_args=sampling_args,
581
576
  date=date,
582
- time_ms=time_ms,
577
+ time=time,
583
578
  avg_reward=avg_reward,
584
579
  avg_metrics=avg_metrics,
585
580
  pass_at_k=pass_at_k,
@@ -1,5 +1,6 @@
1
1
  """Tests for CliAgentEnv and HarborEnv."""
2
2
 
3
+ import asyncio
3
4
  import tempfile
4
5
  from pathlib import Path
5
6
  from unittest.mock import AsyncMock, MagicMock, patch
@@ -8,6 +9,7 @@ import pytest
8
9
  from datasets import Dataset
9
10
 
10
11
  import verifiers as vf
12
+ from verifiers.utils.interception_utils import serialize_intercept_response
11
13
 
12
14
 
13
15
  @pytest.fixture
@@ -62,7 +64,8 @@ class TestCliAgentEnv:
62
64
  assert env.run_command == "python agent.py"
63
65
  assert env.docker_image == "python:3.11-slim"
64
66
  assert env.interception_port == 8765
65
- assert env.timeout_seconds == 3600.0
67
+ assert env.timeout_seconds is None
68
+ assert env.sandbox_timeout_minutes is None
66
69
 
67
70
  def test_init_custom_config(self, sample_dataset):
68
71
  """Test initialization with custom configuration."""
@@ -130,22 +133,34 @@ class TestCliAgentEnv:
130
133
  state = {"agent_completed": True}
131
134
  assert await env.agent_completed(state) is True
132
135
 
133
- @pytest.mark.asyncio
134
- async def test_timeout_reached_stop_condition(self, sample_dataset):
135
- """Test the timeout_reached stop condition."""
136
+ @pytest.mark.parametrize(
137
+ "timeout_seconds,expected_minutes",
138
+ [
139
+ (None, 24 * 60), # no rollout cap → SDK ceiling
140
+ (600.0, 10 + 60), # finite → ceil + scoring buffer
141
+ (24 * 3600.0, 24 * 60), # buffer would overflow → clamped to ceiling
142
+ ],
143
+ )
144
+ def test_sandbox_timeout_auto_derived(
145
+ self, sample_dataset, timeout_seconds, expected_minutes
146
+ ):
136
147
  env = vf.CliAgentEnv(
137
148
  run_command="python agent.py",
138
149
  dataset=sample_dataset,
139
150
  rubric=vf.Rubric(),
140
- timeout_seconds=10.0,
151
+ timeout_seconds=timeout_seconds,
141
152
  )
142
- import time
143
-
144
- state = {"timing": {"start_time": time.time()}}
145
- assert await env.timeout_reached(state) is False
153
+ assert env.get_sandbox_resources({})["timeout_minutes"] == expected_minutes
146
154
 
147
- state = {"timing": {"start_time": time.time() - 20}}
148
- assert await env.timeout_reached(state) is True
155
+ def test_sandbox_timeout_explicit_override(self, sample_dataset):
156
+ env = vf.CliAgentEnv(
157
+ run_command="python agent.py",
158
+ dataset=sample_dataset,
159
+ rubric=vf.Rubric(),
160
+ timeout_seconds=600.0,
161
+ sandbox_timeout_minutes=30,
162
+ )
163
+ assert env.get_sandbox_resources({})["timeout_minutes"] == 30
149
164
 
150
165
  @pytest.mark.asyncio
151
166
  async def test_env_response_returns_empty(self, sample_dataset):
@@ -204,6 +219,152 @@ class TestCliAgentEnv:
204
219
  assert kwargs["tools"][0].name == "echo"
205
220
 
206
221
 
222
+ @pytest.mark.asyncio
223
+ async def test_cli_agent_env_delivers_intercepted_tool_call_response(
224
+ sample_dataset, mock_client
225
+ ):
226
+ env = vf.CliAgentEnv(
227
+ run_command="python agent.py",
228
+ dataset=sample_dataset,
229
+ rubric=vf.Rubric(),
230
+ )
231
+ prompt = sample_dataset[0]["prompt"]
232
+ tool_call = {
233
+ "id": "call_echo",
234
+ "type": "function",
235
+ "function": {"name": "echo", "arguments": '{"text": "hello"}'},
236
+ }
237
+ mock_client.add_response(
238
+ prompt,
239
+ "",
240
+ finish_reason="tool_calls",
241
+ tool_calls=[tool_call],
242
+ )
243
+
244
+ state = await env.init_state(
245
+ input=sample_dataset[0],
246
+ client=mock_client,
247
+ model="test-model",
248
+ )
249
+ response_future = asyncio.Future()
250
+ request_id = "req-tool-call"
251
+ state["current_request_id"] = request_id
252
+ env._interception_server.intercepts[request_id] = {
253
+ "stream": False,
254
+ "tools": [
255
+ {
256
+ "type": "function",
257
+ "function": {
258
+ "name": "echo",
259
+ "description": "Return the provided text.",
260
+ "parameters": {
261
+ "type": "object",
262
+ "properties": {"text": {"type": "string"}},
263
+ },
264
+ },
265
+ }
266
+ ],
267
+ "response_future": response_future,
268
+ }
269
+
270
+ response = await env.get_model_response(
271
+ state=state,
272
+ prompt=prompt,
273
+ client=mock_client,
274
+ model="test-model",
275
+ )
276
+
277
+ assert response_future.done()
278
+ assert response_future.result() is response
279
+ assert state["current_request_id"] is None
280
+
281
+ payload = serialize_intercept_response(response_future.result())
282
+ choice = payload["choices"][0]
283
+ assert choice["finish_reason"] == "tool_calls"
284
+ assert choice["message"]["tool_calls"] == [tool_call]
285
+ assert mock_client.last_call_kwargs["tools"][0].name == "echo"
286
+
287
+
288
+ @pytest.mark.asyncio
289
+ async def test_cli_agent_env_synthesizes_stream_for_intercepted_tool_call_response(
290
+ sample_dataset, mock_client
291
+ ):
292
+ env = vf.CliAgentEnv(
293
+ run_command="python agent.py",
294
+ dataset=sample_dataset,
295
+ rubric=vf.Rubric(),
296
+ )
297
+ prompt = sample_dataset[0]["prompt"]
298
+ tool_call = {
299
+ "id": "call_echo",
300
+ "type": "function",
301
+ "function": {"name": "echo", "arguments": '{"text": "hello"}'},
302
+ }
303
+ mock_client.add_response(
304
+ prompt,
305
+ "",
306
+ finish_reason="tool_calls",
307
+ tool_calls=[tool_call],
308
+ )
309
+
310
+ state = await env.init_state(
311
+ input=sample_dataset[0],
312
+ client=mock_client,
313
+ model="test-model",
314
+ )
315
+ chunk_queue = asyncio.Queue()
316
+ response_future = asyncio.Future()
317
+ request_id = "req-stream-tool-call"
318
+ state["current_request_id"] = request_id
319
+ env._interception_server.intercepts[request_id] = {
320
+ "stream": True,
321
+ "tools": [
322
+ {
323
+ "type": "function",
324
+ "function": {
325
+ "name": "echo",
326
+ "description": "Return the provided text.",
327
+ "parameters": {
328
+ "type": "object",
329
+ "properties": {"text": {"type": "string"}},
330
+ },
331
+ },
332
+ }
333
+ ],
334
+ "chunk_queue": chunk_queue,
335
+ "response_future": response_future,
336
+ }
337
+
338
+ response = await env.get_model_response(
339
+ state=state,
340
+ prompt=prompt,
341
+ client=mock_client,
342
+ model="test-model",
343
+ )
344
+
345
+ chunks = []
346
+ while True:
347
+ chunk = await asyncio.wait_for(chunk_queue.get(), timeout=1.0)
348
+ if chunk is None:
349
+ break
350
+ chunks.append(chunk)
351
+
352
+ assert response_future.done()
353
+ assert response_future.result() is response
354
+ assert state["current_request_id"] is None
355
+
356
+ assert chunks[0]["object"] == "chat.completion.chunk"
357
+ assert chunks[0]["choices"][0]["delta"]["tool_calls"][0]["id"] == "call_echo"
358
+ assert (
359
+ chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["name"] == "echo"
360
+ )
361
+ assert (
362
+ chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["arguments"]
363
+ == '{"text": "hello"}'
364
+ )
365
+ assert chunks[-1]["choices"][0]["finish_reason"] == "tool_calls"
366
+
367
+
207
368
  class TestHarborEnv:
208
369
  """Tests for HarborEnv."""
209
370
 
@@ -231,7 +392,7 @@ class TestHarborEnv:
231
392
  dataset_path=harbor_task_dir,
232
393
  )
233
394
  assert len(env.dataset) == 1
234
- assert env.dataset[0]["task"] == "test_task"
395
+ assert env.dataset[0]["info"]["task_name"] == "test_task"
235
396
 
236
397
  def test_init_filters_tasks(self, harbor_task_dir):
237
398
  """Test that HarborEnv can filter tasks by name."""
@@ -247,7 +408,7 @@ class TestHarborEnv:
247
408
  tasks=["test_task"],
248
409
  )
249
410
  assert len(env.dataset) == 1
250
- assert env.dataset[0]["task"] == "test_task"
411
+ assert env.dataset[0]["info"]["task_name"] == "test_task"
251
412
 
252
413
  def test_init_raises_on_empty_dataset(self):
253
414
  """Test that HarborEnv raises when no valid tasks found."""
@@ -301,7 +462,7 @@ class TestHarborEnv:
301
462
  )
302
463
  state = {
303
464
  "interception_base_url": "https://test.trycloudflare.com/v1",
304
- "task": "my_task",
465
+ "info": {"task_name": "my_task"},
305
466
  }
306
467
  env_vars = await env.build_env_vars(state)
307
468
 
@@ -251,7 +251,7 @@ async def test_composable_env_quotes_log_path_when_collecting_logs():
251
251
  teardown=lambda: None,
252
252
  )
253
253
 
254
- state = {"sandbox_id": "sbx", "timing": {"total_ms": 0}}
254
+ state = {"sandbox_id": "sbx", "timing": {"total": 0}}
255
255
 
256
256
  await env.post_rollout(state)
257
257
 
@@ -594,7 +594,7 @@ async def test_composable_env_collects_harness_metrics():
594
594
  state = {
595
595
  "sandbox_id": "sbx",
596
596
  "info": {"id": 0},
597
- "timing": {"total_ms": 0},
597
+ "timing": {"total": 0},
598
598
  "trajectory": [],
599
599
  }
600
600
 
@@ -633,7 +633,7 @@ async def test_composable_env_metrics_with_key_whitelist():
633
633
  state = {
634
634
  "sandbox_id": "sbx",
635
635
  "info": {"id": 0},
636
- "timing": {"total_ms": 0},
636
+ "timing": {"total": 0},
637
637
  "trajectory": [],
638
638
  }
639
639
 
@@ -659,7 +659,7 @@ async def test_composable_env_no_metrics_when_path_not_set():
659
659
  state = {
660
660
  "sandbox_id": "sbx",
661
661
  "info": {"id": 0},
662
- "timing": {"total_ms": 0},
662
+ "timing": {"total": 0},
663
663
  "trajectory": [],
664
664
  }
665
665