verifiers 0.1.10.dev5__tar.gz → 0.1.11.dev1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (191)
  1. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/PKG-INFO +9 -8
  2. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/README.md +7 -7
  3. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/pyproject.toml +1 -0
  4. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/README.md +16 -20
  5. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/conftest.py +149 -118
  6. verifiers-0.1.11.dev1/tests/test_build_script.py +29 -0
  7. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_cli_agent_env.py +43 -0
  8. verifiers-0.1.11.dev1/tests/test_client_auth_errors.py +189 -0
  9. verifiers-0.1.11.dev1/tests/test_client_multimodal_types.py +239 -0
  10. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_decorator_ranks.py +29 -29
  11. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_endpoint_registry.py +45 -0
  12. verifiers-0.1.11.dev1/tests/test_env_crash_recovery.py +237 -0
  13. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_env_group.py +47 -47
  14. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_environment.py +210 -161
  15. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_environment_extra.py +145 -88
  16. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_eval_cli.py +28 -0
  17. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_gym_env.py +68 -95
  18. verifiers-0.1.11.dev1/tests/test_interception_utils.py +63 -0
  19. verifiers-0.1.11.dev1/tests/test_message_utils.py +57 -0
  20. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_multiturn_env.py +53 -54
  21. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_rlm_env.py +880 -133
  22. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_rlm_env_sandbox.py +14 -50
  23. verifiers-0.1.11.dev1/tests/test_rollout_gateway_env.py +350 -0
  24. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_save_utils.py +233 -0
  25. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_singleturn_env.py +21 -24
  26. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_stateful_tool_env.py +23 -29
  27. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_tool_env.py +34 -38
  28. verifiers-0.1.11.dev1/tests/test_tool_utils.py +160 -0
  29. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_trajectory_processing.py +104 -48
  30. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/AGENTS.md +1 -1
  31. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/__init__.py +19 -1
  32. verifiers-0.1.11.dev1/verifiers/clients/__init__.py +39 -0
  33. verifiers-0.1.11.dev1/verifiers/clients/anthropic_messages_client.py +470 -0
  34. verifiers-0.1.11.dev1/verifiers/clients/client.py +128 -0
  35. verifiers-0.1.11.dev1/verifiers/clients/openai_chat_completions_client.py +510 -0
  36. verifiers-0.1.11.dev1/verifiers/clients/openai_chat_completions_token_client.py +236 -0
  37. verifiers-0.1.11.dev1/verifiers/clients/openai_completions_client.py +188 -0
  38. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/env_group.py +13 -15
  39. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/environment.py +233 -358
  40. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/README.md +4 -2
  41. verifiers-0.1.11.dev1/verifiers/envs/experimental/__init__.py +4 -0
  42. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/cli_agent_env.py +104 -26
  43. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/gym_env.py +4 -9
  44. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/mcp_env.py +18 -47
  45. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/rlm_env.py +658 -921
  46. verifiers-0.1.11.dev1/verifiers/envs/experimental/rollout_gateway_mixin.py +397 -0
  47. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/browser_env.py +7 -1
  48. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +60 -44
  49. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +1 -1
  50. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/openenv_env.py +71 -46
  51. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/textarena_env.py +33 -11
  52. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/multiturn_env.py +37 -25
  53. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/singleturn_env.py +3 -0
  54. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/stateful_tool_env.py +26 -44
  55. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/tool_env.py +45 -56
  56. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/errors.py +8 -2
  57. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/adapter.py +3 -2
  58. verifiers-0.1.11.dev1/verifiers/parsers/parser.py +85 -0
  59. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/parsers/think_parser.py +14 -3
  60. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/parsers/xml_parser.py +12 -6
  61. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/build.py +32 -7
  62. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/eval.py +131 -29
  63. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/gepa.py +6 -11
  64. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/types.py +179 -34
  65. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/async_utils.py +2 -6
  66. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/client_utils.py +62 -31
  67. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/data_utils.py +3 -3
  68. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/eval_display.py +7 -1
  69. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/eval_utils.py +95 -11
  70. verifiers-0.1.11.dev1/verifiers/utils/heartbeat.py +31 -0
  71. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/interception_utils.py +103 -26
  72. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/logging_utils.py +44 -6
  73. verifiers-0.1.11.dev1/verifiers/utils/message_utils.py +318 -0
  74. verifiers-0.1.11.dev1/verifiers/utils/metric_utils.py +69 -0
  75. verifiers-0.1.11.dev1/verifiers/utils/response_utils.py +73 -0
  76. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/save_utils.py +43 -14
  77. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/tool_utils.py +9 -14
  78. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/worker_utils.py +15 -32
  79. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/workers/client/env_client.py +22 -2
  80. verifiers-0.1.11.dev1/verifiers/workers/client/zmq_env_client.py +408 -0
  81. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/workers/server/env_server.py +68 -34
  82. verifiers-0.1.11.dev1/verifiers/workers/server/zmq_env_server.py +246 -0
  83. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/workers/types.py +21 -0
  84. verifiers-0.1.10.dev5/tests/mock_client_guide.md +0 -207
  85. verifiers-0.1.10.dev5/tests/mock_openai_client.py +0 -155
  86. verifiers-0.1.10.dev5/tests/test_environment_audio_modality.py +0 -112
  87. verifiers-0.1.10.dev5/tests/test_tool_utils.py +0 -175
  88. verifiers-0.1.10.dev5/verifiers/envs/experimental/__init__.py +0 -3
  89. verifiers-0.1.10.dev5/verifiers/parsers/parser.py +0 -59
  90. verifiers-0.1.10.dev5/verifiers/utils/message_utils.py +0 -165
  91. verifiers-0.1.10.dev5/verifiers/utils/response_utils.py +0 -142
  92. verifiers-0.1.10.dev5/verifiers/utils/token_utils.py +0 -187
  93. verifiers-0.1.10.dev5/verifiers/workers/client/zmq_env_client.py +0 -198
  94. verifiers-0.1.10.dev5/verifiers/workers/server/zmq_env_server.py +0 -148
  95. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/.gitignore +0 -0
  96. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/LICENSE +0 -0
  97. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/AGENTS.md +0 -0
  98. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/__init__.py +0 -0
  99. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_browser_env.py +0 -0
  100. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_client_config.py +0 -0
  101. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_envs.py +0 -0
  102. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_error_chain.py +0 -0
  103. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_eval_display.py +0 -0
  104. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_eval_utils.py +0 -0
  105. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_gepa_cli.py +0 -0
  106. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_imports.py +0 -0
  107. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_install_utils.py +0 -0
  108. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_logging.py +0 -0
  109. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_math_rubric.py +0 -0
  110. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_maybe_think_parser.py +0 -0
  111. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_message_utils_audio.py +0 -0
  112. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_opencode_harbor.py +0 -0
  113. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_parser.py +0 -0
  114. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_path_utils.py +0 -0
  115. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_prime_plugin.py +0 -0
  116. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_rubric.py +0 -0
  117. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_rubric_group.py +0 -0
  118. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_sandbox_env.py +0 -0
  119. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_sandbox_mixin.py +0 -0
  120. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_setup_script.py +0 -0
  121. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_think_parser.py +0 -0
  122. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_tui_info_formatting.py +0 -0
  123. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_xml_parser.py +0 -0
  124. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/__init__.py +0 -0
  125. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/__init__.py +0 -0
  126. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/build.py +0 -0
  127. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/eval.py +0 -0
  128. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/gepa.py +0 -0
  129. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/init.py +0 -0
  130. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/install.py +0 -0
  131. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/setup.py +0 -0
  132. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/plugins/__init__.py +0 -0
  133. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/plugins/prime.py +0 -0
  134. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/tui.py +0 -0
  135. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/decorators.py +0 -0
  136. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/AGENTS.md +0 -0
  137. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/__init__.py +0 -0
  138. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/harbor_env.py +0 -0
  139. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  140. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/README.md +0 -0
  141. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/__init__.py +0 -0
  142. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  143. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  144. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  145. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  146. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/python_env.py +0 -0
  147. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/sandbox_env.py +0 -0
  148. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/__init__.py +0 -0
  149. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/config.py +0 -0
  150. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/display.py +0 -0
  151. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/gepa_utils.py +0 -0
  152. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/parsers/__init__.py +0 -0
  153. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/parsers/maybe_think_parser.py +0 -0
  154. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/README.md +0 -0
  155. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/__init__.py +0 -0
  156. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/inference/__init__.py +0 -0
  157. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/inference/client.py +0 -0
  158. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/inference/server.py +0 -0
  159. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/__init__.py +0 -0
  160. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/config.py +0 -0
  161. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/orchestrator.py +0 -0
  162. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/trainer.py +0 -0
  163. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/utils.py +0 -0
  164. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/__init__.py +0 -0
  165. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/judge_rubric.py +0 -0
  166. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/math_rubric.py +0 -0
  167. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/rubric.py +0 -0
  168. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/rubric_group.py +0 -0
  169. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/__init__.py +0 -0
  170. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/init.py +0 -0
  171. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/install.py +0 -0
  172. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/prime_rl.py +0 -0
  173. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/rl.py +0 -0
  174. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/setup.py +0 -0
  175. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/train.py +0 -0
  176. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/tui.py +0 -0
  177. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/vllm.py +0 -0
  178. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/__init__.py +0 -0
  179. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/config_utils.py +0 -0
  180. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/display_utils.py +0 -0
  181. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/env_utils.py +0 -0
  182. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/error_utils.py +0 -0
  183. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/import_utils.py +0 -0
  184. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/install_utils.py +0 -0
  185. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/path_utils.py +0 -0
  186. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/thread_utils.py +0 -0
  187. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/threaded_sandbox_client.py +0 -0
  188. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/tunnel_utils.py +0 -0
  189. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/usage_utils.py +0 -0
  190. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/version_utils.py +0 -0
  191. {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/workers/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.10.dev5
3
+ Version: 0.1.11.dev1
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -22,6 +22,7 @@ Classifier: Programming Language :: Python :: 3.13
22
22
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
24
  Requires-Python: <3.14,>=3.10
25
+ Requires-Dist: anthropic>=0.78.0
25
26
  Requires-Dist: datasets>=3.0.0
26
27
  Requires-Dist: gepa
27
28
  Requires-Dist: jinja2>=3.1.6
@@ -105,7 +106,7 @@ Verifiers: Environments for LLM Reinforcement Learning
105
106
 
106
107
  - [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
107
108
  - [11/19/25] v0.1.8 is released, featuring a major refactor of the rollout system to use trajectory-based tracking for token-in token-out training across turns, as well as support for truncated or branching rollouts.
108
- - [11/07/25] Verifiers v0.1.7 is released! This includes an improved quickstart configuration for training with [prime-rl], a new included "nano" trainer (`vf.RLTrainer`, replacing `vf.GRPOTrainer`), and a number of bug fixes and improvements to the documentation.
109
+ - [11/07/25] Verifiers v0.1.7 is released! This includes an improved quickstart configuration for training with [prime-rl](https://github.com/PrimeIntellect-ai/prime-rl), a new included "nano" trainer (`vf.RLTrainer`, replacing `vf.GRPOTrainer`), and a number of bug fixes and improvements to the documentation.
109
110
  - [10/27/25] A new iteration of the Prime Intellect [Environments Program](https://docs.google.com/spreadsheets/d/13UDfRDjgIZXsMI2s9-Lmn8KSMMsgk2_zsfju6cx_pNU/edit?gid=0#gid=0) is live!
110
111
 
111
112
 
@@ -228,17 +229,17 @@ prime eval run primeintellect/math-python
228
229
 
229
230
  ## Documentation
230
231
 
231
- **[Environments](environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
232
+ **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
232
233
 
233
- **[Evaluation](evaluation.md)** - Evaluate models using your environments.
234
+ **[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
234
235
 
235
- **[Training](training.md)** — Train models in your environments with reinforcement learning.
236
+ **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
236
237
 
237
- **[Development](development.md)** — Contributing to verifiers
238
+ **[Development](docs/development.md)** — Contributing to verifiers
238
239
 
239
- **[API Reference](reference.md)** — Understanding the API and data structures
240
+ **[API Reference](docs/reference.md)** — Understanding the API and data structures
240
241
 
241
- **[FAQs](faqs.md)** - Other frequently asked questions.
242
+ **[FAQs](docs/faqs.md)** - Other frequently asked questions.
242
243
 
243
244
 
244
245
  ## Citation
@@ -36,7 +36,7 @@ Verifiers: Environments for LLM Reinforcement Learning
36
36
 
37
37
  - [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
38
38
  - [11/19/25] v0.1.8 is released, featuring a major refactor of the rollout system to use trajectory-based tracking for token-in token-out training across turns, as well as support for truncated or branching rollouts.
39
- - [11/07/25] Verifiers v0.1.7 is released! This includes an improved quickstart configuration for training with [prime-rl], a new included "nano" trainer (`vf.RLTrainer`, replacing `vf.GRPOTrainer`), and a number of bug fixes and improvements to the documentation.
39
+ - [11/07/25] Verifiers v0.1.7 is released! This includes an improved quickstart configuration for training with [prime-rl](https://github.com/PrimeIntellect-ai/prime-rl), a new included "nano" trainer (`vf.RLTrainer`, replacing `vf.GRPOTrainer`), and a number of bug fixes and improvements to the documentation.
40
40
  - [10/27/25] A new iteration of the Prime Intellect [Environments Program](https://docs.google.com/spreadsheets/d/13UDfRDjgIZXsMI2s9-Lmn8KSMMsgk2_zsfju6cx_pNU/edit?gid=0#gid=0) is live!
41
41
 
42
42
 
@@ -159,17 +159,17 @@ prime eval run primeintellect/math-python
159
159
 
160
160
  ## Documentation
161
161
 
162
- **[Environments](environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
162
+ **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
163
163
 
164
- **[Evaluation](evaluation.md)** - Evaluate models using your environments.
164
+ **[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
165
165
 
166
- **[Training](training.md)** — Train models in your environments with reinforcement learning.
166
+ **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
167
167
 
168
- **[Development](development.md)** — Contributing to verifiers
168
+ **[Development](docs/development.md)** — Contributing to verifiers
169
169
 
170
- **[API Reference](reference.md)** — Understanding the API and data structures
170
+ **[API Reference](docs/reference.md)** — Understanding the API and data structures
171
171
 
172
- **[FAQs](faqs.md)** - Other frequently asked questions.
172
+ **[FAQs](docs/faqs.md)** - Other frequently asked questions.
173
173
 
174
174
 
175
175
  ## Citation
@@ -28,6 +28,7 @@ classifiers = [
28
28
  ]
29
29
 
30
30
  dependencies = [
31
+ "anthropic>=0.78.0",
31
32
  "datasets>=3.0.0",
32
33
  "jinja2>=3.1.6",
33
34
  "numpy",
@@ -63,9 +63,11 @@ uv run pytest -m unit
63
63
 
64
64
  The test suite includes comprehensive support for testing async Environment classes:
65
65
 
66
- ### AsyncOpenAI Client Mocking
67
- - `mock_openai_client` fixture provides a fully mocked AsyncOpenAI client
68
- - Supports both chat completions and regular completions
66
+ ### MockClient (conftest.py)
67
+ - `MockClient(Client)` implements the `get_response()` interface returning `vf.Response` objects
68
+ - `mock_client` fixture provides an instance for tests
69
+ - Supports prompt-to-response mappings via `add_response()`
70
+ - Tracks calls via `call_count` and `last_call_kwargs`
69
71
  - No actual API calls are made during testing
70
72
 
71
73
  ### Test Datasets
@@ -76,22 +78,17 @@ The test suite includes comprehensive support for testing async Environment clas
76
78
  ### Async Test Examples
77
79
  ```python
78
80
  @pytest.mark.asyncio
79
- async def test_my_async_function(mock_openai_client):
80
- env = SingleTurnEnv(client=mock_openai_client, model="test", ...)
81
+ async def test_my_async_function(mock_client):
82
+ env = SingleTurnEnv(client=mock_client, model="test", ...)
81
83
  result = await env.rollout(...)
82
- assert result[0] == expected_completion
83
-
84
- # MultiTurnEnv testing
85
- @pytest.mark.asyncio
86
- async def test_multiturn_conversation(mock_multiturn_env):
87
- # Configure sequential responses
88
- responses = ["response1", "response2", "final DONE"]
89
- mock_multiturn_env.client.chat.completions.create.side_effect = [
90
- create_mock_response(resp) for resp in responses
91
- ]
92
-
93
- completion, state = await mock_multiturn_env.rollout(...)
94
- assert len(completion) > 1 # Multiple turns
84
+ assert mock_client.call_count == 1
85
+
86
+ # Custom response mapping
87
+ @pytest.mark.asyncio
88
+ async def test_with_custom_response(mock_client):
89
+ mock_client.set_default_response("DONE")
90
+ env = SimpleMultiTurnEnv(client=mock_client, model="test", ...)
91
+ completion, state = await env.rollout(...)
95
92
  ```
96
93
 
97
94
  ### Environment Testing
@@ -103,7 +100,6 @@ async def test_multiturn_conversation(mock_multiturn_env):
103
100
  - Completion detection logic
104
101
  - State management across turns
105
102
  - Tests cover both chat and completion message formats
106
- - Mocked responses simulate real OpenAI API behavior
107
103
  - Error handling and edge cases are tested
108
104
  - No real LLM requests are made
109
105
 
@@ -112,5 +108,5 @@ async def test_multiturn_conversation(mock_multiturn_env):
112
108
  1. Create test files following the `test_*.py` naming convention
113
109
  2. Use the fixtures from `conftest.py` for common instances
114
110
  3. Add appropriate test markers (`@pytest.mark.asyncio` for async tests)
115
- 4. Use `mock_openai_client` for Environment testing
111
+ 4. Use `mock_client` for Environment testing
116
112
  5. Follow the existing test structure and naming conventions
@@ -1,12 +1,11 @@
1
1
  """Pytest configuration and fixtures for verifiers tests."""
2
2
 
3
+ import logging
3
4
  from pathlib import Path
4
- from typing import Callable
5
- from unittest.mock import AsyncMock, MagicMock
5
+ from typing import Any, Callable
6
6
 
7
7
  import pytest
8
8
  from datasets import Dataset
9
- from openai.types.chat import ChatCompletionToolParam
10
9
 
11
10
  from verifiers import (
12
11
  MaybeThinkParser,
@@ -23,13 +22,18 @@ from verifiers import (
23
22
  XMLParser,
24
23
  stop,
25
24
  )
25
+ from verifiers.clients.client import Client
26
26
  from verifiers.types import (
27
27
  GenerateMetadata,
28
28
  Info,
29
+ Response,
30
+ ResponseMessage,
29
31
  RolloutInput,
30
32
  RolloutOutput,
31
33
  RolloutTiming,
32
34
  SamplingArgs,
35
+ Tool,
36
+ ToolCall,
33
37
  TrajectoryStep,
34
38
  )
35
39
  from verifiers.utils.save_utils import state_to_output
@@ -82,135 +86,156 @@ def think_parser_with_extractor():
82
86
  # Async test fixtures for Environment testing
83
87
 
84
88
 
85
- class MockAsyncOpenAI:
86
- """Mock AsyncOpenAI client that maps conversation inputs to outputs."""
89
+ class MockClient(Client):
90
+ """Mocked vf.Client with get_response() to return provider-agnostic vf.Response objects"""
87
91
 
88
92
  def __init__(self):
89
- self.chat_completions = {} # Maps conversation history to responses
90
- self.text_completions = {} # Maps prompts to responses
91
- self.default_chat_response = "This is a test response"
92
- self.default_text_response = "This is a test completion"
93
- self.base_url = "http://localhost/v1/" # For testing URL parsing
94
-
95
- # Create mock structure
96
- self.chat = MagicMock()
97
- self.completions = MagicMock()
98
- self.chat.completions = MagicMock()
99
-
100
- # Set up async methods
101
- self.chat.completions.create = AsyncMock(
102
- side_effect=self._handle_chat_completion
103
- )
104
- self.completions.create = AsyncMock(side_effect=self._handle_text_completion)
93
+ self.logger = logging.getLogger(f"{__name__}.MockClient")
94
+ self._client = None
105
95
 
106
- def add_chat_response(
107
- self, messages, response, finish_reason="stop", tool_calls=None
108
- ):
96
+ self._responses: dict[tuple, dict] = {}
97
+ self.default_response = "This is a test response"
98
+
99
+ # Call tracking
100
+ self.call_count = 0
101
+ self.last_call_kwargs: dict[str, Any] = {}
102
+
103
+ def add_response(self, messages, response, finish_reason="stop", tool_calls=None):
109
104
  """Add a mapped response for specific messages."""
110
- # Convert messages to a hashable key
111
- key = self._messages_to_key(messages)
112
- self.chat_completions[key] = {
105
+ key = self._messages_to_key(self._normalize_input(messages))
106
+ self._responses[key] = {
113
107
  "content": response,
114
108
  "finish_reason": finish_reason,
115
109
  "tool_calls": tool_calls,
116
110
  }
117
111
 
118
- def add_text_response(self, prompt, response, finish_reason="stop"):
119
- """Add a mapped response for specific prompt."""
120
- self.text_completions[prompt] = {
121
- "text": response,
122
- "finish_reason": finish_reason,
112
+ def set_default_response(self, response):
113
+ """Set default response when no mapping found."""
114
+ self.default_response = response
115
+
116
+ async def get_response(
117
+ self,
118
+ prompt,
119
+ model,
120
+ sampling_args,
121
+ tools=None,
122
+ **kwargs,
123
+ ) -> Response:
124
+ """Return a Response based on the prompt-to-response mapping."""
125
+ self.call_count += 1
126
+ self.last_call_kwargs = {
127
+ "prompt": prompt,
128
+ "model": model,
129
+ "sampling_args": sampling_args,
130
+ "tools": tools,
131
+ **kwargs,
123
132
  }
124
133
 
125
- def set_default_responses(self, chat_response=None, text_response=None):
126
- """Set default responses when no mapping found."""
127
- if chat_response:
128
- self.default_chat_response = chat_response
129
- if text_response:
130
- self.default_text_response = text_response
134
+ return self._make_response(prompt)
131
135
 
132
- async def _handle_chat_completion(self, messages, **kwargs):
133
- """Handle chat completion requests."""
134
- key = self._messages_to_key(messages)
136
+ def setup_client(self, config):
137
+ return None
135
138
 
136
- if key in self.chat_completions:
137
- response_data = self.chat_completions[key]
138
- else:
139
- response_data = {
140
- "content": self.default_chat_response,
141
- "finish_reason": "stop",
142
- "tool_calls": None,
143
- }
139
+ async def to_native_tool(self, tool):
140
+ pass
144
141
 
145
- # Create mock response that mimics ChatCompletion
146
- from openai.types.chat.chat_completion import ChatCompletion, Choice
147
- from openai.types.chat.chat_completion_message import ChatCompletionMessage
148
-
149
- # Create a proper mock that will pass isinstance checks
150
- mock_response = MagicMock(spec=ChatCompletion)
151
- mock_choice = MagicMock(spec=Choice)
152
- mock_message = MagicMock(spec=ChatCompletionMessage)
153
-
154
- # Set the attributes
155
- mock_message.content = response_data["content"]
156
- mock_message.role = "assistant"
157
- mock_message.tool_calls = response_data.get("tool_calls", None)
158
- mock_choice.message = mock_message
159
- mock_choice.finish_reason = response_data["finish_reason"]
160
- mock_choice.index = 0
161
-
162
- mock_response.choices = [mock_choice]
163
- mock_response.id = "test-id"
164
- mock_response.model = "test-model"
165
- mock_response.object = "chat.completion"
166
-
167
- return mock_response
168
-
169
- async def _handle_text_completion(self, prompt, **kwargs):
170
- """Handle text completion requests."""
171
- if prompt in self.text_completions:
172
- response_data = self.text_completions[prompt]
173
- else:
174
- response_data = {
175
- "text": self.default_text_response,
176
- "finish_reason": "stop",
177
- }
142
+ async def to_native_prompt(self, messages):
143
+ return [], {}
144
+
145
+ async def get_native_response(
146
+ self, prompt, model, sampling_args, tools=None, **kwargs
147
+ ):
148
+ pass
178
149
 
179
- # Create mock response that mimics Completion
180
- from openai.types.completion import Completion
181
- from openai.types.completion_choice import CompletionChoice
150
+ async def raise_from_native_response(self, response):
151
+ pass
182
152
 
183
- # Create a proper mock that will pass isinstance checks
184
- mock_response = MagicMock(spec=Completion)
185
- mock_choice = MagicMock(spec=CompletionChoice)
153
+ async def from_native_response(self, response):
154
+ pass
186
155
 
187
- # Set the attributes
188
- mock_choice.text = response_data["text"]
189
- mock_choice.finish_reason = response_data["finish_reason"]
190
- mock_choice.index = 0
156
+ async def close(self) -> None:
157
+ pass
191
158
 
192
- mock_response.choices = [mock_choice]
193
- mock_response.id = "test-id"
194
- mock_response.model = "test-model"
195
- mock_response.object = "text_completion"
159
+ # -- Internal helpers --
196
160
 
197
- return mock_response
161
+ @staticmethod
162
+ def _normalize_input(messages):
163
+ """Normalize prompt to list-of-dicts form for keying."""
164
+ if isinstance(messages, str):
165
+ return [{"role": "text", "content": messages}]
166
+ return messages
198
167
 
199
168
  def _messages_to_key(self, messages):
200
169
  """Convert messages list to a hashable key."""
201
- # Create a simplified representation for hashing
202
170
  key_parts = []
203
171
  for msg in messages:
204
- role = msg["role"]
205
- content = msg["content"]
172
+ if isinstance(msg, dict):
173
+ role = msg.get("role", "")
174
+ content = msg.get("content", "")
175
+ else:
176
+ role = getattr(msg, "role", "")
177
+ content = getattr(msg, "content", "")
206
178
  key_parts.append(f"{role}:{content}")
207
179
  return tuple(key_parts)
208
180
 
181
+ def _convert_tool_calls(self, raw_tool_calls) -> list[ToolCall] | None:
182
+ """Convert OAI-style tool call objects to vf.ToolCall."""
183
+ if not raw_tool_calls:
184
+ return None
185
+ result: list[ToolCall] = []
186
+ for tc in raw_tool_calls:
187
+ if hasattr(tc, "function"):
188
+ result.append(
189
+ ToolCall(
190
+ id=tc.id,
191
+ name=tc.function.name,
192
+ arguments=tc.function.arguments,
193
+ )
194
+ )
195
+ elif isinstance(tc, dict):
196
+ func = tc.get("function", {})
197
+ result.append(
198
+ ToolCall(
199
+ id=tc.get("id", ""),
200
+ name=func.get("name", ""),
201
+ arguments=func.get("arguments", ""),
202
+ )
203
+ )
204
+ return result or None
205
+
206
+ def _make_response(self, prompt) -> Response:
207
+ key = self._messages_to_key(self._normalize_input(prompt))
208
+ if key in self._responses:
209
+ data = self._responses[key]
210
+ else:
211
+ data = {
212
+ "content": self.default_response,
213
+ "finish_reason": "stop",
214
+ "tool_calls": None,
215
+ }
216
+
217
+ tool_calls = self._convert_tool_calls(data.get("tool_calls"))
218
+
219
+ return Response(
220
+ id="test-id",
221
+ created=0,
222
+ model="test-model",
223
+ usage=None,
224
+ message=ResponseMessage(
225
+ content=data["content"],
226
+ reasoning_content=None,
227
+ finish_reason=data["finish_reason"],
228
+ is_truncated=data["finish_reason"] == "length",
229
+ tokens=None,
230
+ tool_calls=tool_calls,
231
+ ),
232
+ )
233
+
209
234
 
210
235
  @pytest.fixture
211
- def mock_openai_client():
212
- """Return a mocked AsyncOpenAI client with input-output mapping."""
213
- return MockAsyncOpenAI()
236
+ def mock_client():
237
+ """Return a MockClient with input-output mapping."""
238
+ return MockClient()
214
239
 
215
240
 
216
241
  @pytest.fixture
@@ -240,10 +265,10 @@ def sample_chat_dataset():
240
265
 
241
266
 
242
267
  @pytest.fixture
243
- def mock_singleturn_env(mock_openai_client, sample_dataset):
268
+ def mock_singleturn_env(mock_client, sample_dataset):
244
269
  """Return a SingleTurnEnv with mocked client and dataset."""
245
270
  return SingleTurnEnv(
246
- client=mock_openai_client,
271
+ client=mock_client,
247
272
  model="test-model",
248
273
  dataset=sample_dataset,
249
274
  system_prompt="You are a helpful assistant.",
@@ -253,7 +278,7 @@ def mock_singleturn_env(mock_openai_client, sample_dataset):
253
278
 
254
279
 
255
280
  @pytest.fixture
256
- def mock_singleturn_env_completion(mock_openai_client):
281
+ def mock_singleturn_env_completion(mock_client):
257
282
  """Return a SingleTurnEnv for completion format testing."""
258
283
  completion_dataset = Dataset.from_dict(
259
284
  {
@@ -262,7 +287,7 @@ def mock_singleturn_env_completion(mock_openai_client):
262
287
  }
263
288
  )
264
289
  return SingleTurnEnv(
265
- client=mock_openai_client,
290
+ client=mock_client,
266
291
  model="test-model",
267
292
  dataset=completion_dataset,
268
293
  message_type="completion",
@@ -335,10 +360,10 @@ class SimpleMultiTurnEnv(MultiTurnEnv):
335
360
 
336
361
 
337
362
  @pytest.fixture
338
- def mock_multiturn_env(mock_openai_client, sample_chat_dataset):
363
+ def mock_multiturn_env(mock_client, sample_chat_dataset):
339
364
  """Return a MultiTurnEnv for basic testing."""
340
365
  return SimpleMultiTurnEnv(
341
- client=mock_openai_client,
366
+ client=mock_client,
342
367
  model="test-model",
343
368
  dataset=sample_chat_dataset,
344
369
  max_turns=3,
@@ -349,10 +374,10 @@ def mock_multiturn_env(mock_openai_client, sample_chat_dataset):
349
374
 
350
375
 
351
376
  @pytest.fixture
352
- def mock_multiturn_env_max_turns(mock_openai_client, sample_chat_dataset):
377
+ def mock_multiturn_env_max_turns(mock_client, sample_chat_dataset):
353
378
  """Return a MultiTurnEnv that tests max_turns limiting."""
354
379
  return SimpleMultiTurnEnv(
355
- client=mock_openai_client,
380
+ client=mock_client,
356
381
  model="test-model",
357
382
  dataset=sample_chat_dataset,
358
383
  max_turns=2,
@@ -377,9 +402,9 @@ class BasicToolEnv(ToolEnv):
377
402
 
378
403
 
379
404
  @pytest.fixture
380
- def mock_tool_env(mock_openai_client, sample_chat_dataset):
405
+ def mock_tool_env(mock_client, sample_chat_dataset):
381
406
  return BasicToolEnv(
382
- client=mock_openai_client,
407
+ client=mock_client,
383
408
  model="test-model",
384
409
  dataset=sample_chat_dataset,
385
410
  parser=Parser(),
@@ -413,9 +438,9 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
413
438
 
414
439
 
415
440
  @pytest.fixture
416
- def mock_stateful_tool_env(mock_openai_client, sample_chat_dataset):
441
+ def mock_stateful_tool_env(mock_client, sample_chat_dataset):
417
442
  return ExampleStatefulToolEnv(
418
- client=mock_openai_client,
443
+ client=mock_client,
419
444
  model="test-model",
420
445
  dataset=sample_chat_dataset,
421
446
  parser=Parser(),
@@ -461,7 +486,7 @@ def make_state() -> Callable[..., State]:
461
486
  is_completed: bool = True,
462
487
  is_truncated: bool = False,
463
488
  stop_condition: str | None = "max_turns_reached",
464
- oai_tools: list[ChatCompletionToolParam] | None = None,
489
+ tool_defs: list[Tool] | None = None,
465
490
  trajectory: list[TrajectoryStep] = [],
466
491
  timing=RolloutTiming(
467
492
  generation_ms=0.0,
@@ -483,7 +508,7 @@ def make_state() -> Callable[..., State]:
483
508
  is_completed=is_completed,
484
509
  is_truncated=is_truncated,
485
510
  stop_condition=stop_condition,
486
- oai_tools=oai_tools,
511
+ tool_defs=tool_defs,
487
512
  trajectory=trajectory,
488
513
  timing=timing,
489
514
  error=None,
@@ -529,11 +554,14 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
529
554
  time_ms: float = 0.0,
530
555
  avg_reward: float = 0.0,
531
556
  avg_metrics: dict[str, float] = {},
557
+ pass_at_k: dict[str, float] = {},
558
+ pass_all_k: dict[str, float] = {},
559
+ pass_threshold: float = 0.5,
532
560
  usage: dict[str, float] | None = None,
533
561
  version_info: dict | None = None,
534
562
  state_columns: list[str] = ["foo"],
535
563
  path_to_save: Path = Path("test.jsonl"),
536
- tools: list[ChatCompletionToolParam] | None = None,
564
+ tools: list[Tool] | None = None,
537
565
  ) -> GenerateMetadata:
538
566
  if version_info is None:
539
567
  version_info = {
@@ -554,6 +582,9 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
554
582
  time_ms=time_ms,
555
583
  avg_reward=avg_reward,
556
584
  avg_metrics=avg_metrics,
585
+ pass_at_k=pass_at_k,
586
+ pass_all_k=pass_all_k,
587
+ pass_threshold=pass_threshold,
557
588
  usage=usage,
558
589
  version_info=version_info,
559
590
  state_columns=state_columns,
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+
3
+ from verifiers.scripts import build
4
+
5
+
6
+ def test_resolve_env_push_target_defaults_to_environments_dir(tmp_path: Path):
7
+ base_dir = tmp_path / "workspace" / "environments"
8
+ env_name, env_path = build._resolve_env_push_target("my-env", str(base_dir))
9
+
10
+ assert env_name == "my-env"
11
+ assert env_path == (base_dir / "my_env").resolve()
12
+
13
+
14
+ def test_resolve_env_push_target_appends_env_id_to_custom_base_path(tmp_path: Path):
15
+ base_dir = tmp_path / "workspace" / "custom_envs"
16
+ env_name, env_path = build._resolve_env_push_target("env-name", str(base_dir))
17
+
18
+ assert env_name == "env-name"
19
+ assert env_path == (base_dir / "env_name").resolve()
20
+
21
+
22
+ def test_resolve_env_push_target_uses_explicit_environment_path_when_env_id_missing(
23
+ tmp_path: Path,
24
+ ):
25
+ explicit_env_path = tmp_path / "workspace" / "environments" / "already_normalized"
26
+ env_name, env_path = build._resolve_env_push_target(None, str(explicit_env_path))
27
+
28
+ assert env_name == "already-normalized"
29
+ assert env_path == explicit_env_path.resolve()
@@ -159,6 +159,49 @@ class TestCliAgentEnv:
159
159
  response = await env.env_response(messages, state)
160
160
  assert response == []
161
161
 
162
+ @pytest.mark.asyncio
163
+ async def test_non_streaming_intercept_tools_use_oai_schema(
164
+ self, sample_dataset, mock_client
165
+ ):
166
+ """OpenAI-formatted intercepted tools should work for non-streaming requests."""
167
+ env = vf.CliAgentEnv(
168
+ run_command="python agent.py",
169
+ dataset=sample_dataset,
170
+ rubric=vf.Rubric(),
171
+ )
172
+ state = await env.init_state(
173
+ input=sample_dataset[0],
174
+ client=mock_client,
175
+ model="test-model",
176
+ )
177
+ request_id = "req-test"
178
+ state["current_request_id"] = request_id
179
+ env._interception_server.intercepts[request_id] = {
180
+ "stream": False,
181
+ "tools": [
182
+ {
183
+ "type": "function",
184
+ "function": {
185
+ "name": "echo",
186
+ "description": "echo tool",
187
+ "parameters": {},
188
+ },
189
+ }
190
+ ],
191
+ }
192
+
193
+ response = await env.get_model_response(
194
+ state=state,
195
+ prompt=sample_dataset[0]["prompt"],
196
+ client=mock_client,
197
+ model="test-model",
198
+ )
199
+
200
+ assert isinstance(response, vf.Response)
201
+ kwargs = mock_client.last_call_kwargs
202
+ assert kwargs["tools"] is not None
203
+ assert kwargs["tools"][0].name == "echo"
204
+
162
205
 
163
206
  class TestHarborEnv:
164
207
  """Tests for HarborEnv."""