verifiers 0.1.13.dev8__tar.gz → 0.1.15.dev0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (303)
  1. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/.gitignore +1 -1
  2. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/PKG-INFO +82 -5
  3. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/README.md +77 -2
  4. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/pyproject.toml +27 -5
  5. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/conftest.py +9 -12
  6. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_cli_agent_env.py +156 -3
  7. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_composable_env.py +4 -4
  8. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_decorator_ranks.py +43 -4
  9. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_endpoint_registry.py +33 -65
  10. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_env_group.py +51 -52
  11. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_env_server.py +67 -1
  12. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_environment.py +85 -3
  13. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_environment_extra.py +0 -2
  14. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_envs.py +28 -3
  15. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_eval_cli.py +140 -16
  16. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_eval_utils.py +24 -0
  17. verifiers-0.1.15.dev0/tests/test_gepa_cli.py +251 -0
  18. verifiers-0.1.15.dev0/tests/test_gepa_utils.py +155 -0
  19. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_interception_utils.py +105 -0
  20. verifiers-0.1.15.dev0/tests/test_langchain_deep_agents_wikispeedia.py +312 -0
  21. verifiers-0.1.15.dev0/tests/test_lean_task.py +344 -0
  22. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_logging.py +50 -0
  23. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_math_rubric.py +4 -21
  24. verifiers-0.1.15.dev0/tests/test_mcp_search_env.py +61 -0
  25. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_multiturn_env.py +70 -2
  26. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_nemorl_client.py +146 -35
  27. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_openai_chat_completions_token_client.py +6 -2
  28. verifiers-0.1.15.dev0/tests/test_openai_responses_client.py +338 -0
  29. verifiers-0.1.15.dev0/tests/test_opencode_harbor.py +100 -0
  30. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_opencode_rlm_env.py +8 -3
  31. verifiers-0.1.15.dev0/tests/test_per_turn_timing.py +68 -0
  32. verifiers-0.1.15.dev0/tests/test_renderer_client.py +651 -0
  33. verifiers-0.1.15.dev0/tests/test_renderer_e2e.py +417 -0
  34. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_rlm_composable_env.py +2 -2
  35. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_rlm_env.py +3 -0
  36. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_rubric.py +9 -59
  37. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_rubric_group.py +12 -67
  38. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_save_utils.py +11 -11
  39. verifiers-0.1.15.dev0/tests/test_setup_script.py +32 -0
  40. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_singleturn_env.py +7 -35
  41. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_stateful_tool_env.py +3 -5
  42. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_tool_env.py +4 -8
  43. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_tool_utils.py +31 -1
  44. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_trajectory_processing.py +0 -3
  45. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_tui_info_formatting.py +9 -1
  46. verifiers-0.1.15.dev0/tests/test_types.py +11 -0
  47. verifiers-0.1.15.dev0/tests/test_v1_bfcl.py +55 -0
  48. verifiers-0.1.15.dev0/tests/test_v1_config_extension.py +1599 -0
  49. verifiers-0.1.15.dev0/tests/test_v1_endpoint_protocols.py +222 -0
  50. verifiers-0.1.15.dev0/tests/test_v1_example_counts.py +117 -0
  51. verifiers-0.1.15.dev0/tests/test_v1_group_reward_env.py +39 -0
  52. verifiers-0.1.15.dev0/tests/test_v1_harbor_cli.py +209 -0
  53. verifiers-0.1.15.dev0/tests/test_v1_mini_swe_agent.py +65 -0
  54. verifiers-0.1.15.dev0/tests/test_v1_rlm_swe.py +275 -0
  55. verifiers-0.1.15.dev0/tests/test_v1_runtime_lifecycle.py +1897 -0
  56. verifiers-0.1.15.dev0/tests/test_v1_scoring_functions.py +152 -0
  57. verifiers-0.1.15.dev0/tests/test_wordle_env.py +22 -0
  58. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/__init__.py +88 -8
  59. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/__init__.py +29 -0
  60. verifiers-0.1.15.dev0/verifiers/clients/nemorl_chat_completions_client.py +117 -0
  61. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/openai_chat_completions_client.py +2 -0
  62. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/openai_chat_completions_token_client.py +38 -4
  63. verifiers-0.1.15.dev0/verifiers/clients/openai_responses_client.py +443 -0
  64. verifiers-0.1.15.dev0/verifiers/clients/renderer_client.py +606 -0
  65. verifiers-0.1.15.dev0/verifiers/decorators.py +296 -0
  66. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/AGENTS.md +2 -1
  67. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/env_group.py +192 -62
  68. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/environment.py +113 -74
  69. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/README.md +5 -0
  70. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/__init__.py +2 -0
  71. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/cli_agent_env.py +51 -13
  72. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/README.md +8 -3
  73. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/__init__.py +2 -0
  74. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/composable_env.py +12 -0
  75. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harness.py +9 -0
  76. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +5 -2
  77. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/opencode.py +44 -12
  78. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/rlm.py +52 -5
  79. verifiers-0.1.15.dev0/verifiers/envs/experimental/composable/swe_debug_env.py +327 -0
  80. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/task.py +13 -8
  81. verifiers-0.1.15.dev0/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +13 -0
  82. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +138 -27
  83. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +3 -11
  84. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -5
  85. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -7
  86. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -8
  87. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -5
  88. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +1 -12
  89. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -6
  90. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +2 -2
  91. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/harbor_env/env.py +4 -3
  92. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/mcp_env.py +9 -12
  93. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/opencode_env.py +29 -14
  94. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/opencode_rlm_env.py +28 -14
  95. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/rlm_env.py +22 -3
  96. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/sandbox_mixin.py +2 -1
  97. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/browser_env.py +6 -3
  98. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/modes/base.py +1 -1
  99. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +2 -1
  100. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +2 -1
  101. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/openenv_env.py +3 -2
  102. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/reasoninggym_env.py +1 -1
  103. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/textarena_env.py +2 -1
  104. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/multiturn_env.py +33 -7
  105. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/python_env.py +5 -2
  106. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/sandbox_env.py +6 -3
  107. verifiers-0.1.15.dev0/verifiers/gepa/gepa_utils.py +322 -0
  108. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/experimental/hybrid_math_rubric.py +1 -1
  109. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/rubric.py +137 -34
  110. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/rubric_group.py +23 -14
  111. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/eval.py +10 -6
  112. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/gepa.py +274 -37
  113. verifiers-0.1.15.dev0/verifiers/scripts/setup.py +33 -0
  114. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/tui.py +34 -10
  115. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/client/zmq_env_client.py +4 -1
  116. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/types.py +176 -18
  117. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/async_utils.py +18 -0
  118. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/display_utils.py +90 -3
  119. verifiers-0.1.15.dev0/verifiers/utils/env_config_utils.py +45 -0
  120. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/env_utils.py +53 -2
  121. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/error_utils.py +33 -0
  122. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/eval_display.py +61 -50
  123. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/eval_utils.py +200 -116
  124. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/interception_utils.py +385 -17
  125. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/logging_utils.py +9 -0
  126. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/response_utils.py +2 -0
  127. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/save_utils.py +33 -13
  128. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/threaded_sandbox_client.py +35 -5
  129. verifiers-0.1.15.dev0/verifiers/v1/README.md +1594 -0
  130. verifiers-0.1.15.dev0/verifiers/v1/RE_MIGRATION.md +804 -0
  131. verifiers-0.1.15.dev0/verifiers/v1/__init__.py +93 -0
  132. verifiers-0.1.15.dev0/verifiers/v1/config.py +455 -0
  133. verifiers-0.1.15.dev0/verifiers/v1/env.py +136 -0
  134. verifiers-0.1.15.dev0/verifiers/v1/harness.py +598 -0
  135. verifiers-0.1.15.dev0/verifiers/v1/packages/__init__.py +1 -0
  136. verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/__init__.py +8 -0
  137. verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/cli.py +121 -0
  138. verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/configs.py +74 -0
  139. verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/mini_swe_agent.py +247 -0
  140. verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/opencode.py +298 -0
  141. verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/pi.py +212 -0
  142. verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/rlm.py +265 -0
  143. verifiers-0.1.15.dev0/verifiers/v1/packages/tasksets/__init__.py +3 -0
  144. verifiers-0.1.15.dev0/verifiers/v1/packages/tasksets/harbor.py +407 -0
  145. verifiers-0.1.15.dev0/verifiers/v1/runtime.py +1931 -0
  146. verifiers-0.1.15.dev0/verifiers/v1/state.py +401 -0
  147. verifiers-0.1.15.dev0/verifiers/v1/task.py +177 -0
  148. verifiers-0.1.15.dev0/verifiers/v1/taskset.py +269 -0
  149. verifiers-0.1.15.dev0/verifiers/v1/toolset.py +352 -0
  150. verifiers-0.1.15.dev0/verifiers/v1/user.py +85 -0
  151. verifiers-0.1.15.dev0/verifiers/v1/utils/__init__.py +1 -0
  152. verifiers-0.1.15.dev0/verifiers/v1/utils/artifact_utils.py +31 -0
  153. verifiers-0.1.15.dev0/verifiers/v1/utils/endpoint_utils.py +671 -0
  154. verifiers-0.1.15.dev0/verifiers/v1/utils/json_utils.py +11 -0
  155. verifiers-0.1.15.dev0/verifiers/v1/utils/judge_utils.py +63 -0
  156. verifiers-0.1.15.dev0/verifiers/v1/utils/lifecycle_utils.py +96 -0
  157. verifiers-0.1.15.dev0/verifiers/v1/utils/mcp_proxy_utils.py +233 -0
  158. verifiers-0.1.15.dev0/verifiers/v1/utils/mcp_utils.py +148 -0
  159. verifiers-0.1.15.dev0/verifiers/v1/utils/program_utils.py +483 -0
  160. verifiers-0.1.15.dev0/verifiers/v1/utils/prompt_utils.py +136 -0
  161. verifiers-0.1.15.dev0/verifiers/v1/utils/sandbox_program_utils.py +770 -0
  162. verifiers-0.1.15.dev0/verifiers/v1/utils/sandbox_utils.py +822 -0
  163. verifiers-0.1.15.dev0/verifiers/v1/utils/scoring_utils.py +379 -0
  164. verifiers-0.1.15.dev0/verifiers/v1/utils/timing_utils.py +36 -0
  165. verifiers-0.1.15.dev0/verifiers/v1/utils/tool_utils.py +19 -0
  166. verifiers-0.1.15.dev0/verifiers/v1/utils/trajectory_utils.py +78 -0
  167. verifiers-0.1.13.dev8/tests/test_gepa_cli.py +0 -115
  168. verifiers-0.1.13.dev8/tests/test_opencode_harbor.py +0 -57
  169. verifiers-0.1.13.dev8/tests/test_setup_script.py +0 -288
  170. verifiers-0.1.13.dev8/verifiers/clients/nemorl_chat_completions_client.py +0 -87
  171. verifiers-0.1.13.dev8/verifiers/decorators.py +0 -147
  172. verifiers-0.1.13.dev8/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -3
  173. verifiers-0.1.13.dev8/verifiers/gepa/gepa_utils.py +0 -116
  174. verifiers-0.1.13.dev8/verifiers/scripts/prime_rl.py +0 -197
  175. verifiers-0.1.13.dev8/verifiers/scripts/setup.py +0 -611
  176. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/LICENSE +0 -0
  177. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/AGENTS.md +0 -0
  178. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/README.md +0 -0
  179. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/__init__.py +0 -0
  180. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_browser_env.py +0 -0
  181. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_build_script.py +0 -0
  182. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_client_auth_errors.py +0 -0
  183. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_client_config.py +0 -0
  184. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_client_multimodal_types.py +0 -0
  185. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_context_token_metrics.py +0 -0
  186. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_error_chain.py +0 -0
  187. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_eval_display.py +0 -0
  188. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_gym_env.py +0 -0
  189. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_harbor_env_mcp.py +0 -0
  190. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_imports.py +0 -0
  191. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_install_utils.py +0 -0
  192. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_maybe_think_parser.py +0 -0
  193. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_message_utils.py +0 -0
  194. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_message_utils_multimodal.py +0 -0
  195. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_parser.py +0 -0
  196. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_path_utils.py +0 -0
  197. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_prime_plugin.py +0 -0
  198. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_sandbox_env.py +0 -0
  199. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_sandbox_mixin.py +0 -0
  200. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_think_parser.py +0 -0
  201. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_xml_parser.py +0 -0
  202. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/AGENTS.md +0 -0
  203. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/__init__.py +0 -0
  204. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/__init__.py +0 -0
  205. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/build.py +0 -0
  206. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/eval.py +0 -0
  207. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/gepa.py +0 -0
  208. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/init.py +0 -0
  209. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/install.py +0 -0
  210. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/setup.py +0 -0
  211. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/plugins/__init__.py +0 -0
  212. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/plugins/prime.py +0 -0
  213. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/tui.py +0 -0
  214. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/anthropic_messages_client.py +0 -0
  215. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/client.py +0 -0
  216. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/openai_completions_client.py +0 -0
  217. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/__init__.py +0 -0
  218. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/_filter.py +0 -0
  219. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
  220. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  221. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  222. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  223. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
  224. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  225. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  226. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  227. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  228. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
  229. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  230. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
  231. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  232. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  233. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
  234. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/gym_env.py +0 -0
  235. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
  236. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
  237. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  238. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/utils/__init__.py +0 -0
  239. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/utils/file_locks.py +0 -0
  240. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
  241. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/README.md +0 -0
  242. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/__init__.py +0 -0
  243. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/README.md +0 -0
  244. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  245. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  246. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/singleturn_env.py +0 -0
  247. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/stateful_tool_env.py +0 -0
  248. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/tool_env.py +0 -0
  249. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/errors.py +0 -0
  250. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/gepa/__init__.py +0 -0
  251. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/gepa/adapter.py +0 -0
  252. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/gepa/config.py +0 -0
  253. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/gepa/display.py +0 -0
  254. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/__init__.py +0 -0
  255. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/maybe_think_parser.py +0 -0
  256. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/parser.py +0 -0
  257. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/think_parser.py +0 -0
  258. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/xml_parser.py +0 -0
  259. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/README.md +0 -0
  260. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/__init__.py +0 -0
  261. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/inference/__init__.py +0 -0
  262. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/inference/client.py +0 -0
  263. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/inference/server.py +0 -0
  264. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/__init__.py +0 -0
  265. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/config.py +0 -0
  266. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/orchestrator.py +0 -0
  267. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/trainer.py +0 -0
  268. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/utils.py +0 -0
  269. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/__init__.py +0 -0
  270. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/judge_rubric.py +0 -0
  271. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/math_rubric.py +0 -0
  272. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/__init__.py +0 -0
  273. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/build.py +0 -0
  274. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/init.py +0 -0
  275. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/install.py +0 -0
  276. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/rl.py +0 -0
  277. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/train.py +0 -0
  278. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/vllm.py +0 -0
  279. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/__init__.py +0 -0
  280. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/client/env_client.py +0 -0
  281. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/__init__.py +0 -0
  282. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/env_router.py +0 -0
  283. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/env_server.py +0 -0
  284. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/env_worker.py +0 -0
  285. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/zmq_env_server.py +0 -0
  286. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/types.py +0 -0
  287. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/__init__.py +0 -0
  288. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/client_utils.py +0 -0
  289. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/config_utils.py +0 -0
  290. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/data_utils.py +0 -0
  291. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/heartbeat.py +0 -0
  292. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/import_utils.py +0 -0
  293. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/install_utils.py +0 -0
  294. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/message_utils.py +0 -0
  295. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/metric_utils.py +0 -0
  296. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/path_utils.py +0 -0
  297. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/process_utils.py +0 -0
  298. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/serve_utils.py +0 -0
  299. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/thread_utils.py +0 -0
  300. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/tool_utils.py +0 -0
  301. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/tunnel_utils.py +0 -0
  302. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/usage_utils.py +0 -0
  303. {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/version_utils.py +0 -0
@@ -4,7 +4,6 @@ venv/
4
4
  env/
5
5
  .env
6
6
  .env.local
7
- uv.lock
8
7
  .claude/
9
8
  .cursorrules
10
9
  .ropeproject/
@@ -22,6 +21,7 @@ _build/
22
21
  docs/build/
23
22
  *.egg-info/
24
23
  __pycache__/
24
+ environments/**/uv.lock
25
25
 
26
26
  .pytest_cache/
27
27
  .ruff_cache/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.13.dev8
3
+ Version: 0.1.15.dev0
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -26,6 +26,7 @@ Requires-Dist: aiolimiter>=1.2.1
26
26
  Requires-Dist: anthropic>=0.78.0
27
27
  Requires-Dist: datasets<4.7.0,>=3.0.0
28
28
  Requires-Dist: gepa
29
+ Requires-Dist: httpx>=0.27.0
29
30
  Requires-Dist: jinja2>=3.1.6
30
31
  Requires-Dist: math-verify>=0.8.0
31
32
  Requires-Dist: mcp>=1.14.1
@@ -34,7 +35,7 @@ Requires-Dist: nest-asyncio>=1.6.0
34
35
  Requires-Dist: numpy
35
36
  Requires-Dist: openai-agents>=0.0.7
36
37
  Requires-Dist: openai>=1.108.1
37
- Requires-Dist: prime-sandboxes>=0.2.21
38
+ Requires-Dist: prime-sandboxes>=0.2.25
38
39
  Requires-Dist: prime-tunnel>=0.1.6
39
40
  Requires-Dist: pydantic>=2.11.9
40
41
  Requires-Dist: pyzmq>=27.1.0
@@ -46,13 +47,14 @@ Requires-Dist: tenacity>=8.5.0
46
47
  Requires-Dist: textual
47
48
  Requires-Dist: tomli; python_version < '3.11'
48
49
  Requires-Dist: typing-extensions; python_version < '3.12'
49
- Requires-Dist: wget>=3.2
50
50
  Provides-Extra: browser
51
51
  Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
52
52
  Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
53
53
  Requires-Dist: stagehand>=3.0.0; extra == 'browser'
54
54
  Provides-Extra: openenv
55
55
  Requires-Dist: openenv-core[core]==0.2.1; extra == 'openenv'
56
+ Provides-Extra: renderers
57
+ Requires-Dist: renderers>=0.1.6; extra == 'renderers'
56
58
  Provides-Extra: rg
57
59
  Requires-Dist: reasoning-gym; extra == 'rg'
58
60
  Provides-Extra: rl
@@ -107,7 +109,9 @@ Verifiers: Environments for LLM Reinforcement Learning
107
109
 
108
110
  ## News & Updates
109
111
 
110
- - [04/17/26] v0.1.12 is released, featuring a new composable Task/Agent/Environment architecture, upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
112
+ - [05/07/26] v0.1.14 is released, featuring the v1 Taskset/Harness API, shared eval and training config shape, model-family starter configs, OpenAI Responses and renderer-backed clients, per-turn timing, GEPA prompt artifacts, Lean guard markers, and release/infrastructure hardening.
113
+ - [04/28/26] v0.1.13.dev8 is released, featuring per-rollout wall-clock timeouts for `MultiTurnEnv`, CLI timeout config, sandbox timeout propagation, and smaller `CliAgentEnv` and RLM fixes.
114
+ - [04/17/26] v0.1.12 is released, featuring upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
111
115
  - [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
112
116
  - [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
113
117
  - [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
@@ -197,11 +201,82 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
197
201
  async def correct_answer(completion, answer) -> float:
198
202
  completion_ans = completion[-1]['content']
199
203
  return 1.0 if completion_ans == answer else 0.0
200
- rubric = Rubric(funcs=[correct_answer])
204
+ rubric = vf.Rubric(funcs=[correct_answer])
201
205
  env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
202
206
  return env
203
207
  ```
204
208
 
209
+ For new environments with reusable tasksets, toolsets, custom programs, or
210
+ custom harnesses, use the v1 Taskset/Harness path:
211
+ ```python
212
+ # my_env.py
213
+ import verifiers.v1 as vf
214
+
215
+ def source():
216
+ yield {
217
+ "prompt": [{"role": "user", "content": "Reverse abc."}],
218
+ "answer": "cba",
219
+ "max_turns": 1,
220
+ }
221
+
222
+ @vf.reward(weight=1.0)
223
+ async def contains_answer(task, state) -> float:
224
+ return float(task["answer"] in str(state.get("completion") or ""))
225
+
226
+ def load_taskset(config: vf.TasksetConfig | None = None):
227
+ return vf.Taskset(source=source, rewards=[contains_answer], config=config)
228
+
229
+ def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
230
+ config = config or vf.EnvConfig()
231
+ return vf.Env(taskset=load_taskset(config=config.taskset))
232
+ ```
233
+ If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
234
+ **[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
235
+ Reusable taskset and harness packages live under `verifiers.v1.packages` while
236
+ the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
237
+ For example, Harbor task directories can run through the bundled OpenCode CLI
238
+ harness with:
239
+
240
+ ```python
241
+ env = vf.Env(
242
+ taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
243
+ harness=vf.OpenCode(),
244
+ )
245
+ ```
246
+
247
+ The same environment package is the unit used by evals and `prime-rl`. The
248
+ trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
249
+ and harness options stay under `env.taskset` and `env.harness`:
250
+
251
+ ```toml
252
+ # configs/rl/my-v1-env.toml
253
+ model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
254
+ max_steps = 100
255
+ batch_size = 256
256
+ rollouts_per_example = 8
257
+
258
+ [sampling]
259
+ max_tokens = 4096
260
+
261
+ [[env]]
262
+ id = "my-env"
263
+
264
+ [env.args]
265
+ arg1 = "non-th-arg"
266
+
267
+ [env.harness]
268
+ max_turns = 1
269
+
270
+ [env.taskset.scoring.contains_answer]
271
+ weight = 1.0
272
+ ```
273
+
274
+ ```bash
275
+ prime env install my-env
276
+ ```
277
+
278
+ For self-managed training launch commands, refer to the `prime-rl` documentation.
279
+
205
280
  To install the environment module into your project, do:
206
281
  ```bash
207
282
  prime env install my-env # installs from ./environments/my_env
@@ -237,6 +312,8 @@ prime eval run primeintellect/math-python
237
312
 
238
313
  **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
239
314
 
315
+ **[BYO Harness](docs/byo-harness.md)** — Build v1 Taskset/Harness environments with custom tools, sandboxes, users, and custom programs.
316
+
240
317
  **[Evaluation](docs/evaluation.md)** — Evaluate models using your environments.
241
318
 
242
319
  **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
@@ -34,7 +34,9 @@ Verifiers: Environments for LLM Reinforcement Learning
34
34
 
35
35
  ## News & Updates
36
36
 
37
- - [04/17/26] v0.1.12 is released, featuring a new composable Task/Agent/Environment architecture, upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
37
+ - [05/07/26] v0.1.14 is released, featuring the v1 Taskset/Harness API, shared eval and training config shape, model-family starter configs, OpenAI Responses and renderer-backed clients, per-turn timing, GEPA prompt artifacts, Lean guard markers, and release/infrastructure hardening.
38
+ - [04/28/26] v0.1.13.dev8 is released, featuring per-rollout wall-clock timeouts for `MultiTurnEnv`, CLI timeout config, sandbox timeout propagation, and smaller `CliAgentEnv` and RLM fixes.
39
+ - [04/17/26] v0.1.12 is released, featuring upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
38
40
  - [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
39
41
  - [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
40
42
  - [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
@@ -124,11 +126,82 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
124
126
  async def correct_answer(completion, answer) -> float:
125
127
  completion_ans = completion[-1]['content']
126
128
  return 1.0 if completion_ans == answer else 0.0
127
- rubric = Rubric(funcs=[correct_answer])
129
+ rubric = vf.Rubric(funcs=[correct_answer])
128
130
  env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
129
131
  return env
130
132
  ```
131
133
 
134
+ For new environments with reusable tasksets, toolsets, custom programs, or
135
+ custom harnesses, use the v1 Taskset/Harness path:
136
+ ```python
137
+ # my_env.py
138
+ import verifiers.v1 as vf
139
+
140
+ def source():
141
+ yield {
142
+ "prompt": [{"role": "user", "content": "Reverse abc."}],
143
+ "answer": "cba",
144
+ "max_turns": 1,
145
+ }
146
+
147
+ @vf.reward(weight=1.0)
148
+ async def contains_answer(task, state) -> float:
149
+ return float(task["answer"] in str(state.get("completion") or ""))
150
+
151
+ def load_taskset(config: vf.TasksetConfig | None = None):
152
+ return vf.Taskset(source=source, rewards=[contains_answer], config=config)
153
+
154
+ def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
155
+ config = config or vf.EnvConfig()
156
+ return vf.Env(taskset=load_taskset(config=config.taskset))
157
+ ```
158
+ If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
159
+ **[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
160
+ Reusable taskset and harness packages live under `verifiers.v1.packages` while
161
+ the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
162
+ For example, Harbor task directories can run through the bundled OpenCode CLI
163
+ harness with:
164
+
165
+ ```python
166
+ env = vf.Env(
167
+ taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
168
+ harness=vf.OpenCode(),
169
+ )
170
+ ```
171
+
172
+ The same environment package is the unit used by evals and `prime-rl`. The
173
+ trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
174
+ and harness options stay under `env.taskset` and `env.harness`:
175
+
176
+ ```toml
177
+ # configs/rl/my-v1-env.toml
178
+ model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
179
+ max_steps = 100
180
+ batch_size = 256
181
+ rollouts_per_example = 8
182
+
183
+ [sampling]
184
+ max_tokens = 4096
185
+
186
+ [[env]]
187
+ id = "my-env"
188
+
189
+ [env.args]
190
+ arg1 = "non-th-arg"
191
+
192
+ [env.harness]
193
+ max_turns = 1
194
+
195
+ [env.taskset.scoring.contains_answer]
196
+ weight = 1.0
197
+ ```
198
+
199
+ ```bash
200
+ prime env install my-env
201
+ ```
202
+
203
+ For self-managed training launch commands, refer to the `prime-rl` documentation.
204
+
132
205
  To install the environment module into your project, do:
133
206
  ```bash
134
207
  prime env install my-env # installs from ./environments/my_env
@@ -164,6 +237,8 @@ prime eval run primeintellect/math-python
164
237
 
165
238
  **[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
166
239
 
240
+ **[BYO Harness](docs/byo-harness.md)** — Build v1 Taskset/Harness environments with custom tools, sandboxes, users, and custom programs.
241
+
167
242
  **[Evaluation](docs/evaluation.md)** — Evaluate models using your environments.
168
243
 
169
244
  **[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
@@ -38,7 +38,7 @@ dependencies = [
38
38
  "openai>=1.108.1",
39
39
  "openai-agents>=0.0.7",
40
40
  "prime-tunnel>=0.1.6",
41
- "prime-sandboxes>=0.2.21",
41
+ "prime-sandboxes>=0.2.25",
42
42
  "pydantic>=2.11.9",
43
43
  "requests",
44
44
  "rich",
@@ -46,13 +46,13 @@ dependencies = [
46
46
  "textual",
47
47
  "tomli; python_version < '3.11'",
48
48
  "typing_extensions; python_version < '3.12'",
49
- "wget>=3.2",
50
49
  "gepa",
51
50
  "pyzmq>=27.1.0",
52
51
  "msgpack>=1.1.2",
53
52
  "aiolimiter>=1.2.1",
54
53
  "setproctitle>=1.3.0",
55
- "regex<2026.4.4", # 2026.4.4 missing cp312/cp313 wheels
54
+ "regex<2026.4.4",
55
+ "httpx>=0.27.0",
56
56
  ]
57
57
 
58
58
  [dependency-groups]
@@ -73,6 +73,7 @@ dev = [
73
73
  "aiohttp>=3.9.0",
74
74
  "python-dotenv>=1.0.0",
75
75
  "nltk",
76
+ "renderers>=0.1.6",
76
77
  ]
77
78
 
78
79
  [project.optional-dependencies]
@@ -91,6 +92,9 @@ browser = [
91
92
  "aiohttp>=3.9.0",
92
93
  "python-dotenv>=1.0.0",
93
94
  ]
95
+ renderers = [
96
+ "renderers>=0.1.6",
97
+ ]
94
98
  rl = [
95
99
  "torch>=2.8.0,<2.9.0",
96
100
  "transformers>=4.56.2",
@@ -108,6 +112,24 @@ rl = [
108
112
  preview = true
109
113
  required-version = ">=0.11.1"
110
114
 
115
+ [[tool.uv.index]]
116
+ name = "pypi"
117
+ url = "https://pypi.org/simple"
118
+ default = true
119
+ exclude-newer = "7 days"
120
+
121
+ [tool.uv.exclude-newer-package]
122
+ # PrimeIntellect-published on PyPI (trusted publisher)
123
+ prime-tunnel = false
124
+ prime-sandboxes = false
125
+ renderers = false
126
+
127
+ [tool.uv.sources]
128
+ # Pinned to renderers main until the next PyPI release lands; drop after.
129
+ # fe67f9f = renderers main: PR #4 squash-merge — construction-time
130
+ # preserve_*_thinking flags on create_renderer / create_renderer_pool.
131
+ renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "fe67f9f" }
132
+
111
133
  [tool.uv.extra-build-dependencies]
112
134
  flash-attn = [{ requirement = "torch", match-runtime = true }]
113
135
 
@@ -125,7 +147,6 @@ vf-rl = "verifiers.scripts.rl:main"
125
147
  vf-train = "verifiers.scripts.train:main"
126
148
  vf-tui = "verifiers.scripts.tui:main"
127
149
  vf-vllm = "verifiers.scripts.vllm:main"
128
- prime-rl = "verifiers.scripts.prime_rl:main"
129
150
 
130
151
  # hatchling configuration
131
152
  [tool.hatch.version]
@@ -170,6 +191,7 @@ addopts = [
170
191
  markers = [
171
192
  "slow: marks tests as slow (deselect with '-m \"not slow\"')",
172
193
  "integration: marks tests as integration tests",
194
+ "prime_sandbox: marks tests that provision real Prime sandbox or tunnel resources",
173
195
  "unit: marks tests as unit tests",
174
196
  "asyncio: marks tests as async tests",
175
197
  "parsers: marks tests for parser components",
@@ -195,7 +217,7 @@ unknown-argument = "warn"
195
217
  redundant-cast = "ignore"
196
218
 
197
219
  [tool.ty.src]
198
- exclude = ["environments"]
220
+ exclude = ["environments", "verifiers/v1/sketch.py"]
199
221
 
200
222
  [[tool.ty.overrides]]
201
223
  include = ["verifiers/envs/experimental/composable/tasksets/**"]
@@ -425,9 +425,10 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
425
425
  super().__init__(tools=[offset_tool], **kwargs)
426
426
 
427
427
  async def setup_state(self, state, **kwargs):
428
- await super().setup_state(state, **kwargs)
428
+ state = await super().setup_state(state, **kwargs)
429
429
  state["offset"] = 3
430
430
  state["update_calls"] = 0
431
+ return state
431
432
 
432
433
  def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
433
434
  state["update_calls"] += 1
@@ -457,13 +458,15 @@ def make_input() -> Callable[..., RolloutInput]:
457
458
 
458
459
  def _make_input(
459
460
  example_id: int = 0,
460
- task: str = "default",
461
461
  prompt: Messages = DEFAULT_PROMPT,
462
462
  info: Info = {},
463
463
  answer: str = "4",
464
464
  ) -> RolloutInput:
465
465
  return RolloutInput(
466
- example_id=example_id, task=task, prompt=prompt, answer=answer, info=info
466
+ example_id=example_id,
467
+ prompt=prompt,
468
+ answer=answer,
469
+ info=info,
467
470
  )
468
471
 
469
472
  return _make_input
@@ -475,7 +478,6 @@ def make_state() -> Callable[..., State]:
475
478
 
476
479
  def _make_state(
477
480
  example_id: int = 0,
478
- task: str = "default",
479
481
  prompt: Messages = DEFAULT_PROMPT,
480
482
  answer: str = "4",
481
483
  info: Info = {},
@@ -487,17 +489,12 @@ def make_state() -> Callable[..., State]:
487
489
  stop_condition: str | None = "max_turns_reached",
488
490
  tool_defs: list[Tool] | None = None,
489
491
  trajectory: list[TrajectoryStep] = [],
490
- timing=RolloutTiming(
491
- generation_ms=0.0,
492
- scoring_ms=0.0,
493
- total_ms=0.0,
494
- ),
492
+ timing=RolloutTiming(),
495
493
  foo: str = "bar", # custom field
496
494
  **kwargs,
497
495
  ) -> State:
498
496
  return State(
499
497
  example_id=example_id,
500
- task=task,
501
498
  prompt=prompt,
502
499
  answer=answer,
503
500
  info=info,
@@ -550,7 +547,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
550
547
  rollouts_per_example: int = 1,
551
548
  sampling_args: SamplingArgs = {},
552
549
  date: str = "1970-01-01",
553
- time_ms: float = 0.0,
550
+ time: float = 0.0,
554
551
  avg_reward: float = 0.0,
555
552
  avg_metrics: dict[str, float] = {},
556
553
  pass_at_k: dict[str, float] = {},
@@ -578,7 +575,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
578
575
  rollouts_per_example=rollouts_per_example,
579
576
  sampling_args=sampling_args,
580
577
  date=date,
581
- time_ms=time_ms,
578
+ time=time,
582
579
  avg_reward=avg_reward,
583
580
  avg_metrics=avg_metrics,
584
581
  pass_at_k=pass_at_k,
@@ -1,5 +1,6 @@
1
1
  """Tests for CliAgentEnv and HarborEnv."""
2
2
 
3
+ import asyncio
3
4
  import tempfile
4
5
  from pathlib import Path
5
6
  from unittest.mock import AsyncMock, MagicMock, patch
@@ -8,6 +9,7 @@ import pytest
8
9
  from datasets import Dataset
9
10
 
10
11
  import verifiers as vf
12
+ from verifiers.utils.interception_utils import serialize_intercept_response
11
13
 
12
14
 
13
15
  @pytest.fixture
@@ -100,6 +102,11 @@ class TestCliAgentEnv:
100
102
  env_vars = await env.build_env_vars(state)
101
103
 
102
104
  assert env_vars["OPENAI_BASE_URL"] == "https://test.trycloudflare.com/v1"
105
+ assert env_vars["OPENAI_API_KEY"] == env._require_interception_server().secret
106
+ assert env_vars["ANTHROPIC_BASE_URL"] == "https://test.trycloudflare.com"
107
+ assert (
108
+ env_vars["ANTHROPIC_API_KEY"] == env._require_interception_server().secret
109
+ )
103
110
  assert env_vars["OPENAI_MODEL"] == "gpt-4"
104
111
  assert env_vars["CUSTOM_VAR"] == "value"
105
112
 
@@ -217,6 +224,152 @@ class TestCliAgentEnv:
217
224
  assert kwargs["tools"][0].name == "echo"
218
225
 
219
226
 
227
+ @pytest.mark.asyncio
228
+ async def test_cli_agent_env_delivers_intercepted_tool_call_response(
229
+ sample_dataset, mock_client
230
+ ):
231
+ env = vf.CliAgentEnv(
232
+ run_command="python agent.py",
233
+ dataset=sample_dataset,
234
+ rubric=vf.Rubric(),
235
+ )
236
+ prompt = sample_dataset[0]["prompt"]
237
+ tool_call = {
238
+ "id": "call_echo",
239
+ "type": "function",
240
+ "function": {"name": "echo", "arguments": '{"text": "hello"}'},
241
+ }
242
+ mock_client.add_response(
243
+ prompt,
244
+ "",
245
+ finish_reason="tool_calls",
246
+ tool_calls=[tool_call],
247
+ )
248
+
249
+ state = await env.init_state(
250
+ input=sample_dataset[0],
251
+ client=mock_client,
252
+ model="test-model",
253
+ )
254
+ response_future = asyncio.Future()
255
+ request_id = "req-tool-call"
256
+ state["current_request_id"] = request_id
257
+ env._interception_server.intercepts[request_id] = {
258
+ "stream": False,
259
+ "tools": [
260
+ {
261
+ "type": "function",
262
+ "function": {
263
+ "name": "echo",
264
+ "description": "Return the provided text.",
265
+ "parameters": {
266
+ "type": "object",
267
+ "properties": {"text": {"type": "string"}},
268
+ },
269
+ },
270
+ }
271
+ ],
272
+ "response_future": response_future,
273
+ }
274
+
275
+ response = await env.get_model_response(
276
+ state=state,
277
+ prompt=prompt,
278
+ client=mock_client,
279
+ model="test-model",
280
+ )
281
+
282
+ assert response_future.done()
283
+ assert response_future.result() is response
284
+ assert state["current_request_id"] is None
285
+
286
+ payload = serialize_intercept_response(response_future.result())
287
+ choice = payload["choices"][0]
288
+ assert choice["finish_reason"] == "tool_calls"
289
+ assert choice["message"]["tool_calls"] == [tool_call]
290
+ assert mock_client.last_call_kwargs["tools"][0].name == "echo"
291
+
292
+
293
+ @pytest.mark.asyncio
294
+ async def test_cli_agent_env_synthesizes_stream_for_intercepted_tool_call_response(
295
+ sample_dataset, mock_client
296
+ ):
297
+ env = vf.CliAgentEnv(
298
+ run_command="python agent.py",
299
+ dataset=sample_dataset,
300
+ rubric=vf.Rubric(),
301
+ )
302
+ prompt = sample_dataset[0]["prompt"]
303
+ tool_call = {
304
+ "id": "call_echo",
305
+ "type": "function",
306
+ "function": {"name": "echo", "arguments": '{"text": "hello"}'},
307
+ }
308
+ mock_client.add_response(
309
+ prompt,
310
+ "",
311
+ finish_reason="tool_calls",
312
+ tool_calls=[tool_call],
313
+ )
314
+
315
+ state = await env.init_state(
316
+ input=sample_dataset[0],
317
+ client=mock_client,
318
+ model="test-model",
319
+ )
320
+ chunk_queue = asyncio.Queue()
321
+ response_future = asyncio.Future()
322
+ request_id = "req-stream-tool-call"
323
+ state["current_request_id"] = request_id
324
+ env._interception_server.intercepts[request_id] = {
325
+ "stream": True,
326
+ "tools": [
327
+ {
328
+ "type": "function",
329
+ "function": {
330
+ "name": "echo",
331
+ "description": "Return the provided text.",
332
+ "parameters": {
333
+ "type": "object",
334
+ "properties": {"text": {"type": "string"}},
335
+ },
336
+ },
337
+ }
338
+ ],
339
+ "chunk_queue": chunk_queue,
340
+ "response_future": response_future,
341
+ }
342
+
343
+ response = await env.get_model_response(
344
+ state=state,
345
+ prompt=prompt,
346
+ client=mock_client,
347
+ model="test-model",
348
+ )
349
+
350
+ chunks = []
351
+ while True:
352
+ chunk = await asyncio.wait_for(chunk_queue.get(), timeout=1.0)
353
+ if chunk is None:
354
+ break
355
+ chunks.append(chunk)
356
+
357
+ assert response_future.done()
358
+ assert response_future.result() is response
359
+ assert state["current_request_id"] is None
360
+
361
+ assert chunks[0]["object"] == "chat.completion.chunk"
362
+ assert chunks[0]["choices"][0]["delta"]["tool_calls"][0]["id"] == "call_echo"
363
+ assert (
364
+ chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["name"] == "echo"
365
+ )
366
+ assert (
367
+ chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["arguments"]
368
+ == '{"text": "hello"}'
369
+ )
370
+ assert chunks[-1]["choices"][0]["finish_reason"] == "tool_calls"
371
+
372
+
220
373
  class TestHarborEnv:
221
374
  """Tests for HarborEnv."""
222
375
 
@@ -244,7 +397,7 @@ class TestHarborEnv:
244
397
  dataset_path=harbor_task_dir,
245
398
  )
246
399
  assert len(env.dataset) == 1
247
- assert env.dataset[0]["task"] == "test_task"
400
+ assert env.dataset[0]["info"]["task_name"] == "test_task"
248
401
 
249
402
  def test_init_filters_tasks(self, harbor_task_dir):
250
403
  """Test that HarborEnv can filter tasks by name."""
@@ -260,7 +413,7 @@ class TestHarborEnv:
260
413
  tasks=["test_task"],
261
414
  )
262
415
  assert len(env.dataset) == 1
263
- assert env.dataset[0]["task"] == "test_task"
416
+ assert env.dataset[0]["info"]["task_name"] == "test_task"
264
417
 
265
418
  def test_init_raises_on_empty_dataset(self):
266
419
  """Test that HarborEnv raises when no valid tasks found."""
@@ -314,7 +467,7 @@ class TestHarborEnv:
314
467
  )
315
468
  state = {
316
469
  "interception_base_url": "https://test.trycloudflare.com/v1",
317
- "task": "my_task",
470
+ "info": {"task_name": "my_task"},
318
471
  }
319
472
  env_vars = await env.build_env_vars(state)
320
473
 
@@ -251,7 +251,7 @@ async def test_composable_env_quotes_log_path_when_collecting_logs():
251
251
  teardown=lambda: None,
252
252
  )
253
253
 
254
- state = {"sandbox_id": "sbx", "timing": {"total_ms": 0}}
254
+ state = {"sandbox_id": "sbx", "timing": {"total": 0}}
255
255
 
256
256
  await env.post_rollout(state)
257
257
 
@@ -594,7 +594,7 @@ async def test_composable_env_collects_harness_metrics():
594
594
  state = {
595
595
  "sandbox_id": "sbx",
596
596
  "info": {"id": 0},
597
- "timing": {"total_ms": 0},
597
+ "timing": {"total": 0},
598
598
  "trajectory": [],
599
599
  }
600
600
 
@@ -633,7 +633,7 @@ async def test_composable_env_metrics_with_key_whitelist():
633
633
  state = {
634
634
  "sandbox_id": "sbx",
635
635
  "info": {"id": 0},
636
- "timing": {"total_ms": 0},
636
+ "timing": {"total": 0},
637
637
  "trajectory": [],
638
638
  }
639
639
 
@@ -659,7 +659,7 @@ async def test_composable_env_no_metrics_when_path_not_set():
659
659
  state = {
660
660
  "sandbox_id": "sbx",
661
661
  "info": {"id": 0},
662
- "timing": {"total_ms": 0},
662
+ "timing": {"total": 0},
663
663
  "trajectory": [],
664
664
  }
665
665