verifiers 0.1.15.dev3__tar.gz → 0.1.15.dev4__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/PKG-INFO +5 -6
  2. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/README.md +3 -4
  3. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/pyproject.toml +18 -5
  4. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_context_token_metrics.py +37 -15
  5. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_environment_extra.py +0 -2
  6. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_eval_display.py +43 -0
  7. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_eval_utils.py +51 -0
  8. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_gym_env.py +0 -2
  9. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_harbor_env_mcp.py +0 -2
  10. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_langchain_deep_agents_wikispeedia.py +15 -5
  11. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_lean_task.py +0 -2
  12. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_mcp_search_env.py +17 -3
  13. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_message_utils.py +33 -2
  14. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_nemorl_client.py +0 -2
  15. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_opencode_harbor.py +15 -29
  16. verifiers-0.1.15.dev4/tests/test_openenv_client.py +162 -0
  17. verifiers-0.1.15.dev4/tests/test_pricing_utils.py +127 -0
  18. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_renderer_e2e.py +0 -2
  19. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_save_utils.py +51 -37
  20. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_setup_script.py +0 -2
  21. verifiers-0.1.15.dev4/tests/test_v1_bfcl.py +135 -0
  22. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_v1_config_extension.py +275 -37
  23. verifiers-0.1.15.dev4/tests/test_v1_empty_completions.py +57 -0
  24. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_v1_example_counts.py +13 -15
  25. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_v1_group_reward_env.py +2 -3
  26. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_v1_harbor_cli.py +75 -26
  27. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_v1_mini_swe_agent.py +32 -6
  28. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_v1_rlm_swe.py +105 -12
  29. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_v1_runtime_lifecycle.py +94 -45
  30. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_v1_scoring_functions.py +6 -5
  31. verifiers-0.1.15.dev4/tests/test_v1_taskset_bindings.py +188 -0
  32. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/__init__.py +66 -8
  33. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/plugins/prime.py +0 -2
  34. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/clients/__init__.py +0 -2
  35. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/clients/nemorl_chat_completions_client.py +0 -2
  36. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/clients/openai_responses_client.py +0 -2
  37. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/clients/renderer_client.py +0 -2
  38. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/env_group.py +4 -6
  39. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/environment.py +19 -21
  40. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/_filter.py +0 -2
  41. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/composable_env.py +0 -2
  42. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harness.py +4 -8
  43. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -2
  44. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -2
  45. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -2
  46. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -2
  47. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/task.py +4 -6
  48. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -2
  49. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -2
  50. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +8 -10
  51. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -2
  52. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -2
  53. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -2
  54. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -2
  55. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -2
  56. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -2
  57. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -2
  58. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +0 -2
  59. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +2 -2
  60. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -2
  61. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -2
  62. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/gym_env.py +0 -2
  63. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/harbor_env/env.py +0 -2
  64. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/harbor_env/mcp.py +0 -2
  65. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/utils/file_locks.py +0 -2
  66. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -2
  67. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/README.md +17 -15
  68. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/openenv_env.py +99 -328
  69. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -2
  70. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/build.py +0 -2
  71. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/init.py +16 -19
  72. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/setup.py +0 -2
  73. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/tui.py +6 -0
  74. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/serve/server/env_router.py +0 -2
  75. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/serve/types.py +0 -2
  76. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/types.py +428 -6
  77. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/data_utils.py +3 -5
  78. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/display_utils.py +5 -5
  79. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/env_config_utils.py +0 -2
  80. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/env_utils.py +8 -5
  81. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/eval_display.py +30 -3
  82. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/eval_utils.py +110 -2
  83. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/import_utils.py +0 -2
  84. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/message_utils.py +65 -3
  85. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/metric_utils.py +5 -5
  86. verifiers-0.1.15.dev4/verifiers/utils/pricing_utils.py +170 -0
  87. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/save_utils.py +66 -56
  88. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/usage_utils.py +20 -56
  89. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/version_utils.py +0 -2
  90. verifiers-0.1.15.dev4/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +73 -0
  91. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/README.md +123 -79
  92. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/RE_MIGRATION.md +59 -40
  93. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/__init__.py +42 -3
  94. verifiers-0.1.15.dev4/verifiers/v1/config.py +381 -0
  95. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/env.py +2 -4
  96. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/harness.py +100 -112
  97. verifiers-0.1.15.dev4/verifiers/v1/packages/harnesses/__init__.py +14 -0
  98. verifiers-0.1.15.dev4/verifiers/v1/packages/harnesses/command.py +116 -0
  99. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/packages/harnesses/configs.py +32 -4
  100. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/packages/harnesses/mini_swe_agent.py +43 -39
  101. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/packages/harnesses/opencode.py +71 -52
  102. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/packages/harnesses/pi.py +44 -40
  103. verifiers-0.1.15.dev4/verifiers/v1/packages/harnesses/rlm.py +347 -0
  104. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/packages/tasksets/harbor.py +90 -49
  105. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/runtime.py +472 -374
  106. verifiers-0.1.15.dev4/verifiers/v1/state.py +10 -0
  107. verifiers-0.1.15.dev4/verifiers/v1/task.py +92 -0
  108. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/taskset.py +124 -99
  109. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/toolset.py +94 -77
  110. verifiers-0.1.15.dev4/verifiers/v1/types.py +59 -0
  111. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/user.py +24 -17
  112. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/artifact_utils.py +5 -7
  113. verifiers-0.1.15.dev4/verifiers/v1/utils/binding_utils.py +216 -0
  114. verifiers-0.1.15.dev4/verifiers/v1/utils/config_callable_utils.py +123 -0
  115. verifiers-0.1.15.dev4/verifiers/v1/utils/config_utils.py +177 -0
  116. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/endpoint_utils.py +42 -42
  117. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/json_utils.py +3 -4
  118. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/judge_utils.py +5 -17
  119. verifiers-0.1.15.dev4/verifiers/v1/utils/lifecycle_utils.py +98 -0
  120. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/mcp_proxy_utils.py +26 -31
  121. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/mcp_utils.py +23 -21
  122. verifiers-0.1.15.dev4/verifiers/v1/utils/object_utils.py +32 -0
  123. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/program_utils.py +128 -111
  124. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/prompt_utils.py +17 -19
  125. verifiers-0.1.15.dev4/verifiers/v1/utils/runtime_registry.py +37 -0
  126. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/sandbox_program_utils.py +23 -27
  127. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/sandbox_utils.py +159 -98
  128. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/scoring_utils.py +156 -84
  129. verifiers-0.1.15.dev4/verifiers/v1/utils/serialization_utils.py +14 -0
  130. verifiers-0.1.15.dev4/verifiers/v1/utils/task_freeze_utils.py +89 -0
  131. verifiers-0.1.15.dev4/verifiers/v1/utils/taskset_utils.py +56 -0
  132. verifiers-0.1.15.dev4/verifiers/v1/utils/timing_utils.py +119 -0
  133. verifiers-0.1.15.dev4/verifiers/v1/utils/tool_utils.py +54 -0
  134. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/trajectory_utils.py +24 -18
  135. verifiers-0.1.15.dev4/verifiers/v1/utils/usage_utils.py +21 -0
  136. verifiers-0.1.15.dev3/tests/test_v1_bfcl.py +0 -55
  137. verifiers-0.1.15.dev3/verifiers/v1/config.py +0 -455
  138. verifiers-0.1.15.dev3/verifiers/v1/packages/harnesses/__init__.py +0 -8
  139. verifiers-0.1.15.dev3/verifiers/v1/packages/harnesses/cli.py +0 -121
  140. verifiers-0.1.15.dev3/verifiers/v1/packages/harnesses/rlm.py +0 -265
  141. verifiers-0.1.15.dev3/verifiers/v1/state.py +0 -401
  142. verifiers-0.1.15.dev3/verifiers/v1/task.py +0 -177
  143. verifiers-0.1.15.dev3/verifiers/v1/utils/lifecycle_utils.py +0 -96
  144. verifiers-0.1.15.dev3/verifiers/v1/utils/timing_utils.py +0 -36
  145. verifiers-0.1.15.dev3/verifiers/v1/utils/tool_utils.py +0 -19
  146. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/.gitignore +0 -0
  147. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/LICENSE +0 -0
  148. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/AGENTS.md +0 -0
  149. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/README.md +0 -0
  150. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/__init__.py +0 -0
  151. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/conftest.py +0 -0
  152. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_browser_env.py +0 -0
  153. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_build_script.py +0 -0
  154. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_cli_agent_env.py +0 -0
  155. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_client_auth_errors.py +0 -0
  156. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_client_config.py +0 -0
  157. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_client_multimodal_types.py +0 -0
  158. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_composable_env.py +0 -0
  159. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_decorator_ranks.py +0 -0
  160. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_endpoint_registry.py +0 -0
  161. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_env_group.py +0 -0
  162. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_env_server.py +0 -0
  163. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_environment.py +0 -0
  164. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_envs.py +0 -0
  165. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_error_chain.py +0 -0
  166. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_eval_cli.py +0 -0
  167. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_gepa_cli.py +0 -0
  168. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_gepa_utils.py +0 -0
  169. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_imports.py +0 -0
  170. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_install_utils.py +0 -0
  171. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_interception_utils.py +0 -0
  172. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_logging.py +0 -0
  173. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_math_rubric.py +0 -0
  174. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_maybe_think_parser.py +0 -0
  175. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_message_utils_multimodal.py +0 -0
  176. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_multiturn_env.py +0 -0
  177. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_openai_chat_completions_token_client.py +0 -0
  178. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_openai_responses_client.py +0 -0
  179. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_opencode_rlm_env.py +0 -0
  180. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_parser.py +0 -0
  181. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_path_utils.py +0 -0
  182. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_per_turn_timing.py +0 -0
  183. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_prime_plugin.py +0 -0
  184. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_renderer_client.py +0 -0
  185. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_rlm_composable_env.py +0 -0
  186. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_rlm_env.py +0 -0
  187. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_rubric.py +0 -0
  188. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_rubric_group.py +0 -0
  189. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_sandbox_env.py +0 -0
  190. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_sandbox_mixin.py +0 -0
  191. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_singleturn_env.py +0 -0
  192. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_stateful_tool_env.py +0 -0
  193. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_think_parser.py +0 -0
  194. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_tool_env.py +0 -0
  195. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_tool_utils.py +0 -0
  196. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_trajectory_processing.py +0 -0
  197. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_tui_info_formatting.py +0 -0
  198. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_types.py +0 -0
  199. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_v1_endpoint_protocols.py +0 -0
  200. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_wordle_env.py +0 -0
  201. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/tests/test_xml_parser.py +0 -0
  202. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/AGENTS.md +0 -0
  203. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/__init__.py +0 -0
  204. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/commands/__init__.py +0 -0
  205. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/commands/build.py +0 -0
  206. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/commands/eval.py +0 -0
  207. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/commands/gepa.py +0 -0
  208. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/commands/init.py +0 -0
  209. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/commands/install.py +0 -0
  210. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/commands/setup.py +0 -0
  211. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/plugins/__init__.py +0 -0
  212. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/cli/tui.py +0 -0
  213. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/clients/anthropic_messages_client.py +0 -0
  214. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/clients/client.py +0 -0
  215. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/clients/openai_chat_completions_client.py +0 -0
  216. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
  217. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/clients/openai_completions_client.py +0 -0
  218. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/decorators.py +0 -0
  219. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/AGENTS.md +0 -0
  220. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/__init__.py +0 -0
  221. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/README.md +0 -0
  222. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/__init__.py +0 -0
  223. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/cli_agent_env.py +0 -0
  224. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/README.md +0 -0
  225. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/__init__.py +0 -0
  226. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
  227. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  228. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  229. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  230. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  231. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  232. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  233. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  234. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  235. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  236. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  237. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
  238. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/mcp_env.py +0 -0
  239. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/opencode_env.py +0 -0
  240. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  241. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
  242. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/rlm_env.py +0 -0
  243. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  244. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/utils/__init__.py +0 -0
  245. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/__init__.py +0 -0
  246. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/README.md +0 -0
  247. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  248. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
  249. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  250. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  251. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
  252. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
  253. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  254. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/textarena_env.py +0 -0
  255. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/multiturn_env.py +0 -0
  256. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/python_env.py +0 -0
  257. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/sandbox_env.py +0 -0
  258. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/singleturn_env.py +0 -0
  259. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/stateful_tool_env.py +0 -0
  260. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/envs/tool_env.py +0 -0
  261. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/errors.py +0 -0
  262. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/gepa/__init__.py +0 -0
  263. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/gepa/adapter.py +0 -0
  264. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/gepa/config.py +0 -0
  265. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/gepa/display.py +0 -0
  266. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/gepa/gepa_utils.py +0 -0
  267. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/parsers/__init__.py +0 -0
  268. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/parsers/maybe_think_parser.py +0 -0
  269. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/parsers/parser.py +0 -0
  270. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/parsers/think_parser.py +0 -0
  271. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/parsers/xml_parser.py +0 -0
  272. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/README.md +0 -0
  273. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/__init__.py +0 -0
  274. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/inference/__init__.py +0 -0
  275. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/inference/client.py +0 -0
  276. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/inference/server.py +0 -0
  277. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/__init__.py +0 -0
  278. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/config.py +0 -0
  279. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/orchestrator.py +0 -0
  280. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/trainer.py +0 -0
  281. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/utils.py +0 -0
  282. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rubrics/__init__.py +0 -0
  283. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rubrics/judge_rubric.py +0 -0
  284. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rubrics/math_rubric.py +0 -0
  285. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rubrics/rubric.py +0 -0
  286. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/rubrics/rubric_group.py +0 -0
  287. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/__init__.py +0 -0
  288. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/eval.py +0 -0
  289. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/gepa.py +0 -0
  290. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/install.py +0 -0
  291. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/rl.py +0 -0
  292. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/train.py +0 -0
  293. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/scripts/vllm.py +0 -0
  294. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/serve/__init__.py +0 -0
  295. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/serve/client/env_client.py +0 -0
  296. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/serve/client/zmq_env_client.py +0 -0
  297. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/serve/server/__init__.py +0 -0
  298. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/serve/server/env_server.py +0 -0
  299. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/serve/server/env_worker.py +0 -0
  300. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/serve/server/zmq_env_server.py +0 -0
  301. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/__init__.py +0 -0
  302. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/async_utils.py +0 -0
  303. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/client_utils.py +0 -0
  304. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/config_utils.py +0 -0
  305. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/error_utils.py +0 -0
  306. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/heartbeat.py +0 -0
  307. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/install_utils.py +0 -0
  308. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/interception_utils.py +0 -0
  309. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/logging_utils.py +0 -0
  310. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/path_utils.py +0 -0
  311. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/process_utils.py +0 -0
  312. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/response_utils.py +0 -0
  313. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/serve_utils.py +0 -0
  314. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/thread_utils.py +0 -0
  315. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/threaded_sandbox_client.py +0 -0
  316. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/tool_utils.py +0 -0
  317. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/utils/tunnel_utils.py +0 -0
  318. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/packages/__init__.py +0 -0
  319. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/packages/tasksets/__init__.py +0 -0
  320. {verifiers-0.1.15.dev3 → verifiers-0.1.15.dev4}/verifiers/v1/utils/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.15.dev3
3
+ Version: 0.1.15.dev4
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -52,7 +52,7 @@ Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
52
52
  Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
53
53
  Requires-Dist: stagehand>=3.0.0; extra == 'browser'
54
54
  Provides-Extra: openenv
55
- Requires-Dist: openenv-core[core]==0.2.1; extra == 'openenv'
55
+ Requires-Dist: openenv-core>=0.3.0; extra == 'openenv'
56
56
  Provides-Extra: renderers
57
57
  Requires-Dist: renderers>=0.1.8.dev0; extra == 'renderers'
58
58
  Provides-Extra: rg
@@ -210,7 +210,7 @@ For new environments with reusable tasksets, toolsets, custom programs, or
210
210
  custom harnesses, use the v1 Taskset/Harness path:
211
211
  ```python
212
212
  # my_env.py
213
- import verifiers.v1 as vf
213
+ import verifiers as vf
214
214
 
215
215
  def source():
216
216
  yield {
@@ -226,8 +226,7 @@ async def contains_answer(task, state) -> float:
226
226
  def load_taskset(config: vf.TasksetConfig | None = None):
227
227
  return vf.Taskset(source=source, rewards=[contains_answer], config=config)
228
228
 
229
- def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
230
- config = config or vf.EnvConfig()
229
+ def load_environment(config: vf.EnvConfig) -> vf.Env:
231
230
  return vf.Env(taskset=load_taskset(config=config.taskset))
232
231
  ```
233
232
  If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
@@ -239,7 +238,7 @@ harness with:
239
238
 
240
239
  ```python
241
240
  env = vf.Env(
242
- taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
241
+ taskset=vf.HarborTaskset(),
243
242
  harness=vf.OpenCode(),
244
243
  )
245
244
  ```
@@ -135,7 +135,7 @@ For new environments with reusable tasksets, toolsets, custom programs, or
135
135
  custom harnesses, use the v1 Taskset/Harness path:
136
136
  ```python
137
137
  # my_env.py
138
- import verifiers.v1 as vf
138
+ import verifiers as vf
139
139
 
140
140
  def source():
141
141
  yield {
@@ -151,8 +151,7 @@ async def contains_answer(task, state) -> float:
151
151
  def load_taskset(config: vf.TasksetConfig | None = None):
152
152
  return vf.Taskset(source=source, rewards=[contains_answer], config=config)
153
153
 
154
- def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
155
- config = config or vf.EnvConfig()
154
+ def load_environment(config: vf.EnvConfig) -> vf.Env:
156
155
  return vf.Env(taskset=load_taskset(config=config.taskset))
157
156
  ```
158
157
  If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
@@ -164,7 +163,7 @@ harness with:
164
163
 
165
164
  ```python
166
165
  env = vf.Env(
167
- taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
166
+ taskset=vf.HarborTaskset(),
168
167
  harness=vf.OpenCode(),
169
168
  )
170
169
  ```
@@ -68,13 +68,15 @@ dev = [
68
68
  "ipywidgets",
69
69
  "reasoning-gym",
70
70
  "textarena",
71
- "openenv-core[core]==0.2.1",
72
71
  "stagehand>=3.0.0",
73
72
  "aiohttp>=3.9.0",
74
73
  "python-dotenv>=1.0.0",
75
74
  "nltk",
76
75
  "renderers>=0.1.8.dev0",
77
76
  ]
77
+ policy = [
78
+ "semgrep>=1.150.0",
79
+ ]
78
80
 
79
81
  [project.optional-dependencies]
80
82
  rg = [
@@ -84,14 +86,14 @@ ta = [
84
86
  "textarena",
85
87
  "nltk",
86
88
  ]
87
- openenv = [
88
- "openenv-core[core]==0.2.1",
89
- ]
90
89
  browser = [
91
90
  "stagehand>=3.0.0",
92
91
  "aiohttp>=3.9.0",
93
92
  "python-dotenv>=1.0.0",
94
93
  ]
94
+ openenv = [
95
+ "openenv-core>=0.3.0",
96
+ ]
95
97
  renderers = [
96
98
  "renderers>=0.1.8.dev0",
97
99
  ]
@@ -111,7 +113,12 @@ rl = [
111
113
  [tool.uv]
112
114
  preview = true
113
115
  required-version = ">=0.11.1"
114
-
116
+ conflicts = [
117
+ [
118
+ { extra = "openenv" },
119
+ { group = "policy" },
120
+ ],
121
+ ]
115
122
  [[tool.uv.index]]
116
123
  name = "pypi"
117
124
  url = "https://pypi.org/simple"
@@ -123,6 +130,7 @@ exclude-newer = "7 days"
123
130
  prime-tunnel = false
124
131
  prime-sandboxes = false
125
132
  renderers = false
133
+ openenv-core = false
126
134
 
127
135
  [tool.uv.extra-build-dependencies]
128
136
  flash-attn = [{ requirement = "torch", match-runtime = true }]
@@ -130,6 +138,11 @@ flash-attn = [{ requirement = "torch", match-runtime = true }]
130
138
  [tool.uv.extra-build-variables]
131
139
  flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" }
132
140
 
141
+ [tool.ruff]
142
+ exclude = [
143
+ ".semgrep",
144
+ ]
145
+
133
146
  [project.scripts]
134
147
  vf-eval = "verifiers.scripts.eval:main"
135
148
  vf-gepa = "verifiers.scripts.gepa:main"
@@ -5,10 +5,9 @@ Tests the trajectory-based context token computation
5
5
  using the last trajectory step.
6
6
  """
7
7
 
8
- from unittest.mock import MagicMock
9
-
10
8
  import pytest
11
9
 
10
+ from verifiers.types import Response, ResponseMessage, Usage
12
11
  from verifiers.utils.usage_utils import compute_context_token_metrics
13
12
 
14
13
 
@@ -20,12 +19,39 @@ SYS = {"role": "system", "content": "You are helpful"}
20
19
  USER = {"role": "user", "content": "hi"}
21
20
 
22
21
 
23
- def _make_response(prompt_tokens: int, completion_tokens: int) -> MagicMock:
24
- response = MagicMock()
25
- response.usage = MagicMock(
26
- prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
22
+ def _make_response(prompt_tokens: int, completion_tokens: int) -> Response:
23
+ return Response(
24
+ id="test",
25
+ created=0,
26
+ model="test",
27
+ usage=Usage(
28
+ prompt_tokens=prompt_tokens,
29
+ reasoning_tokens=0,
30
+ completion_tokens=completion_tokens,
31
+ total_tokens=prompt_tokens + completion_tokens,
32
+ ),
33
+ message=ResponseMessage(
34
+ role="assistant",
35
+ content="",
36
+ finish_reason="stop",
37
+ is_truncated=False,
38
+ ),
39
+ )
40
+
41
+
42
+ def _make_response_without_usage() -> Response:
43
+ return Response(
44
+ id="test",
45
+ created=0,
46
+ model="test",
47
+ usage=None,
48
+ message=ResponseMessage(
49
+ role="assistant",
50
+ content="",
51
+ finish_reason="stop",
52
+ is_truncated=False,
53
+ ),
27
54
  )
28
- return response
29
55
 
30
56
 
31
57
  def _asst(i: int) -> dict:
@@ -115,13 +141,11 @@ class TestContextMetrics:
115
141
  assert metrics["final_input_tokens"] == 230 - 50
116
142
 
117
143
  def test_skips_responses_without_usage(self):
118
- """Responses with no .usage attribute are skipped entirely."""
119
- no_usage = MagicMock()
120
- no_usage.usage = None
144
+ """Responses with usage=None are skipped entirely."""
121
145
  trajectory = [
122
146
  {"response": _make_response(100, 20)},
123
147
  {"response": _make_response(200, 30)},
124
- {"response": no_usage}, # last step, but no usage
148
+ {"response": _make_response_without_usage()},
125
149
  ]
126
150
  metrics = compute_context_token_metrics(trajectory)
127
151
  # Should use step 1 (last with usage): total = 230
@@ -130,11 +154,9 @@ class TestContextMetrics:
130
154
 
131
155
  def test_all_responses_lack_usage(self):
132
156
  """If no response has usage data, return zeros."""
133
- no_usage = MagicMock()
134
- no_usage.usage = None
135
157
  trajectory = [
136
- {"response": no_usage},
137
- {"response": no_usage},
158
+ {"response": _make_response_without_usage()},
159
+ {"response": _make_response_without_usage()},
138
160
  ]
139
161
  metrics = compute_context_token_metrics(trajectory)
140
162
  assert metrics["final_output_tokens"] == 0
@@ -9,8 +9,6 @@ Covers:
9
9
  - make_dataset tool call sanitization
10
10
  """
11
11
 
12
- from __future__ import annotations
13
-
14
12
  import asyncio
15
13
  import json
16
14
  from unittest.mock import AsyncMock
@@ -1,3 +1,5 @@
1
+ from rich.console import Console
2
+
1
3
  from verifiers.types import ClientConfig, EvalConfig
2
4
  from verifiers.utils.eval_display import EvalDisplay
3
5
 
@@ -78,3 +80,44 @@ def test_format_client_target_uses_single_resolved_base_url() -> None:
78
80
  )
79
81
 
80
82
  assert EvalDisplay._format_client_target(config) == "http://localhost:8001/v1"
83
+
84
+
85
+ def render_plain(renderable) -> str:
86
+ console = Console(width=100, record=True)
87
+ console.print(renderable)
88
+ return console.export_text()
89
+
90
+
91
+ def test_tokens_row_omits_cost_when_unavailable() -> None:
92
+ display = EvalDisplay([make_config(max_concurrent=1)])
93
+
94
+ rendered = render_plain(
95
+ display._make_tokens_row({"input_tokens": 12.0, "output_tokens": 7.0})
96
+ )
97
+
98
+ assert "input 12" in rendered
99
+ assert "output 7" in rendered
100
+ assert "cost" not in rendered
101
+
102
+
103
+ def test_tokens_row_includes_cost_when_available() -> None:
104
+ display = EvalDisplay([make_config(max_concurrent=1)])
105
+
106
+ rendered = render_plain(
107
+ display._make_tokens_row(
108
+ {
109
+ "input_tokens": 12.0,
110
+ "output_tokens": 7.0,
111
+ "final_input_tokens": 10.0,
112
+ "final_output_tokens": 5.0,
113
+ },
114
+ {"input_usd": 0.005, "output_usd": 0.0073, "total_usd": 0.0123},
115
+ )
116
+ )
117
+
118
+ assert "input 12" in rendered
119
+ assert "output 7" in rendered
120
+ assert "final_input 10" in rendered
121
+ assert "final_output 5" in rendered
122
+ assert "cost (all) $0.0123" in rendered
123
+ assert rendered.index("final_output 5") < rendered.index("cost (all) $0.0123")
@@ -4,6 +4,8 @@ Covers:
4
4
  - print_results indexing with multiple rollouts per example
5
5
  """
6
6
 
7
+ import pytest
8
+
7
9
  from verifiers.types import GenerateOutputs
8
10
  from verifiers.utils.save_utils import states_to_outputs
9
11
 
@@ -138,6 +140,55 @@ def test_print_results_includes_usage(capsys, make_metadata, make_output):
138
140
  assert "output_tokens (avg): 3.000" in captured.out
139
141
 
140
142
 
143
+ def test_attach_metadata_cost_uses_total_output_usage(make_metadata, make_output):
144
+ from verifiers.utils.eval_utils import _attach_metadata_cost
145
+
146
+ outputs = [
147
+ make_output(example_id=0, reward=1.0, metrics={"test_metric": 1.0}),
148
+ make_output(example_id=1, reward=0.0, metrics={"test_metric": 2.0}),
149
+ ]
150
+ outputs[0]["token_usage"] = {"input_tokens": 10.0, "output_tokens": 4.0}
151
+ outputs[1]["token_usage"] = {"input_tokens": 6.0, "output_tokens": 2.0}
152
+ metadata = make_metadata(
153
+ num_examples=2,
154
+ rollouts_per_example=1,
155
+ usage={"input_tokens": 8.0, "output_tokens": 3.0},
156
+ )
157
+
158
+ cost = _attach_metadata_cost(
159
+ metadata,
160
+ {"input_usd_per_mtok": 1.0, "output_usd_per_mtok": 5.0},
161
+ outputs,
162
+ )
163
+
164
+ assert cost == {
165
+ "input_usd": pytest.approx(0.000016),
166
+ "output_usd": pytest.approx(0.000030),
167
+ "total_usd": pytest.approx(0.000046),
168
+ }
169
+ assert metadata["cost"] == cost
170
+
171
+
172
+ def test_print_results_labels_cost_as_all(capsys, make_metadata, make_output):
173
+ from verifiers.utils.eval_utils import print_results
174
+
175
+ outputs = [
176
+ make_output(example_id=0, reward=1.0, metrics={"test_metric": 1.0}),
177
+ ]
178
+ outputs[0]["token_usage"] = {"input_tokens": 10.0, "output_tokens": 4.0}
179
+ metadata = make_metadata(num_examples=1, rollouts_per_example=1, usage=None)
180
+ metadata["cost"] = {
181
+ "input_usd": 0.005,
182
+ "output_usd": 0.0073,
183
+ "total_usd": 0.0123,
184
+ }
185
+
186
+ print_results(GenerateOutputs(outputs=outputs, metadata=metadata))
187
+ captured = capsys.readouterr()
188
+
189
+ assert "cost (all): $0.0123" in captured.out
190
+
191
+
141
192
  def test_print_results_handles_heterogeneous_metrics(
142
193
  capsys, make_metadata, make_output
143
194
  ):
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  import re
4
2
  from typing import Any
5
3
 
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  import asyncio
4
2
  import time
5
3
  from typing import Any
@@ -57,7 +57,7 @@ def test_wikispeedia_loads_as_v1_taskset_harness(
57
57
  ) -> None:
58
58
  module = load_module(monkeypatch)
59
59
 
60
- env = module.load_environment(train_size=1, eval_size=1)
60
+ env = module.load_environment(config=vf.EnvConfig(), train_size=1, eval_size=1)
61
61
 
62
62
  assert isinstance(env, vf.Env)
63
63
  assert isinstance(env.taskset, vf.Taskset)
@@ -157,6 +157,7 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime(
157
157
  wiki = make_small_wiki(module)
158
158
  monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
159
159
  env = module.load_environment(
160
+ config=vf.EnvConfig(),
160
161
  train_size=2,
161
162
  eval_size=1,
162
163
  min_path_length=1,
@@ -248,6 +249,12 @@ async def test_wikispeedia_graph_recursion_limit_stops_rollout(
248
249
  async def ainvoke(self, payload, config=None):
249
250
  raise GraphRecursionError("recursion limit")
250
251
 
252
+ created_system_prompts = []
253
+
254
+ def fake_create_deep_agent(**kwargs):
255
+ created_system_prompts.append(kwargs["system_prompt"])
256
+ return FakeAgent()
257
+
251
258
  fake_deepagents = types.ModuleType("deepagents")
252
259
  fake_langchain_openai = types.ModuleType("langchain_openai")
253
260
  fake_langgraph = types.ModuleType("langgraph")
@@ -255,7 +262,7 @@ async def test_wikispeedia_graph_recursion_limit_stops_rollout(
255
262
  fake_langchain_core = types.ModuleType("langchain_core")
256
263
  fake_tools_module = types.ModuleType("langchain_core.tools")
257
264
 
258
- fake_deepagents.create_deep_agent = lambda **kwargs: FakeAgent()
265
+ fake_deepagents.create_deep_agent = fake_create_deep_agent
259
266
  fake_langchain_openai.ChatOpenAI = FakeChatOpenAI
260
267
  fake_langgraph_errors.GraphRecursionError = GraphRecursionError
261
268
  fake_langgraph.errors = fake_langgraph_errors
@@ -276,12 +283,16 @@ async def test_wikispeedia_graph_recursion_limit_stops_rollout(
276
283
  {
277
284
  "info": {"source": "A"},
278
285
  "prompt": [{"role": "user", "content": "start"}],
279
- "system_prompt": [{"role": "system", "content": "prompt"}],
286
+ "system_prompt": [
287
+ {"role": "user", "content": "first prompt chunk"},
288
+ {"role": "system", "content": "second prompt chunk"},
289
+ ],
280
290
  }
281
291
  )
282
292
 
283
293
  result = await program({}, state)
284
294
 
295
+ assert created_system_prompts == ["first prompt chunk\n\nsecond prompt chunk"]
285
296
  assert result["agent_timeout"] is True
286
297
  assert result["stop_reason"] == "agent_recursion_limit"
287
298
  assert result["agent_completion"] == []
@@ -298,11 +309,10 @@ async def test_wikispeedia_tool_metrics_use_agent_completion(
298
309
  {
299
310
  "role": "assistant",
300
311
  "content": "",
301
- "tool_calls": [{"id": "call_1", "name": "click_link"}],
312
+ "tool_calls": [{"id": "call_1", "name": "click_link", "arguments": "{}"}],
302
313
  },
303
314
  {
304
315
  "role": "tool",
305
- "name": "click_link",
306
316
  "tool_call_id": "call_1",
307
317
  "content": "'C' is not a valid link from 'A'.",
308
318
  },
@@ -1,7 +1,5 @@
1
1
  """Tests for ``LeanTaskSet`` lean-guard wrapping and reward enforcement."""
2
2
 
3
- from __future__ import annotations
4
-
5
3
  from dataclasses import dataclass
6
4
 
7
5
  import pytest
@@ -1,10 +1,9 @@
1
- from __future__ import annotations
2
-
3
1
  import importlib.util
4
2
  import inspect
5
3
  from pathlib import Path
6
4
  from typing import Any
7
5
 
6
+ import pytest
8
7
  import verifiers.v1 as vf
9
8
 
10
9
 
@@ -27,7 +26,7 @@ def _load_mcp_search_module() -> Any:
27
26
  def test_mcp_search_env_is_v1_only() -> None:
28
27
  module = _load_mcp_search_module()
29
28
 
30
- env = module.load_environment(max_turns=4)
29
+ env = module.load_environment(config=vf.EnvConfig(), max_turns=4)
31
30
 
32
31
  assert isinstance(env, vf.Env)
33
32
  assert isinstance(env.taskset, vf.Taskset)
@@ -59,3 +58,18 @@ def test_mcp_search_taskset_accepts_v1_taskset_config() -> None:
59
58
 
60
59
  assert env.taskset.config.max_turns == 3
61
60
  assert all(row["max_turns"] == 3 for row in rows)
61
+
62
+
63
+ @pytest.mark.asyncio
64
+ async def test_mcp_search_reward_handles_missing_assistant() -> None:
65
+ module = _load_mcp_search_module()
66
+
67
+ task = vf.Task({"answer": "expected"})
68
+ assert await module.exact_title_reward(task, vf.State({"completion": []})) == 0.0
69
+ assert (
70
+ await module.exact_title_reward(
71
+ task,
72
+ vf.State({"completion": [{"role": "user", "content": "expected"}]}),
73
+ )
74
+ == 0.0
75
+ )
@@ -1,5 +1,9 @@
1
- from verifiers.types import AssistantMessage
2
- from verifiers.utils.message_utils import from_raw_message, normalize_messages
1
+ from verifiers.types import AssistantMessage, UserMessage
2
+ from verifiers.utils.message_utils import (
3
+ from_raw_message,
4
+ get_messages,
5
+ normalize_messages,
6
+ )
3
7
 
4
8
 
5
9
  def test_from_raw_message_normalizes_oai_tool_calls():
@@ -55,3 +59,30 @@ def test_normalize_messages_accepts_oai_tool_call_dicts():
55
59
  assert assistant.tool_calls[0].id == "call_2"
56
60
  assert assistant.tool_calls[0].name == "lookup"
57
61
  assert assistant.tool_calls[0].arguments == '{"q": "hello"}'
62
+
63
+
64
+ def test_get_messages_returns_typed_messages():
65
+ messages = get_messages(
66
+ [
67
+ {"role": "user", "content": "question"},
68
+ {"role": "assistant", "content": "answer"},
69
+ ]
70
+ )
71
+
72
+ assert isinstance(messages[0], UserMessage)
73
+ assert isinstance(messages[1], AssistantMessage)
74
+ assert messages[-1].content == "answer"
75
+
76
+
77
+ def test_get_messages_filters_by_role_with_typed_return():
78
+ messages = get_messages(
79
+ [
80
+ {"role": "user", "content": "question"},
81
+ {"role": "assistant", "content": "answer"},
82
+ ],
83
+ role="assistant",
84
+ )
85
+
86
+ assert len(messages) == 1
87
+ assert isinstance(messages[0], AssistantMessage)
88
+ assert messages[0].content == "answer"
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  from unittest.mock import AsyncMock, patch
4
2
 
5
3
  import pytest
@@ -1,11 +1,8 @@
1
- from __future__ import annotations
2
-
3
1
  import importlib.util
2
+ import sys
4
3
  from pathlib import Path
5
4
  from typing import Any, cast
6
5
 
7
- import pytest
8
-
9
6
  import verifiers.v1 as vf
10
7
 
11
8
 
@@ -23,6 +20,7 @@ def _load_opencode_module() -> Any:
23
20
  assert spec.loader is not None
24
21
 
25
22
  module = importlib.util.module_from_spec(spec)
23
+ sys.modules[spec.name] = module
26
24
  spec.loader.exec_module(module)
27
25
  return module
28
26
 
@@ -30,33 +28,32 @@ def _load_opencode_module() -> Any:
30
28
  def test_load_environment_uses_v1_taskset_and_harness() -> None:
31
29
  module = _load_opencode_module()
32
30
 
33
- env = module.load_environment()
31
+ env = module.load_environment(config=vf.EnvConfig())
34
32
 
35
33
  assert isinstance(env, vf.Env)
36
34
  assert isinstance(env.taskset, vf.HarborTaskset)
37
35
  assert isinstance(env.harness, vf.OpenCode)
38
36
  assert isinstance(env.harness.config, vf.OpenCodeConfig)
39
37
  assert not hasattr(module, "OpenCodeHarborHarnessConfig")
40
- assert Path(env.taskset.tasks) == Path(module.__file__).parent / "tasks"
38
+ assert not hasattr(module, "TERMINAL_BENCH_SAMPLE_TASKS")
39
+ assert env.taskset.resolve_tasks_root() == Path(module.__file__).parent / "tasks"
41
40
  assert env.harness.config.max_turns == 4
42
- assert env.harness.config.disabled_tools == ["webfetch", "question"]
41
+ assert env.harness.config.disabled_tools == vf.OpenCodeConfig().disabled_tools
42
+ assert "webfetch" in env.harness.config.disabled_tools
43
+ assert "question" in env.harness.config.disabled_tools
43
44
 
44
45
  program = cast(dict[str, object], env.harness.program)
45
- mcp_setup = cast(dict[str, object], program["tools"])["mcp"]
46
+ mcp_setup = cast(dict[str, object], program["channels"])["mcp"]
46
47
  assert '"webfetch": false' in cast(str, mcp_setup)
47
48
  assert '"question": false' in cast(str, mcp_setup)
48
- assert '"read": false' not in cast(str, mcp_setup)
49
49
 
50
50
 
51
- def test_load_environment_accepts_v1_taskset_and_harness_config(
52
- tmp_path: Path,
53
- ) -> None:
51
+ def test_load_environment_accepts_v1_taskset_and_harness_config() -> None:
54
52
  module = _load_opencode_module()
55
53
 
56
54
  env = module.load_environment(
57
55
  config=vf.EnvConfig(
58
56
  taskset={
59
- "tasks": str(tmp_path),
60
57
  "task_names": ["task-a"],
61
58
  "cpu_cores": 1.5,
62
59
  },
@@ -68,7 +65,7 @@ def test_load_environment_accepts_v1_taskset_and_harness_config(
68
65
  )
69
66
  )
70
67
 
71
- assert Path(env.taskset.tasks) == tmp_path
68
+ assert env.taskset.resolve_tasks_root() == Path(module.__file__).parent / "tasks"
72
69
  assert env.taskset.task_names == ["task-a"]
73
70
  assert env.taskset.cpu_cores == 1.5
74
71
  assert env.harness.config.agent_workdir == "/workspace"
@@ -76,25 +73,14 @@ def test_load_environment_accepts_v1_taskset_and_harness_config(
76
73
 
77
74
  program = cast(dict[str, object], env.harness.program)
78
75
  command = cast(list[object], program["command"])
79
- mcp_setup = cast(dict[str, object], program["tools"])["mcp"]
76
+ mcp_setup = cast(dict[str, object], program["channels"])["mcp"]
80
77
  assert "/workspace" in cast(str, command[2])
81
78
  assert '"webfetch": false' in cast(str, mcp_setup)
82
79
  assert '"question": false' not in cast(str, mcp_setup)
83
80
 
84
81
 
85
- def test_dataset_shortcuts_select_task_names() -> None:
86
- module = _load_opencode_module()
87
-
88
- env = module.load_environment(dataset="terminal-bench-sample")
89
-
90
- assert env.taskset.task_names == module.TERMINAL_BENCH_SAMPLE_TASKS
91
-
92
-
93
- def test_dataset_rejects_explicit_task_names() -> None:
82
+ def test_pyproject_does_not_define_unsupported_harness_defaults() -> None:
94
83
  module = _load_opencode_module()
84
+ pyproject = Path(module.__file__).parent / "pyproject.toml"
95
85
 
96
- with pytest.raises(ValueError, match="dataset.*task_names"):
97
- module.load_environment(
98
- dataset="terminal-bench-sample",
99
- task_names=["hello-world"],
100
- )
86
+ assert "[tool.verifiers.harness]" not in pyproject.read_text()