verifiers 0.1.15.dev5__tar.gz → 0.1.15.dev7__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (312)
  1. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/PKG-INFO +14 -8
  2. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/README.md +13 -7
  3. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_eval_cli.py +51 -0
  4. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_eval_display.py +16 -0
  5. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_eval_utils.py +16 -0
  6. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_langchain_deep_agents_wikispeedia.py +74 -19
  7. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_mcp_search_env.py +5 -3
  8. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_opencode_harbor.py +2 -2
  9. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_path_utils.py +14 -0
  10. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_save_utils.py +4 -0
  11. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_bfcl.py +18 -10
  12. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_config_extension.py +181 -29
  13. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_group_reward_env.py +8 -3
  14. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_rlm_swe.py +3 -3
  15. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/__init__.py +1 -1
  16. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/clients/openai_chat_completions_client.py +3 -24
  17. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/clients/openai_completions_client.py +5 -2
  18. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/environment.py +4 -0
  19. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/eval.py +5 -0
  20. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/init.py +77 -15
  21. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/serve/types.py +13 -8
  22. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/types.py +4 -2
  23. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/eval_display.py +25 -9
  24. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/eval_utils.py +30 -16
  25. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/path_utils.py +9 -3
  26. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/response_utils.py +29 -3
  27. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/save_utils.py +1 -3
  28. verifiers-0.1.15.dev7/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +252 -0
  29. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/README.md +21 -37
  30. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/RE_MIGRATION.md +4 -4
  31. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/config.py +66 -27
  32. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/config_utils.py +24 -1
  33. verifiers-0.1.15.dev5/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +0 -73
  34. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/.gitignore +0 -0
  35. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/LICENSE +0 -0
  36. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/pyproject.toml +0 -0
  37. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/AGENTS.md +0 -0
  38. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/README.md +0 -0
  39. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/__init__.py +0 -0
  40. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/conftest.py +0 -0
  41. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_browser_env.py +0 -0
  42. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_build_script.py +0 -0
  43. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_cli_agent_env.py +0 -0
  44. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_client_auth_errors.py +0 -0
  45. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_client_config.py +0 -0
  46. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_client_multimodal_types.py +0 -0
  47. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_composable_env.py +0 -0
  48. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_context_token_metrics.py +0 -0
  49. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_decorator_ranks.py +0 -0
  50. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_endpoint_registry.py +0 -0
  51. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_env_group.py +0 -0
  52. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_env_server.py +0 -0
  53. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_environment.py +0 -0
  54. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_environment_extra.py +0 -0
  55. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_envs.py +0 -0
  56. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_error_chain.py +0 -0
  57. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_gepa_cli.py +0 -0
  58. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_gepa_utils.py +0 -0
  59. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_gym_env.py +0 -0
  60. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_harbor_env_mcp.py +0 -0
  61. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_imports.py +0 -0
  62. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_install_utils.py +0 -0
  63. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_interception_utils.py +0 -0
  64. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_lean_task.py +0 -0
  65. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_logging.py +0 -0
  66. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_math_rubric.py +0 -0
  67. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_maybe_think_parser.py +0 -0
  68. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_message_utils.py +0 -0
  69. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_message_utils_multimodal.py +0 -0
  70. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_multiturn_env.py +0 -0
  71. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_nemorl_client.py +0 -0
  72. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_openai_chat_completions_token_client.py +0 -0
  73. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_openai_responses_client.py +0 -0
  74. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_opencode_rlm_env.py +0 -0
  75. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_openenv_client.py +0 -0
  76. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_parser.py +0 -0
  77. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_per_turn_timing.py +0 -0
  78. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_pricing_utils.py +0 -0
  79. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_prime_plugin.py +0 -0
  80. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_renderer_client.py +0 -0
  81. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_renderer_e2e.py +0 -0
  82. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_rlm_composable_env.py +0 -0
  83. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_rlm_env.py +0 -0
  84. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_rubric.py +0 -0
  85. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_rubric_group.py +0 -0
  86. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_sandbox_env.py +0 -0
  87. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_sandbox_mixin.py +0 -0
  88. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_setup_script.py +0 -0
  89. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_singleturn_env.py +0 -0
  90. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_stateful_tool_env.py +0 -0
  91. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_think_parser.py +0 -0
  92. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_tool_env.py +0 -0
  93. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_tool_utils.py +0 -0
  94. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_trajectory_processing.py +0 -0
  95. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_tui_info_formatting.py +0 -0
  96. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_types.py +0 -0
  97. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_empty_completions.py +0 -0
  98. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_endpoint_protocols.py +0 -0
  99. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_example_counts.py +0 -0
  100. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_harbor_cli.py +0 -0
  101. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_mini_swe_agent.py +0 -0
  102. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_runtime_lifecycle.py +0 -0
  103. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_scoring_functions.py +0 -0
  104. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_v1_taskset_bindings.py +0 -0
  105. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_wordle_env.py +0 -0
  106. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/tests/test_xml_parser.py +0 -0
  107. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/AGENTS.md +0 -0
  108. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/__init__.py +0 -0
  109. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/commands/__init__.py +0 -0
  110. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/commands/build.py +0 -0
  111. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/commands/eval.py +0 -0
  112. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/commands/gepa.py +0 -0
  113. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/commands/init.py +0 -0
  114. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/commands/install.py +0 -0
  115. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/commands/setup.py +0 -0
  116. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/plugins/__init__.py +0 -0
  117. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/plugins/prime.py +0 -0
  118. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/cli/tui.py +0 -0
  119. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/clients/__init__.py +0 -0
  120. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/clients/anthropic_messages_client.py +0 -0
  121. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/clients/client.py +0 -0
  122. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
  123. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
  124. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/clients/openai_responses_client.py +0 -0
  125. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/clients/renderer_client.py +0 -0
  126. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/decorators.py +0 -0
  127. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/AGENTS.md +0 -0
  128. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/__init__.py +0 -0
  129. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/env_group.py +0 -0
  130. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/README.md +0 -0
  131. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/__init__.py +0 -0
  132. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/cli_agent_env.py +0 -0
  133. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/README.md +0 -0
  134. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/__init__.py +0 -0
  135. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/_filter.py +0 -0
  136. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/composable_env.py +0 -0
  137. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harness.py +0 -0
  138. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
  139. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
  140. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
  141. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  142. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
  143. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
  144. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/task.py +0 -0
  145. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  146. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  147. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
  148. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  149. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  150. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  151. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  152. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
  153. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  154. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
  155. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  156. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
  157. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  158. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  159. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
  160. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
  161. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
  162. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
  163. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
  164. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +0 -0
  165. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
  166. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -0
  167. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
  168. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/gym_env.py +0 -0
  169. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
  170. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/harbor_env/env.py +0 -0
  171. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
  172. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/mcp_env.py +0 -0
  173. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/opencode_env.py +0 -0
  174. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  175. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
  176. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/rlm_env.py +0 -0
  177. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  178. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/utils/__init__.py +0 -0
  179. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/utils/file_locks.py +0 -0
  180. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
  181. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/README.md +0 -0
  182. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/__init__.py +0 -0
  183. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/README.md +0 -0
  184. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  185. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
  186. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  187. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  188. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
  189. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
  190. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/openenv_env.py +0 -0
  191. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  192. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/textarena_env.py +0 -0
  193. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/multiturn_env.py +0 -0
  194. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/python_env.py +0 -0
  195. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/sandbox_env.py +0 -0
  196. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/singleturn_env.py +0 -0
  197. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/stateful_tool_env.py +0 -0
  198. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/envs/tool_env.py +0 -0
  199. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/errors.py +0 -0
  200. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/gepa/__init__.py +0 -0
  201. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/gepa/adapter.py +0 -0
  202. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/gepa/config.py +0 -0
  203. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/gepa/display.py +0 -0
  204. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/gepa/gepa_utils.py +0 -0
  205. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/parsers/__init__.py +0 -0
  206. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/parsers/maybe_think_parser.py +0 -0
  207. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/parsers/parser.py +0 -0
  208. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/parsers/think_parser.py +0 -0
  209. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/parsers/xml_parser.py +0 -0
  210. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/README.md +0 -0
  211. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/__init__.py +0 -0
  212. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/inference/__init__.py +0 -0
  213. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/inference/client.py +0 -0
  214. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/inference/server.py +0 -0
  215. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/__init__.py +0 -0
  216. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/config.py +0 -0
  217. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/orchestrator.py +0 -0
  218. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/trainer.py +0 -0
  219. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/utils.py +0 -0
  220. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rubrics/__init__.py +0 -0
  221. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  222. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rubrics/judge_rubric.py +0 -0
  223. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rubrics/math_rubric.py +0 -0
  224. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rubrics/rubric.py +0 -0
  225. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/rubrics/rubric_group.py +0 -0
  226. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/__init__.py +0 -0
  227. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/build.py +0 -0
  228. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/gepa.py +0 -0
  229. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/install.py +0 -0
  230. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/rl.py +0 -0
  231. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/setup.py +0 -0
  232. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/train.py +0 -0
  233. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/tui.py +0 -0
  234. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/scripts/vllm.py +0 -0
  235. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/serve/__init__.py +0 -0
  236. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/serve/client/env_client.py +0 -0
  237. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/serve/client/zmq_env_client.py +0 -0
  238. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/serve/server/__init__.py +0 -0
  239. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/serve/server/env_router.py +0 -0
  240. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/serve/server/env_server.py +0 -0
  241. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/serve/server/env_worker.py +0 -0
  242. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/serve/server/zmq_env_server.py +0 -0
  243. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/__init__.py +0 -0
  244. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/async_utils.py +0 -0
  245. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/client_utils.py +0 -0
  246. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/config_utils.py +0 -0
  247. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/data_utils.py +0 -0
  248. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/display_utils.py +0 -0
  249. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/env_config_utils.py +0 -0
  250. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/env_utils.py +0 -0
  251. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/error_utils.py +0 -0
  252. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/heartbeat.py +0 -0
  253. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/import_utils.py +0 -0
  254. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/install_utils.py +0 -0
  255. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/interception_utils.py +0 -0
  256. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/logging_utils.py +0 -0
  257. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/message_utils.py +0 -0
  258. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/metric_utils.py +0 -0
  259. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/pricing_utils.py +0 -0
  260. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/process_utils.py +0 -0
  261. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/serve_utils.py +0 -0
  262. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/thread_utils.py +0 -0
  263. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/threaded_sandbox_client.py +0 -0
  264. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/tool_utils.py +0 -0
  265. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/tunnel_utils.py +0 -0
  266. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/usage_utils.py +0 -0
  267. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/utils/version_utils.py +0 -0
  268. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/__init__.py +0 -0
  269. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/env.py +0 -0
  270. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/harness.py +0 -0
  271. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/__init__.py +0 -0
  272. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/__init__.py +0 -0
  273. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/command.py +0 -0
  274. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/configs.py +0 -0
  275. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/mini_swe_agent.py +0 -0
  276. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/opencode.py +0 -0
  277. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/pi.py +0 -0
  278. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/rlm.py +0 -0
  279. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/terminus_2.py +0 -0
  280. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/tasksets/__init__.py +0 -0
  281. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/packages/tasksets/harbor.py +0 -0
  282. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/runtime.py +0 -0
  283. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/state.py +0 -0
  284. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/task.py +0 -0
  285. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/taskset.py +0 -0
  286. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/toolset.py +0 -0
  287. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/types.py +0 -0
  288. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/user.py +0 -0
  289. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/__init__.py +0 -0
  290. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/artifact_utils.py +0 -0
  291. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/binding_utils.py +0 -0
  292. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/config_callable_utils.py +0 -0
  293. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/endpoint_utils.py +0 -0
  294. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/json_utils.py +0 -0
  295. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/judge_utils.py +0 -0
  296. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/lifecycle_utils.py +0 -0
  297. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
  298. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/mcp_utils.py +0 -0
  299. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/object_utils.py +0 -0
  300. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/program_utils.py +0 -0
  301. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/prompt_utils.py +0 -0
  302. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/runtime_registry.py +0 -0
  303. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
  304. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/sandbox_utils.py +0 -0
  305. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/scoring_utils.py +0 -0
  306. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/serialization_utils.py +0 -0
  307. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/task_freeze_utils.py +0 -0
  308. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/taskset_utils.py +0 -0
  309. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/timing_utils.py +0 -0
  310. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/tool_utils.py +0 -0
  311. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/trajectory_utils.py +0 -0
  312. {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev7}/verifiers/v1/utils/usage_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.15.dev5
3
+ Version: 0.1.15.dev7
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -174,6 +174,10 @@ Environments built with Verifiers are self-contained Python modules. To initiali
174
174
  ```bash
175
175
  prime env init my-env # creates a new template in ./environments/my_env
176
176
  ```
177
+ Add an explicit harness loader when the environment owns harness behavior:
178
+ ```bash
179
+ prime env init my-env --with-harness
180
+ ```
177
181
  For OpenEnv integration, use:
178
182
  ```bash
179
183
  prime env init my-openenv --openenv
@@ -191,7 +195,9 @@ environments/my_env/
191
195
  └── README.md # Documentation
192
196
  ```
193
197
 
194
- Environment modules should expose a `load_environment` function which returns an instance of the Environment object, and which can accept custom arguments. For example:
198
+ Environment modules should expose a `load_environment` function which returns an
199
+ environment object. For simple legacy environments, this can still be a direct
200
+ constructor:
195
201
  ```python
196
202
  # my_env.py
197
203
  import verifiers as vf
@@ -223,7 +229,7 @@ def source():
223
229
  async def contains_answer(task, state) -> float:
224
230
  return float(task["answer"] in str(state.get("completion") or ""))
225
231
 
226
- def load_taskset(config: vf.TasksetConfig | None = None):
232
+ def load_taskset(config: vf.TasksetConfig):
227
233
  return vf.Taskset(source=source, rewards=[contains_answer], config=config)
228
234
 
229
235
  def load_environment(config: vf.EnvConfig) -> vf.Env:
@@ -244,8 +250,8 @@ env = vf.Env(
244
250
  ```
245
251
 
246
252
  The same environment package is the unit used by evals and `prime-rl`. The
247
- trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
248
- and harness options stay under `env.taskset` and `env.harness`:
253
+ trainer owns model, endpoint, sampling, and rollout count; v1-specific options
254
+ stay on the taskset or harness config that owns them:
249
255
 
250
256
  ```toml
251
257
  # configs/rl/my-v1-env.toml
@@ -260,12 +266,12 @@ max_tokens = 4096
260
266
  [[env]]
261
267
  id = "my-env"
262
268
 
263
- [env.args]
264
- arg1 = "non-th-arg"
265
-
266
269
  [env.harness]
267
270
  max_turns = 1
268
271
 
272
+ [env.taskset]
273
+ split = "train"
274
+
269
275
  [env.taskset.scoring.contains_answer]
270
276
  weight = 1.0
271
277
  ```
@@ -99,6 +99,10 @@ Environments built with Verifiers are self-contained Python modules. To initiali
99
99
  ```bash
100
100
  prime env init my-env # creates a new template in ./environments/my_env
101
101
  ```
102
+ Add an explicit harness loader when the environment owns harness behavior:
103
+ ```bash
104
+ prime env init my-env --with-harness
105
+ ```
102
106
  For OpenEnv integration, use:
103
107
  ```bash
104
108
  prime env init my-openenv --openenv
@@ -116,7 +120,9 @@ environments/my_env/
116
120
  └── README.md # Documentation
117
121
  ```
118
122
 
119
- Environment modules should expose a `load_environment` function which returns an instance of the Environment object, and which can accept custom arguments. For example:
123
+ Environment modules should expose a `load_environment` function which returns an
124
+ environment object. For simple legacy environments, this can still be a direct
125
+ constructor:
120
126
  ```python
121
127
  # my_env.py
122
128
  import verifiers as vf
@@ -148,7 +154,7 @@ def source():
148
154
  async def contains_answer(task, state) -> float:
149
155
  return float(task["answer"] in str(state.get("completion") or ""))
150
156
 
151
- def load_taskset(config: vf.TasksetConfig | None = None):
157
+ def load_taskset(config: vf.TasksetConfig):
152
158
  return vf.Taskset(source=source, rewards=[contains_answer], config=config)
153
159
 
154
160
  def load_environment(config: vf.EnvConfig) -> vf.Env:
@@ -169,8 +175,8 @@ env = vf.Env(
169
175
  ```
170
176
 
171
177
  The same environment package is the unit used by evals and `prime-rl`. The
172
- trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
173
- and harness options stay under `env.taskset` and `env.harness`:
178
+ trainer owns model, endpoint, sampling, and rollout count; v1-specific options
179
+ stay on the taskset or harness config that owns them:
174
180
 
175
181
  ```toml
176
182
  # configs/rl/my-v1-env.toml
@@ -185,12 +191,12 @@ max_tokens = 4096
185
191
  [[env]]
186
192
  id = "my-env"
187
193
 
188
- [env.args]
189
- arg1 = "non-th-arg"
190
-
191
194
  [env.harness]
192
195
  max_turns = 1
193
196
 
197
+ [env.taskset]
198
+ split = "train"
199
+
194
200
  [env.taskset.scoring.contains_answer]
195
201
  weight = 1.0
196
202
  ```
@@ -13,6 +13,7 @@ import verifiers.scripts.eval as vf_eval
13
13
  import verifiers.utils.eval_utils
14
14
  from verifiers.types import GenerateOutputs
15
15
  from verifiers.utils.eval_utils import load_toml_config
16
+ from verifiers.utils.path_utils import get_eval_results_path
16
17
  from verifiers.utils.save_utils import states_to_outputs
17
18
 
18
19
 
@@ -706,6 +707,34 @@ def test_load_toml_config_multi_env():
706
707
  assert result[1]["env_id"] == "env2"
707
708
 
708
709
 
710
+ def test_load_toml_config_duplicate_envs_accept_names():
711
+ """Duplicate env ids can be labeled and configured independently."""
712
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
713
+ f.write(
714
+ '[[eval]]\nid = "env1"\nname = "env1-short"\n'
715
+ "[eval.args]\n"
716
+ 'split = "short"\n\n'
717
+ '[[eval]]\nid = "env1"\nname = "env1-long"\n'
718
+ "[eval.args]\n"
719
+ 'split = "long"\n'
720
+ )
721
+ f.flush()
722
+ result = load_toml_config(Path(f.name))
723
+
724
+ assert len(result) == 2
725
+ assert [config["env_id"] for config in result] == ["env1", "env1"]
726
+ assert [config["name"] for config in result] == ["env1-short", "env1-long"]
727
+ assert [config["env_args"]["split"] for config in result] == ["short", "long"]
728
+
729
+
730
+ def test_load_toml_config_rejects_global_name():
731
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
732
+ f.write('name = "shared-name"\n\n[[eval]]\nid = "env1"\n')
733
+ f.flush()
734
+ with pytest.raises(ValueError, match="Invalid global field"):
735
+ load_toml_config(Path(f.name))
736
+
737
+
709
738
  def test_load_toml_config_with_env_args():
710
739
  """Multiple sections with env_args field loads correctly."""
711
740
  with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
@@ -815,6 +844,28 @@ def test_cli_multi_env_via_toml_config(monkeypatch, run_cli):
815
844
  assert configs[1].env_id == "env2"
816
845
 
817
846
 
847
+ def test_cli_duplicate_env_names_disambiguate_result_paths(monkeypatch, run_cli):
848
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
849
+ f.write(
850
+ '[[eval]]\nid = "env1"\nname = "env1-short"\n'
851
+ "[eval.args]\n"
852
+ 'split = "short"\n\n'
853
+ '[[eval]]\nid = "env1"\nname = "env1-long"\n'
854
+ "[eval.args]\n"
855
+ 'split = "long"\n'
856
+ )
857
+ f.flush()
858
+ captured = run_cli(monkeypatch, {"env_id_or_config": f.name})
859
+
860
+ configs = captured["configs"]
861
+ assert len(configs) == 2
862
+ assert [config.env_id for config in configs] == ["env1", "env1"]
863
+ assert [config.name for config in configs] == ["env1-short", "env1-long"]
864
+ assert [config.env_args["split"] for config in configs] == ["short", "long"]
865
+ assert get_eval_results_path(configs[0]).parent.name.startswith("env1-short--")
866
+ assert get_eval_results_path(configs[1]).parent.name.startswith("env1-long--")
867
+
868
+
818
869
  def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
819
870
  """TOML config ignores CLI args, uses defaults for unspecified values."""
820
871
  with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
@@ -11,9 +11,11 @@ def make_config(
11
11
  independent_scoring: bool = False,
12
12
  endpoint_id: str | None = None,
13
13
  client_config: ClientConfig | None = None,
14
+ name: str | None = None,
14
15
  ) -> EvalConfig:
15
16
  return EvalConfig(
16
17
  env_id="dummy-env",
18
+ name=name,
17
19
  env_args={},
18
20
  env_dir_path="./environments",
19
21
  endpoint_id=endpoint_id,
@@ -82,6 +84,20 @@ def test_format_client_target_uses_single_resolved_base_url() -> None:
82
84
  assert EvalDisplay._format_client_target(config) == "http://localhost:8001/v1"
83
85
 
84
86
 
87
+ def test_display_uses_eval_name_for_duplicate_env_labels() -> None:
88
+ display = EvalDisplay(
89
+ [
90
+ make_config(max_concurrent=1, name="dummy-env-short"),
91
+ make_config(max_concurrent=1, name="dummy-env-long"),
92
+ ]
93
+ )
94
+
95
+ rendered = render_plain(display._make_compact_env_row(0))
96
+
97
+ assert "dummy-env-short" in rendered
98
+ assert "dummy-env-long" not in rendered
99
+
100
+
85
101
  def render_plain(renderable) -> str:
86
102
  console = Console(width=100, record=True)
87
103
  console.print(renderable)
@@ -87,6 +87,22 @@ def test_print_results_single_rollout(capsys, make_metadata, make_state, make_in
87
87
  assert "r1: [0.1, 0.2, 0.3]" in captured.out
88
88
 
89
89
 
90
+ def test_print_results_includes_eval_name(capsys, make_metadata, make_output):
91
+ from verifiers.utils.eval_utils import print_results
92
+
93
+ metadata = make_metadata(env_id="env1")
94
+ metadata["name"] = "env1-short"
95
+ results = GenerateOutputs(
96
+ outputs=[make_output(example_id=0, reward=1.0)],
97
+ metadata=metadata,
98
+ )
99
+
100
+ print_results(results)
101
+ captured = capsys.readouterr()
102
+
103
+ assert "Environment: env1-short (env1)" in captured.out
104
+
105
+
90
106
  def test_print_results_three_rollouts(capsys, make_metadata, make_state, make_input):
91
107
  """Test print_results with three rollouts per example."""
92
108
  from verifiers.utils.eval_utils import print_results
@@ -57,7 +57,7 @@ def test_wikispeedia_loads_as_v1_taskset_harness(
57
57
  ) -> None:
58
58
  module = load_module(monkeypatch)
59
59
 
60
- env = module.load_environment(config=vf.EnvConfig(), train_size=1, eval_size=1)
60
+ env = module.load_environment(config=module.WikispeediaEnvConfig())
61
61
 
62
62
  assert isinstance(env, vf.Env)
63
63
  assert isinstance(env.taskset, vf.Taskset)
@@ -65,6 +65,43 @@ def test_wikispeedia_loads_as_v1_taskset_harness(
65
65
  assert env.taskset.taskset_id == "langchain-deep-agents-wikispeedia"
66
66
 
67
67
 
68
+ def test_wikispeedia_env_config_reaches_taskset_and_harness(
69
+ monkeypatch: pytest.MonkeyPatch,
70
+ ) -> None:
71
+ module = load_module(monkeypatch)
72
+ wiki = make_small_wiki(module)
73
+ monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
74
+
75
+ env = module.load_environment(
76
+ config=module.WikispeediaEnvConfig(
77
+ taskset={
78
+ "train_size": 2,
79
+ "eval_size": 1,
80
+ "min_path_length": 1,
81
+ "max_path_length": 1,
82
+ "eval_target_fraction": 0.5,
83
+ "allow_go_back": False,
84
+ "links_only": True,
85
+ "max_turns": 7,
86
+ },
87
+ harness={
88
+ "max_turns": 8,
89
+ "timeout_seconds": 9.0,
90
+ },
91
+ )
92
+ )
93
+
94
+ train_rows = list(env.taskset.source())
95
+ eval_rows = list(env.taskset.eval_source())
96
+
97
+ assert len(train_rows) == 2
98
+ assert len(eval_rows) == 1
99
+ assert train_rows[0]["max_turns"] == 7
100
+ assert env.harness.config.max_turns == 8
101
+ assert env.harness.config.timeout_seconds == 9.0
102
+ assert [tool.__name__ for tool in env.taskset.toolsets[0].tools] == ["click_link"]
103
+
104
+
68
105
  def test_wikispeedia_rows_use_v1_task_shape(
69
106
  monkeypatch: pytest.MonkeyPatch,
70
107
  ) -> None:
@@ -90,11 +127,13 @@ def test_wikispeedia_taskset_sources_use_disjoint_target_split(
90
127
  wiki = make_small_wiki(module)
91
128
  monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
92
129
  taskset = module.load_taskset(
93
- train_size=2,
94
- eval_size=1,
95
- min_path_length=1,
96
- max_path_length=1,
97
- eval_target_fraction=0.5,
130
+ config=module.WikispeediaTasksetConfig(
131
+ train_size=2,
132
+ eval_size=1,
133
+ min_path_length=1,
134
+ max_path_length=1,
135
+ eval_target_fraction=0.5,
136
+ )
98
137
  )
99
138
 
100
139
  train_rows = list(taskset.source())
@@ -114,8 +153,12 @@ def test_wikispeedia_efficiency_weight_uses_fresh_reward_wrapper(
114
153
  wiki = make_small_wiki(module)
115
154
  monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
116
155
 
117
- weighted = module.load_taskset(efficiency_weight=0.5)
118
- plain = module.load_taskset(efficiency_weight=0.0)
156
+ weighted = module.load_taskset(
157
+ config=module.WikispeediaTasksetConfig(efficiency_weight=0.5)
158
+ )
159
+ plain = module.load_taskset(
160
+ config=module.WikispeediaTasksetConfig(efficiency_weight=0.0)
161
+ )
119
162
 
120
163
  assert any(fn.__name__ == "path_efficiency" for fn in weighted.rewards)
121
164
  assert any(fn is module.path_efficiency for fn in plain.metrics)
@@ -127,13 +170,17 @@ def test_wikispeedia_taskset_owns_navigation_tools(
127
170
  ) -> None:
128
171
  module = load_module(monkeypatch)
129
172
 
130
- taskset = module.load_taskset(allow_go_back=True)
173
+ taskset = module.load_taskset(
174
+ config=module.WikispeediaTasksetConfig(allow_go_back=True)
175
+ )
131
176
  names = [tool.__name__ for tool in taskset.toolsets[0].tools]
132
- no_back = module.load_taskset(allow_go_back=False)
177
+ no_back = module.load_taskset(
178
+ config=module.WikispeediaTasksetConfig(allow_go_back=False)
179
+ )
133
180
 
134
181
  assert names == ["click_link", "go_back"]
135
182
  assert [tool.__name__ for tool in no_back.toolsets[0].tools] == ["click_link"]
136
- assert module.load_harness().toolsets == []
183
+ assert module.load_harness(config=module.WikispeediaHarnessConfig()).toolsets == []
137
184
 
138
185
 
139
186
  def test_wikispeedia_system_prompt_matches_available_tools(
@@ -141,8 +188,12 @@ def test_wikispeedia_system_prompt_matches_available_tools(
141
188
  ) -> None:
142
189
  module = load_module(monkeypatch)
143
190
 
144
- with_back = module.load_taskset(allow_go_back=True)
145
- without_back = module.load_taskset(allow_go_back=False)
191
+ with_back = module.load_taskset(
192
+ config=module.WikispeediaTasksetConfig(allow_go_back=True)
193
+ )
194
+ without_back = module.load_taskset(
195
+ config=module.WikispeediaTasksetConfig(allow_go_back=False)
196
+ )
146
197
 
147
198
  assert "go_back" in with_back.system_prompt[0]["content"]
148
199
  assert "go_back" not in without_back.system_prompt[0]["content"]
@@ -156,12 +207,16 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime(
156
207
  module = load_module(monkeypatch)
157
208
  wiki = make_small_wiki(module)
158
209
  monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
159
- env = module.load_environment(
160
- config=vf.EnvConfig(),
161
- train_size=2,
162
- eval_size=1,
163
- min_path_length=1,
164
- max_path_length=1,
210
+ env = vf.Env(
211
+ taskset=module.load_taskset(
212
+ config=module.WikispeediaTasksetConfig(
213
+ train_size=2,
214
+ eval_size=1,
215
+ min_path_length=1,
216
+ max_path_length=1,
217
+ )
218
+ ),
219
+ harness=module.load_harness(config=module.WikispeediaHarnessConfig()),
165
220
  )
166
221
  task = module.vf.Task(list(env.taskset.source())[0]).freeze()
167
222
  state = module.vf.State.for_task(task)
@@ -26,7 +26,9 @@ def _load_mcp_search_module() -> Any:
26
26
  def test_mcp_search_env_is_v1_only() -> None:
27
27
  module = _load_mcp_search_module()
28
28
 
29
- env = module.load_environment(config=vf.EnvConfig(), max_turns=4)
29
+ env = module.load_environment(
30
+ config=module.MCPSearchEnvConfig(taskset={"max_turns": 4})
31
+ )
30
32
 
31
33
  assert isinstance(env, vf.Env)
32
34
  assert isinstance(env.taskset, vf.Taskset)
@@ -40,7 +42,7 @@ def test_mcp_search_env_is_v1_only() -> None:
40
42
  def test_mcp_search_default_taskset_has_stable_non_doc_fixture() -> None:
41
43
  module = _load_mcp_search_module()
42
44
 
43
- rows = module.load_taskset().rows()
45
+ rows = module.load_taskset(config=module.MCPSearchTasksetConfig()).rows()
44
46
 
45
47
  assert len(rows) >= 10
46
48
  assert len({row["answer"] for row in rows}) == len(rows)
@@ -52,7 +54,7 @@ def test_mcp_search_taskset_accepts_v1_taskset_config() -> None:
52
54
  module = _load_mcp_search_module()
53
55
 
54
56
  env = module.load_environment(
55
- config=vf.EnvConfig(taskset={"max_turns": 3}),
57
+ config=module.MCPSearchEnvConfig(taskset={"max_turns": 3}),
56
58
  )
57
59
  rows = env.taskset.rows()
58
60
 
@@ -28,7 +28,7 @@ def _load_opencode_module() -> Any:
28
28
  def test_load_environment_uses_v1_taskset_and_harness() -> None:
29
29
  module = _load_opencode_module()
30
30
 
31
- env = module.load_environment(config=vf.EnvConfig())
31
+ env = module.load_environment(config=module.OpenCodeHarborEnvConfig())
32
32
 
33
33
  assert isinstance(env, vf.Env)
34
34
  assert isinstance(env.taskset, vf.HarborTaskset)
@@ -52,7 +52,7 @@ def test_load_environment_accepts_v1_taskset_and_harness_config() -> None:
52
52
  module = _load_opencode_module()
53
53
 
54
54
  env = module.load_environment(
55
- config=vf.EnvConfig(
55
+ config=module.OpenCodeHarborEnvConfig(
56
56
  taskset={
57
57
  "task_names": ["task-a"],
58
58
  "cpu_cores": 1.5,
@@ -3,6 +3,7 @@ from pathlib import Path
3
3
 
4
4
  from verifiers.utils.path_utils import (
5
5
  find_latest_incomplete_eval_results_path,
6
+ get_eval_runs_dir,
6
7
  is_valid_eval_results_path,
7
8
  )
8
9
 
@@ -69,6 +70,19 @@ def test_find_latest_incomplete_eval_results_path_returns_none_when_no_match(
69
70
  assert result is None
70
71
 
71
72
 
73
+ def test_get_eval_runs_dir_uses_name_as_result_label(tmp_path: Path):
74
+ runs_dir = get_eval_runs_dir(
75
+ env_id="dummy-env",
76
+ name="dummy-env-short",
77
+ model="openai/gpt-4.1-mini",
78
+ output_dir=str(tmp_path / "outputs"),
79
+ )
80
+
81
+ assert runs_dir == (
82
+ tmp_path / "outputs" / "evals" / "dummy-env-short--openai--gpt-4.1-mini"
83
+ )
84
+
85
+
72
86
  def test_is_valid_eval_results_path_requires_files(tmp_path: Path):
73
87
  run_dir = tmp_path / "run"
74
88
  run_dir.mkdir()
@@ -32,6 +32,7 @@ from verifiers.utils.save_utils import (
32
32
  make_serializable,
33
33
  save_new_outputs,
34
34
  states_to_outputs,
35
+ truncate_malformed_trailing_line,
35
36
  validate_resume_metadata,
36
37
  )
37
38
  from verifiers.utils.usage_utils import StateUsageTracker, response_usage_tokens
@@ -488,6 +489,9 @@ class TestSaveNewOutputs:
488
489
  "\n".join(lines + [malformed_trailing_line]), encoding="utf-8"
489
490
  )
490
491
 
492
+ # Caller drops the partial trailing row before appending so the new
493
+ # row lands on a valid JSONL boundary.
494
+ truncate_malformed_trailing_line(outputs_path)
491
495
  save_new_outputs(
492
496
  [{"example_id": 3, "label": "row-3"}],
493
497
  results_path,
@@ -75,12 +75,12 @@ def test_bfcl_public_loader_is_v1_only(monkeypatch: pytest.MonkeyPatch) -> None:
75
75
  seen_taskset_config: vf.TasksetConfig | None = None
76
76
  seen_harness_config: vf.HarnessConfig | None = None
77
77
 
78
- def fake_taskset(config: vf.TasksetConfig | None = None) -> vf.Taskset:
78
+ def fake_taskset(config: vf.TasksetConfig) -> vf.Taskset:
79
79
  nonlocal seen_taskset_config
80
80
  seen_taskset_config = config
81
81
  return vf.Taskset(source=[], config=config)
82
82
 
83
- def fake_harness(config: vf.HarnessConfig | None = None) -> vf.Harness:
83
+ def fake_harness(config: vf.HarnessConfig) -> vf.Harness:
84
84
  nonlocal seen_harness_config
85
85
  seen_harness_config = config
86
86
  return vf.Harness(config=config)
@@ -89,9 +89,13 @@ def test_bfcl_public_loader_is_v1_only(monkeypatch: pytest.MonkeyPatch) -> None:
89
89
  monkeypatch.setattr(bfcl, "load_harness", fake_harness)
90
90
 
91
91
  env = bfcl.load_environment(
92
- config=vf.EnvConfig(),
93
- test_category="simple_python",
94
- examples_per_category=0,
92
+ config=bfcl.BFCLEnvConfig(
93
+ taskset=bfcl.BFCLTasksetConfig(
94
+ test_category="simple_python",
95
+ examples_per_category=0,
96
+ ),
97
+ harness=bfcl.BFCLHarnessConfig(),
98
+ )
95
99
  )
96
100
 
97
101
  assert isinstance(env, vf.Env)
@@ -110,12 +114,12 @@ def test_bfcl_loader_supports_category_groups(
110
114
  seen_taskset_categories = []
111
115
  seen_harness_categories = []
112
116
 
113
- def fake_taskset(config: vf.TasksetConfig | None = None) -> vf.Taskset:
117
+ def fake_taskset(config: vf.TasksetConfig) -> vf.Taskset:
114
118
  assert isinstance(config, bfcl.BFCLTasksetConfig)
115
119
  seen_taskset_categories.append(config.test_category)
116
120
  return vf.Taskset(source=[{"question": "q", "answer": "a"}], config=config)
117
121
 
118
- def fake_harness(config: vf.HarnessConfig | None = None) -> vf.Harness:
122
+ def fake_harness(config: vf.HarnessConfig) -> vf.Harness:
119
123
  assert isinstance(config, bfcl.BFCLHarnessConfig)
120
124
  seen_harness_categories.append(config.test_category)
121
125
  return vf.Harness(config=config)
@@ -124,9 +128,13 @@ def test_bfcl_loader_supports_category_groups(
124
128
  monkeypatch.setattr(bfcl, "load_harness", fake_harness)
125
129
 
126
130
  env = bfcl.load_environment(
127
- config=vf.EnvConfig(),
128
- test_categories=["simple_python", "simple_java"],
129
- examples_per_category=0,
131
+ config=bfcl.BFCLEnvConfig(
132
+ taskset=bfcl.BFCLTasksetConfig(
133
+ test_categories=["simple_python", "simple_java"],
134
+ examples_per_category=0,
135
+ ),
136
+ harness=bfcl.BFCLHarnessConfig(),
137
+ )
130
138
  )
131
139
 
132
140
  assert isinstance(env, root_vf.EnvGroup)