verifiers 0.1.15.dev170__tar.gz → 0.1.15.dev171__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/PKG-INFO +1 -1
  2. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/README.md +3 -1
  3. verifiers-0.1.15.dev171/verifiers/envs/experimental/composable/tasksets/search/__init__.py +15 -0
  4. verifiers-0.1.15.dev171/verifiers/envs/experimental/composable/tasksets/search/redsearcher/README.md +38 -0
  5. verifiers-0.1.15.dev171/verifiers/envs/experimental/composable/tasksets/search/redsearcher/__init__.py +5 -0
  6. verifiers-0.1.15.dev171/verifiers/envs/experimental/composable/tasksets/search/redsearcher/taskset.py +556 -0
  7. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/search_tasksets.py +10 -0
  8. verifiers-0.1.15.dev170/verifiers/envs/experimental/composable/tasksets/search/__init__.py +0 -9
  9. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/.gitignore +0 -0
  10. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/LICENSE +0 -0
  11. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/README.md +0 -0
  12. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/pyproject.toml +0 -0
  13. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/AGENTS.md +0 -0
  14. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/README.md +0 -0
  15. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/__init__.py +0 -0
  16. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/conftest.py +0 -0
  17. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_browser_env.py +0 -0
  18. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_build_script.py +0 -0
  19. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_cli_agent_env.py +0 -0
  20. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_client_auth_errors.py +0 -0
  21. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_client_config.py +0 -0
  22. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_client_multimodal_types.py +0 -0
  23. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_composable_env.py +0 -0
  24. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_context_token_metrics.py +0 -0
  25. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_decorator_ranks.py +0 -0
  26. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_endpoint_registry.py +0 -0
  27. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_env_group.py +0 -0
  28. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_env_server.py +0 -0
  29. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_environment.py +0 -0
  30. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_environment_extra.py +0 -0
  31. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_envs.py +0 -0
  32. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_error_chain.py +0 -0
  33. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_eval_cli.py +0 -0
  34. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_eval_display.py +0 -0
  35. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_eval_utils.py +0 -0
  36. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_gepa_cli.py +0 -0
  37. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_gepa_utils.py +0 -0
  38. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_gym_env.py +0 -0
  39. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_harbor_env_mcp.py +0 -0
  40. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_imports.py +0 -0
  41. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_init_script.py +0 -0
  42. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_install_utils.py +0 -0
  43. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_interception_utils.py +0 -0
  44. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
  45. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_lean_task.py +0 -0
  46. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_logging.py +0 -0
  47. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_math_rubric.py +0 -0
  48. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_maybe_think_parser.py +0 -0
  49. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_mcp_search_env.py +0 -0
  50. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_message_utils.py +0 -0
  51. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_message_utils_multimodal.py +0 -0
  52. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_multiturn_env.py +0 -0
  53. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_nemorl_client.py +0 -0
  54. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_openai_chat_completions_token_client.py +0 -0
  55. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_openai_responses_client.py +0 -0
  56. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_opencode_harbor.py +0 -0
  57. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_opencode_rlm_env.py +0 -0
  58. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_openenv_client.py +0 -0
  59. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_parser.py +0 -0
  60. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_path_utils.py +0 -0
  61. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_per_turn_timing.py +0 -0
  62. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_pricing_utils.py +0 -0
  63. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_prime_plugin.py +0 -0
  64. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_renderer_client.py +0 -0
  65. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_renderer_e2e.py +0 -0
  66. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_rlm_composable_env.py +0 -0
  67. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_rubric.py +0 -0
  68. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_rubric_group.py +0 -0
  69. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_sandbox_env.py +0 -0
  70. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_sandbox_mixin.py +0 -0
  71. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_save_utils.py +0 -0
  72. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_setup_script.py +0 -0
  73. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_singleturn_env.py +0 -0
  74. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_stateful_tool_env.py +0 -0
  75. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_think_parser.py +0 -0
  76. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_tool_env.py +0 -0
  77. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_tool_utils.py +0 -0
  78. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_trajectory_processing.py +0 -0
  79. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_tui_info_formatting.py +0 -0
  80. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_types.py +0 -0
  81. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_bfcl.py +0 -0
  82. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_config_extension.py +0 -0
  83. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_empty_completions.py +0 -0
  84. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_endpoint_protocols.py +0 -0
  85. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_example_counts.py +0 -0
  86. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_group_reward_env.py +0 -0
  87. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_harbor_cli.py +0 -0
  88. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_mini_swe_agent.py +0 -0
  89. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_nemo_gym_harness.py +0 -0
  90. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_openenv_taskset.py +0 -0
  91. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_openreward_taskset.py +0 -0
  92. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_replay_harness.py +0 -0
  93. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_rlm_swe.py +0 -0
  94. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_runtime_lifecycle.py +0 -0
  95. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_scoring_functions.py +0 -0
  96. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_taskset_bindings.py +0 -0
  97. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_taskset_utils.py +0 -0
  98. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_v1_textarena_taskset.py +0 -0
  99. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_wiki_search_v1.py +0 -0
  100. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_wordle_env.py +0 -0
  101. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_wordle_v1_env.py +0 -0
  102. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/tests/test_xml_parser.py +0 -0
  103. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/AGENTS.md +0 -0
  104. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/__init__.py +0 -0
  105. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/__init__.py +0 -0
  106. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/commands/__init__.py +0 -0
  107. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/commands/build.py +0 -0
  108. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/commands/eval.py +0 -0
  109. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/commands/gepa.py +0 -0
  110. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/commands/init.py +0 -0
  111. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/commands/install.py +0 -0
  112. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/commands/setup.py +0 -0
  113. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/plugins/__init__.py +0 -0
  114. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/plugins/prime.py +0 -0
  115. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/cli/tui.py +0 -0
  116. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/clients/__init__.py +0 -0
  117. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/clients/anthropic_messages_client.py +0 -0
  118. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/clients/client.py +0 -0
  119. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
  120. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/clients/openai_chat_completions_client.py +0 -0
  121. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
  122. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/clients/openai_completions_client.py +0 -0
  123. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/clients/openai_responses_client.py +0 -0
  124. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/clients/renderer_client.py +0 -0
  125. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/decorators.py +0 -0
  126. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/AGENTS.md +0 -0
  127. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/__init__.py +0 -0
  128. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/env_group.py +0 -0
  129. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/environment.py +0 -0
  130. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/README.md +0 -0
  131. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/__init__.py +0 -0
  132. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/cli_agent_env.py +0 -0
  133. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/README.md +0 -0
  134. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/__init__.py +0 -0
  135. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/_filter.py +0 -0
  136. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/composable_env.py +0 -0
  137. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/harness.py +0 -0
  138. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
  139. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
  140. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
  141. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  142. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
  143. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
  144. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/task.py +0 -0
  145. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  146. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  147. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
  148. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  149. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  150. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  151. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  152. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
  153. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  154. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
  155. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/openseeker/README.md +0 -0
  156. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/openseeker/__init__.py +0 -0
  157. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/openseeker/taskset.py +0 -0
  158. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/README.md +0 -0
  159. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/__init__.py +0 -0
  160. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/__init__.py +0 -0
  161. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/api_tools/__init__.py +0 -0
  162. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/api_tools/tool_pdf.py +0 -0
  163. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/eval_toolkit.py +0 -0
  164. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/evaluator.py +0 -0
  165. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/llm_client/__init__.py +0 -0
  166. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/llm_client/base_client.py +0 -0
  167. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/prompts/__init__.py +0 -0
  168. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/prompts/cache_prompts.py +0 -0
  169. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/__init__.py +0 -0
  170. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/cache_filesys.py +0 -0
  171. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/load_eval_script.py +0 -0
  172. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/misc.py +0 -0
  173. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/tool_visit.py +0 -0
  174. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/url_tools.py +0 -0
  175. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/verification_tree.py +0 -0
  176. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/search/quest/taskset.py +0 -0
  177. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/README.md +0 -0
  178. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  179. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/__init__.py +0 -0
  180. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/extract_fix_patch.sh +0 -0
  181. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/taskset.py +0 -0
  182. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/openswe/__init__.py +0 -0
  183. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/openswe/taskset.py +0 -0
  184. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/__init__.py +0 -0
  185. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/log_parser.py +0 -0
  186. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/taskset.py +0 -0
  187. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/scale_swe/__init__.py +0 -0
  188. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/scale_swe/taskset.py +0 -0
  189. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/shared/__init__.py +0 -0
  190. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/shared/test_patch.py +0 -0
  191. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench/__init__.py +0 -0
  192. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench/taskset.py +0 -0
  193. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego/__init__.py +0 -0
  194. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego/taskset.py +0 -0
  195. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/__init__.py +0 -0
  196. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/log_parsers.py +0 -0
  197. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/taskset.py +0 -0
  198. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith/__init__.py +0 -0
  199. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith/taskset.py +0 -0
  200. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
  201. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/gym_env.py +0 -0
  202. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
  203. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/harbor_env/env.py +0 -0
  204. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
  205. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/mcp_env.py +0 -0
  206. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/opencode_env.py +0 -0
  207. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  208. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
  209. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  210. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/utils/__init__.py +0 -0
  211. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/utils/file_locks.py +0 -0
  212. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
  213. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/README.md +0 -0
  214. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/__init__.py +0 -0
  215. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/browser_env/README.md +0 -0
  216. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  217. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
  218. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  219. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  220. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
  221. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
  222. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/openenv_env.py +0 -0
  223. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  224. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/integrations/textarena_env.py +0 -0
  225. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/multiturn_env.py +0 -0
  226. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/python_env.py +0 -0
  227. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/sandbox_env.py +0 -0
  228. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/singleturn_env.py +0 -0
  229. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/stateful_tool_env.py +0 -0
  230. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/envs/tool_env.py +0 -0
  231. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/errors.py +0 -0
  232. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/gepa/__init__.py +0 -0
  233. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/gepa/adapter.py +0 -0
  234. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/gepa/config.py +0 -0
  235. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/gepa/display.py +0 -0
  236. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/gepa/gepa_utils.py +0 -0
  237. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/parsers/__init__.py +0 -0
  238. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/parsers/maybe_think_parser.py +0 -0
  239. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/parsers/parser.py +0 -0
  240. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/parsers/think_parser.py +0 -0
  241. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/parsers/xml_parser.py +0 -0
  242. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/README.md +0 -0
  243. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/__init__.py +0 -0
  244. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/inference/__init__.py +0 -0
  245. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/inference/client.py +0 -0
  246. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/inference/server.py +0 -0
  247. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/trainer/__init__.py +0 -0
  248. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/trainer/config.py +0 -0
  249. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/trainer/orchestrator.py +0 -0
  250. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/trainer/trainer.py +0 -0
  251. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rl/trainer/utils.py +0 -0
  252. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rubrics/__init__.py +0 -0
  253. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  254. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rubrics/judge_rubric.py +0 -0
  255. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rubrics/math_rubric.py +0 -0
  256. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rubrics/rubric.py +0 -0
  257. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/rubrics/rubric_group.py +0 -0
  258. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/__init__.py +0 -0
  259. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/build.py +0 -0
  260. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/eval.py +0 -0
  261. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/gepa.py +0 -0
  262. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/init.py +0 -0
  263. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/install.py +0 -0
  264. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/rl.py +0 -0
  265. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/setup.py +0 -0
  266. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/train.py +0 -0
  267. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/tui.py +0 -0
  268. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/scripts/vllm.py +0 -0
  269. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/serve/__init__.py +0 -0
  270. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/serve/client/env_client.py +0 -0
  271. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/serve/client/zmq_env_client.py +0 -0
  272. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/serve/server/__init__.py +0 -0
  273. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/serve/server/env_router.py +0 -0
  274. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/serve/server/env_server.py +0 -0
  275. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/serve/server/env_worker.py +0 -0
  276. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/serve/server/zmq_env_server.py +0 -0
  277. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/serve/types.py +0 -0
  278. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/types.py +0 -0
  279. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/__init__.py +0 -0
  280. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/async_utils.py +0 -0
  281. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/client_utils.py +0 -0
  282. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/config_utils.py +0 -0
  283. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/data_utils.py +0 -0
  284. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/display_utils.py +0 -0
  285. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/env_config_utils.py +0 -0
  286. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/env_utils.py +0 -0
  287. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/error_utils.py +0 -0
  288. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/eval_display.py +0 -0
  289. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/eval_utils.py +0 -0
  290. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/heartbeat.py +0 -0
  291. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/import_utils.py +0 -0
  292. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/install_utils.py +0 -0
  293. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/interception_utils.py +0 -0
  294. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/logging_utils.py +0 -0
  295. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/message_utils.py +0 -0
  296. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/metric_utils.py +0 -0
  297. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/path_utils.py +0 -0
  298. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/pricing_utils.py +0 -0
  299. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/process_utils.py +0 -0
  300. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/response_utils.py +0 -0
  301. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/save_utils.py +0 -0
  302. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/serve_utils.py +0 -0
  303. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/thread_utils.py +0 -0
  304. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/threaded_sandbox_client.py +0 -0
  305. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/tool_utils.py +0 -0
  306. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/usage_utils.py +0 -0
  307. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/utils/version_utils.py +0 -0
  308. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +0 -0
  309. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/README.md +0 -0
  310. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/RE_MIGRATION.md +0 -0
  311. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/__init__.py +0 -0
  312. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/artifact.py +0 -0
  313. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/config.py +0 -0
  314. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/env.py +0 -0
  315. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/harness.py +0 -0
  316. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/model.py +0 -0
  317. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/program.py +0 -0
  318. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/runtime.py +0 -0
  319. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/runtime_handles.py +0 -0
  320. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/sandbox.py +0 -0
  321. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/state.py +0 -0
  322. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/task.py +0 -0
  323. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/taskset.py +0 -0
  324. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/toolset.py +0 -0
  325. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/types.py +0 -0
  326. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/user.py +0 -0
  327. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/__init__.py +0 -0
  328. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/binding_utils.py +0 -0
  329. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/config_callable_utils.py +0 -0
  330. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/config_utils.py +0 -0
  331. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/endpoint_utils.py +0 -0
  332. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/json_utils.py +0 -0
  333. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/judge_utils.py +0 -0
  334. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/lifecycle_utils.py +0 -0
  335. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/logging_utils.py +0 -0
  336. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
  337. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/mcp_utils.py +0 -0
  338. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/object_utils.py +0 -0
  339. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/program_utils.py +0 -0
  340. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/prompt_utils.py +0 -0
  341. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/runtime_owner_utils.py +0 -0
  342. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/runtime_registry.py +0 -0
  343. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
  344. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/sandbox_python_utils.py +0 -0
  345. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/sandbox_utils.py +0 -0
  346. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/scoring_utils.py +0 -0
  347. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/serialization_utils.py +0 -0
  348. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/task_freeze_utils.py +0 -0
  349. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/taskset_utils.py +0 -0
  350. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/tool_utils.py +0 -0
  351. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/toolset_utils.py +0 -0
  352. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/trajectory_utils.py +0 -0
  353. {verifiers-0.1.15.dev170 → verifiers-0.1.15.dev171}/verifiers/v1/utils/usage_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.15.dev170
3
+ Version: 0.1.15.dev171
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -10,6 +10,7 @@ The search family is intentionally backend-oriented, mirroring the SWE taskset p
10
10
  |---|---|---|---|
11
11
  | `openseeker` | [PolarSeeker/OpenSeeker](https://github.com/PolarSeeker/OpenSeeker) | [`PolarSeeker/OpenSeeker-v1-Data`](https://huggingface.co/datasets/PolarSeeker/OpenSeeker-v1-Data) | Binary semantic answer judge |
12
12
  | `quest` | [OSU-NLP-Group/QUEST](https://github.com/OSU-NLP-Group/QUEST) | [`osunlp/QUEST-RL-Data`](https://huggingface.co/datasets/osunlp/QUEST-RL-Data) | Objective tasks supported |
13
+ | `redsearcher` | [RedSearchAgent/REDSearcher](https://github.com/RedSearchAgent/REDSearcher) | [`Zchu/REDSearcher_RL_1K`](https://huggingface.co/datasets/Zchu/REDSearcher_RL_1K) | Text RL query set supported |
13
14
 
14
15
  ## Usage
15
16
 
@@ -18,13 +19,14 @@ from verifiers.envs.experimental.composable.tasksets.search import make_search_t
18
19
 
19
20
  taskset = make_search_taskset(backend="openseeker")
20
21
  taskset = make_search_taskset(backend="quest", category="objective")
22
+ redsearcher = make_search_taskset(backend="redsearcher", difficulty="easy")
21
23
  ```
22
24
 
23
25
  `make_search_taskset()` dispatches by backend name. Unknown backends raise `ValueError` with the available backend list.
24
26
 
25
27
  ## Output Contract
26
28
 
27
- Search tasksets should define their own output contract. The `quest` and `openseeker` backends expect the agent to write one final researched response to `/task/answer.txt`, including supporting URLs/citations when available. Scratch reasoning, tool traces, and logs should not be written as the final answer.
29
+ Search tasksets should define their own output contract. The `quest`, `openseeker`, and `redsearcher` backends expect the agent to write one final researched response to `/task/answer.txt`, including supporting URLs/citations when available. Scratch reasoning, tool traces, and logs should not be written as the final answer.
28
30
 
29
31
  ## Error Handling
30
32
 
@@ -0,0 +1,15 @@
1
+ """Composable search/research tasksets."""
2
+
3
+ from .search_tasksets import (
4
+ make_openseeker_taskset,
5
+ make_quest_taskset,
6
+ make_redsearcher_taskset,
7
+ make_search_taskset,
8
+ )
9
+
10
+ __all__ = [
11
+ "make_openseeker_taskset",
12
+ "make_quest_taskset",
13
+ "make_redsearcher_taskset",
14
+ "make_search_taskset",
15
+ ]
@@ -0,0 +1,38 @@
1
+ # REDSearcher Search Taskset
2
+
3
+ Text RL queries from REDSearcher ported into the composable search taskset framework.
4
+
5
+ ## Source
6
+
7
+ - Dataset: [`Zchu/REDSearcher_RL_1K`](https://huggingface.co/datasets/Zchu/REDSearcher_RL_1K)
8
+ - Collection: [`Zchu/redsearcher`](https://huggingface.co/collections/Zchu/redsearcher)
9
+ - Upstream project: [`RedSearchAgent/REDSearcher`](https://github.com/RedSearchAgent/REDSearcher)
10
+ - Paper: [`arXiv:2602.14234`](https://arxiv.org/abs/2602.14234)
11
+
12
+ The released text RL dataset contains 1,000 rows with `problem`, `answer`, and `difficulty` columns. The upstream REDSearcher repo describes converting each row into a Slime-style `prompt` plus `label`; this taskset keeps the same problem/answer boundary while adapting it to Verifiers' taskset format.
13
+
14
+ ## Task Contract
15
+
16
+ Each example is a long-horizon web-search question. The agent should research across sources and produce one final answer in `/task/answer.txt`, with supporting URLs/citations when available.
17
+
18
+ The paired `rlm_search` environment prompts RLM to write this file and provides web search/open-page skills. If the answer file is missing or empty, the rubric falls back to the assistant text from the completion (all non-empty assistant messages, joined in order), but agents should still write the file directly.
19
+
20
+ ## Scoring
21
+
22
+ `RedSearcherRubric` compares the final response against the released `answer` label. It first applies a strict normalized exact-answer shortcut for unambiguous matches. Otherwise it uses an OpenAI-compatible LLM-as-judge prompt that follows the answer-matching convention in REDSearcher's DeepTraceHub evaluation code: judge whether the predicted final answer is equivalent to the ground truth and return binary accuracy.
23
+
24
+ A reward of `1.0` means the final response matched the ground-truth answer; `0.0` means it did not, or no final answer was produced. Judge provider failures are preserved as `vf.Error` values on `state["error"]`.
25
+
26
+ ## Common Arguments
27
+
28
+ | Argument | Default | Description |
29
+ |---|---:|---|
30
+ | `dataset_name` | `Zchu/REDSearcher_RL_1K` | Hugging Face dataset name. |
31
+ | `split` | `train` | Dataset split. |
32
+ | `difficulty` | `None` | Optional difficulty filter: `easy`, `medium`, `hard`, or `all`. |
33
+ | `answer_file` | `/task/answer.txt` | Final answer path in the sandbox. |
34
+ | `judge_model` | `openai/gpt-5.4-mini` | OpenAI-compatible model for answer-match judging. |
35
+ | `judge_base_url` | `https://api.pinference.ai/api/v1` | Judge API base URL. |
36
+ | `judge_api_key_var` | `PRIME_API_KEY` | Env var containing the judge API key. |
37
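+ | `judge_max_retries` | `5` | Maximum number of judge attempts (at least one is always made). An attempt is consumed by either a transient request failure or a response that cannot be parsed as A/B. |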
38
+ | `use_exact_match_shortcut` | `True` | Return `1.0` without an LLM call when the normalized final response exactly equals the normalized ground-truth answer. |
@@ -0,0 +1,5 @@
1
+ """REDSearcher search taskset."""
2
+
3
+ from .taskset import RedSearcherRubric, RedSearcherTaskSet
4
+
5
+ __all__ = ["RedSearcherRubric", "RedSearcherTaskSet"]
@@ -0,0 +1,556 @@
1
+ """REDSearcher composable search taskset.
2
+
3
+ This ports the released REDSearcher text RL query set into the composable
4
+ search taskset family. The public artifact is a simple QA dataset, so scoring
5
+ uses the paper/repo's answer-matching LLM-as-judge convention rather than
6
+ dataset-provided verifier scripts.
7
+ """
8
+
9
+ import asyncio
10
+ import logging
11
+ import os
12
+ import re
13
+ import unicodedata
14
+ from typing import Any, NoReturn
15
+
16
+ import verifiers as vf
17
+ from datasets import Dataset, load_dataset
18
+ from openai import (
19
+ APIConnectionError,
20
+ APIResponseValidationError,
21
+ APITimeoutError,
22
+ APIStatusError,
23
+ AsyncOpenAI,
24
+ AuthenticationError,
25
+ BadRequestError,
26
+ ConflictError,
27
+ ContentFilterFinishReasonError,
28
+ InternalServerError,
29
+ LengthFinishReasonError,
30
+ NotFoundError,
31
+ PermissionDeniedError,
32
+ RateLimitError,
33
+ UnprocessableEntityError,
34
+ )
35
+ from verifiers.envs.experimental.composable import SandboxSpec, SandboxTaskSet
36
+ from verifiers.types import ClientConfig
37
+ from verifiers.utils.client_utils import setup_openai_client
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+ DEFAULT_DATASET_NAME = "Zchu/REDSearcher_RL_1K"
42
+ DEFAULT_SPLIT = "train"
43
+ DEFAULT_ANSWER_FILE = "/task/answer.txt"
44
+ DEFAULT_WORKDIR = "/workspace"
45
+ DEFAULT_JUDGE_BASE_URL = "https://api.pinference.ai/api/v1"
46
+ DEFAULT_JUDGE_API_KEY_VAR = "PRIME_API_KEY"
47
+ DEFAULT_JUDGE_MODEL = "openai/gpt-5.4-mini"
48
+ DEFAULT_SANDBOX_IMAGE = "python:3.11-slim"
49
+
50
+ _JUDGE_PROMPT = """\
51
+ You are grading a deep-search question answering response.
52
+
53
+ Decide whether the predicted response gives the same final answer as the
54
+ ground-truth answer. Ignore citations, formatting, capitalization, and extra
55
+ explanation unless they contradict the final answer. For numeric answers,
56
+ allow insignificant formatting differences but not a different value. If the
57
+ response gives multiple incompatible answers, is evasive, or merely repeats
58
+ the question, mark it incorrect.
59
+
60
+ Question:
61
+ {question}
62
+
63
+ Ground-truth answer:
64
+ {answer}
65
+
66
+ Predicted response:
67
+ {response}
68
+
69
+ Return only one letter:
70
+ A. CORRECT
71
+ B. INCORRECT
72
+ """
73
+
74
+ _CONTEXT_LENGTH_ERROR_PHRASES = (
75
+ "this model's maximum context length is",
76
+ "is longer than the model's context length",
77
+ "is longer than the maximum model length",
78
+ "exceeds the model's context length",
79
+ "exceed the configured limit",
80
+ "exceeds the configured limit",
81
+ "exceeded model",
82
+ "prompt_too_long",
83
+ "context length",
84
+ "maximum model length",
85
+ )
86
+
87
+ _REDSEARCHER_JUDGE_ERROR_TYPES = (
88
+ APIConnectionError,
89
+ APIResponseValidationError,
90
+ APITimeoutError,
91
+ APIStatusError,
92
+ AuthenticationError,
93
+ BadRequestError,
94
+ ConflictError,
95
+ ContentFilterFinishReasonError,
96
+ InternalServerError,
97
+ LengthFinishReasonError,
98
+ NotFoundError,
99
+ PermissionDeniedError,
100
+ RateLimitError,
101
+ UnprocessableEntityError,
102
+ )
103
+
104
+ _REDSEARCHER_TRANSIENT_JUDGE_ERROR_TYPES = (
105
+ APIConnectionError,
106
+ APITimeoutError,
107
+ ConflictError,
108
+ InternalServerError,
109
+ RateLimitError,
110
+ )
111
+
112
+
113
def _is_context_length_error(exc: BadRequestError) -> bool:
    """Return True when a bad-request error reads like a context-window overflow.

    The text of ``exc.response`` (when that attribute exists and is non-empty)
    and the string form of ``exc`` are combined, lower-cased, and searched for
    any phrase listed in ``_CONTEXT_LENGTH_ERROR_PHRASES``.
    """
    http_response = getattr(exc, "response", None)
    body = getattr(http_response, "text", "")
    if not body:
        # A missing or falsy body contributes nothing to the haystack.
        body = ""
    haystack = f"{body}\n{exc}".lower()
    for phrase in _CONTEXT_LENGTH_ERROR_PHRASES:
        if phrase in haystack:
            return True
    return False
118
+
119
+
120
def _raise_redsearcher_judge_error(exc: Exception, *, model: str) -> NoReturn:
    """Re-raise an OpenAI SDK failure from the judge call as a ``vf`` error.

    The checks run in order, so more specific classifications win:

    1. ``BadRequestError`` whose text looks like a context overflow
       -> ``vf.OverlongPromptError``.
    2. Any type in ``_REDSEARCHER_TRANSIENT_JUDGE_ERROR_TYPES``
       -> ``vf.InfraError``.
    3. ``APIResponseValidationError`` or ``LengthFinishReasonError``
       -> ``vf.InvalidModelResponseError``.
    4. Remaining request/status failures -> ``vf.ModelError``.

    Args:
        exc: The exception caught around the judge request.
        model: Judge model name, included in the raised message.

    Raises:
        vf.OverlongPromptError, vf.InfraError, vf.InvalidModelResponseError,
        vf.ModelError: As described above, always chained ``from exc``.
        AssertionError: If ``exc`` is of a type none of the branches handle.
    """
    if isinstance(exc, BadRequestError) and _is_context_length_error(exc):
        raise vf.OverlongPromptError(
            f"REDSearcher judge prompt exceeded model context for {model}: {exc}"
        ) from exc
    # Reuse the module-level tuple that the retry loop also consults, so the
    # "retryable" and "infra error" classifications cannot drift apart.
    if isinstance(exc, _REDSEARCHER_TRANSIENT_JUDGE_ERROR_TYPES):
        raise vf.InfraError(
            f"REDSearcher judge transient request failed for {model}: {exc}"
        ) from exc
    if isinstance(exc, APIResponseValidationError):
        raise vf.InvalidModelResponseError(
            f"REDSearcher judge SDK response validation failed for {model}: {exc}"
        ) from exc
    if isinstance(exc, LengthFinishReasonError):
        raise vf.InvalidModelResponseError(
            f"REDSearcher judge stopped due to length for {model}: {exc}"
        ) from exc
    if isinstance(
        exc,
        (
            AuthenticationError,
            PermissionDeniedError,
            NotFoundError,
            UnprocessableEntityError,
            ContentFilterFinishReasonError,
            BadRequestError,
            APIStatusError,
        ),
    ):
        raise vf.ModelError(
            f"REDSearcher judge request failed for {model}: {exc}"
        ) from exc
    raise AssertionError(
        f"Unhandled REDSearcher judge exception type: {type(exc).__name__}"
    ) from exc
164
+
165
+
166
def _completion_text(state: vf.State) -> str:
    """Extract assistant-authored text from ``state["completion"]``.

    A string completion is returned stripped. A list completion is scanned for
    messages (dicts or objects exposing ``role``/``content``) whose role is
    ``"assistant"`` and whose content is a non-blank string; those contents
    are stripped and joined with blank lines. Any other completion value
    yields an empty string.
    """
    completion = state.get("completion")
    if isinstance(completion, str):
        return completion.strip()
    if not isinstance(completion, list):
        return ""

    def _role_and_content(message):
        # Messages may be plain dicts or attribute-style objects.
        if isinstance(message, dict):
            return message.get("role"), message.get("content")
        return getattr(message, "role", None), getattr(message, "content", None)

    fragments: list[str] = []
    for message in completion:
        role, content = _role_and_content(message)
        if role != "assistant" or not isinstance(content, str):
            continue
        stripped = content.strip()
        if stripped:
            fragments.append(stripped)
    return "\n\n".join(fragments).strip()
185
+
186
+
187
+ def _normalize_for_match(value: str) -> str:
188
+ text = unicodedata.normalize("NFKC", value).casefold()
189
+ text = re.sub(r"https?://\S+", " ", text)
190
+ text = re.sub(r"[^a-z0-9]+", " ", text)
191
+ return re.sub(r"\s+", " ", text).strip()
192
+
193
+
194
def _exact_answer_match(*, response: str, answer: str) -> bool:
    """Report whether ``response`` and ``answer`` are equal after normalization.

    Both strings go through ``_normalize_for_match``. If either normalizes to
    an empty string the result is ``False``, so two empty values never count
    as a match.
    """
    expected = _normalize_for_match(answer)
    actual = _normalize_for_match(response)
    return bool(expected) and bool(actual) and expected == actual
200
+
201
+
202
+ def _parse_judge_choice(content: str) -> float | None:
203
+ text = content.strip()
204
+ if not text:
205
+ return None
206
+ first_line = text.splitlines()[0].strip("`*_ \t")
207
+ upper = first_line.upper()
208
+ if re.match(r"^\[?INCORRECT\]?(?:[\s.):\]-]|$)", upper) or re.match(
209
+ r"^B(?:[\s.):\]-]|$)", upper
210
+ ):
211
+ return 0.0
212
+ if re.match(r"^\[?CORRECT\]?(?:[\s.):\]-]|$)", upper) or re.match(
213
+ r"^A(?:[\s.):\]-]|$)", upper
214
+ ):
215
+ return 1.0
216
+ return None
217
+
218
+
219
class RedSearcherTaskSet(SandboxTaskSet):
    """REDSearcher text RL deep-search taskset.

    Loads a Hugging Face dataset with ``problem``, ``answer`` and
    ``difficulty`` columns, optionally filters by difficulty, and exposes each
    row as a sandboxed search task whose final answer is written to
    ``answer_file``. Scoring is delegated to ``RedSearcherRubric``.
    """

    default_workdir = DEFAULT_WORKDIR

    def __init__(
        self,
        dataset_name: str = DEFAULT_DATASET_NAME,
        split: str = DEFAULT_SPLIT,
        difficulty: str | None = None,
        filter_fn: str | None = None,
        ds_keep_in_memory: bool | None = True,
        ds_num_proc: int | None = None,
        sandbox_image: str = DEFAULT_SANDBOX_IMAGE,
        sandbox_cpu_cores: int = 2,
        sandbox_memory_gb: int = 2,
        sandbox_disk_size_gb: int = 5,
        sandbox_timeout_minutes: int | None = None,
        answer_file: str = DEFAULT_ANSWER_FILE,
        judge_model: str = DEFAULT_JUDGE_MODEL,
        judge_base_url: str | None = DEFAULT_JUDGE_BASE_URL,
        judge_api_key_var: str = DEFAULT_JUDGE_API_KEY_VAR,
        judge_sampling_args: dict[str, Any] | None = None,
        judge_max_retries: int = 5,
        use_exact_match_shortcut: bool = True,
    ) -> None:
        """Configure the dataset source, sandbox resources and judge settings.

        Args:
            dataset_name: Hugging Face dataset name passed to ``load_dataset``.
            split: Dataset split to load.
            difficulty: ``None``/``"all"`` keeps every row; ``"easy"``,
                ``"medium"`` or ``"hard"`` keeps only rows with that value.
            filter_fn: Forwarded unchanged to the ``SandboxTaskSet`` base.
            ds_keep_in_memory: Forwarded to ``load_dataset``.
            ds_num_proc: Forwarded to ``load_dataset``.
            sandbox_image, sandbox_cpu_cores, sandbox_memory_gb,
            sandbox_disk_size_gb, sandbox_timeout_minutes: Used to build the
                ``SandboxSpec`` returned for every task.
            answer_file: Sandbox path the agent must write its answer to.
            judge_model, judge_base_url, judge_api_key_var,
            judge_sampling_args, judge_max_retries, use_exact_match_shortcut:
                Passed through to ``RedSearcherRubric``.

        Raises:
            ValueError: If ``difficulty`` is not one of the accepted values.
        """
        if difficulty not in {None, "all", "easy", "medium", "hard"}:
            raise ValueError(
                "difficulty must be one of None, 'all', 'easy', 'medium', or 'hard'"
            )
        self.dataset_name = dataset_name
        self.split = split
        self.difficulty = difficulty
        self.ds_keep_in_memory = ds_keep_in_memory
        self.ds_num_proc = ds_num_proc
        self.answer_file = answer_file
        self._sandbox_spec = SandboxSpec(
            image=sandbox_image,
            cpu_cores=sandbox_cpu_cores,
            memory_gb=sandbox_memory_gb,
            disk_size_gb=sandbox_disk_size_gb,
            timeout_minutes=sandbox_timeout_minutes,
        )
        self._judge_model = judge_model
        self._judge_base_url = judge_base_url
        self._judge_api_key_var = judge_api_key_var
        # Copy so later mutation of the caller's dict does not leak in.
        self._judge_sampling_args = dict(judge_sampling_args or {})
        self._judge_max_retries = judge_max_retries
        self._use_exact_match_shortcut = use_exact_match_shortcut
        label = difficulty or "all"
        # The bound method (not its result) is handed to the base class.
        # NOTE(review): presumably so the dataset is built lazily - confirm
        # against SandboxTaskSet.
        super().__init__(
            dataset=self._build_dataset,
            name=f"search/redsearcher/{label}",
            filter_fn=filter_fn,
        )

    def _build_dataset(self) -> Dataset:
        """Load the source split and convert it to taskset rows.

        Rows that fail the difficulty filter, or whose ``problem`` or
        ``answer`` is empty after stripping, are skipped. ``row_index`` in
        ``info`` is the position in the unfiltered source split.
        """
        raw = load_dataset(
            self.dataset_name,
            split=self.split,
            keep_in_memory=self.ds_keep_in_memory,
            num_proc=self.ds_num_proc,
        )
        keep_all = self.difficulty in {None, "all"}
        rows: list[dict[str, Any]] = []
        for idx, row in enumerate(raw):
            difficulty = str(row.get("difficulty") or "")
            if not keep_all and difficulty != self.difficulty:
                continue
            question = str(row.get("problem") or "").strip()
            answer = str(row.get("answer") or "").strip()
            if not question or not answer:
                continue
            rows.append(
                {
                    "question": question,
                    "answer": answer,
                    "info": {
                        "question": question,
                        "problem": question,
                        "answer": answer,
                        "difficulty": difficulty,
                        "dataset_name": self.dataset_name,
                        "split": self.split,
                        "row_index": idx,
                        "answer_file": self.answer_file,
                    },
                }
            )
        return Dataset.from_list(rows)

    def get_instruction(self, info: dict) -> str:
        """Return the agent prompt: the question plus answer-file instructions."""
        question = str(info.get("question") or "")
        return (
            f"{question}\n\n"
            "This is a REDSearcher long-horizon search task. Break the problem into search subgoals, "
            "cross-check the answer across sources, and synthesize a concise final response.\n\n"
            f"When you have the final response, write it to {self.answer_file} using a tool call, "
            "then stop. The task is incomplete unless that file exists. Include the requested answer "
            "and supporting URLs/citations in the file, but do not include scratch reasoning or tool traces."
        )

    def get_sandbox_spec(self, info: dict) -> SandboxSpec:
        """Return the single sandbox spec shared by every task."""
        return self._sandbox_spec

    def get_workdir(self, info: dict) -> str:
        """Return the working directory, which is the same for every task."""
        return self.default_workdir

    def get_env_vars(self) -> dict[str, str]:
        """Return host environment variables to forward (``SERPER_API_KEY`` if set)."""
        env_vars: dict[str, str] = {}
        for key in ("SERPER_API_KEY",):
            value = os.environ.get(key)
            if value:
                env_vars[key] = value
        return env_vars

    async def setup(self, state: vf.State) -> None:
        """Create the working directory and the answer file's directory.

        ``/task`` is always created, as before. The parent directory of
        ``answer_file`` is created as well, so a customized answer path
        outside ``/task`` is writable. Paths are shell-quoted; with the
        default settings the command is unchanged
        (``mkdir -p /workspace /task``).
        """
        import posixpath
        import shlex

        sandbox_client = state["sandbox_client"]
        sandbox_id = state["sandbox_id"]
        directories = [self.default_workdir, "/task"]
        answer_dir = posixpath.dirname(self.answer_file)
        if answer_dir and answer_dir not in directories:
            directories.append(answer_dir)
        quoted_dirs = " ".join(shlex.quote(path) for path in directories)
        await sandbox_client.execute_command(
            sandbox_id, f"mkdir -p {quoted_dirs}", timeout=10
        )

    def get_rubric(self) -> vf.Rubric:
        """Build a ``RedSearcherRubric`` from this taskset's judge settings."""
        return RedSearcherRubric(
            answer_file=self.answer_file,
            judge_model=self._judge_model,
            judge_base_url=self._judge_base_url,
            judge_api_key_var=self._judge_api_key_var,
            judge_sampling_args=self._judge_sampling_args,
            judge_max_retries=self._judge_max_retries,
            use_exact_match_shortcut=self._use_exact_match_shortcut,
        )
351
+
352
+
353
class RedSearcherRubric(vf.Rubric):
    """Scores REDSearcher answers against the released ground-truth label.

    The answer is read from ``answer_file`` in the sandbox, falling back to
    the assistant text in the completion. A normalized exact match scores
    ``1.0`` without a model call when the shortcut is enabled; otherwise an
    LLM judge returns a binary A/B verdict.
    """

    def __init__(
        self,
        *,
        answer_file: str = DEFAULT_ANSWER_FILE,
        judge_model: str = DEFAULT_JUDGE_MODEL,
        judge_base_url: str | None = DEFAULT_JUDGE_BASE_URL,
        judge_api_key_var: str = DEFAULT_JUDGE_API_KEY_VAR,
        judge_sampling_args: dict[str, Any] | None = None,
        judge_max_retries: int = 5,
        use_exact_match_shortcut: bool = True,
    ) -> None:
        """Store judge configuration and register ``answer_reward``.

        Args:
            answer_file: Sandbox path holding the agent's final answer.
            judge_model: Model name sent with each judge request.
            judge_base_url: Judge API base URL; a falsy value selects
                ``https://api.openai.com/v1``.
            judge_api_key_var: Name of the env var holding the API key.
            judge_sampling_args: Extra keyword arguments for the chat
                completion request (copied).
            judge_max_retries: Total judge attempts; values below 1 are
                treated as 1.
            use_exact_match_shortcut: Skip the judge on a normalized exact
                match.
        """
        super().__init__()
        self.answer_file = answer_file
        self.judge_model = judge_model
        self.judge_base_url = judge_base_url
        self.judge_api_key_var = judge_api_key_var
        self.judge_sampling_args = dict(judge_sampling_args or {})
        self.judge_max_retries = judge_max_retries
        self.use_exact_match_shortcut = use_exact_match_shortcut
        # Created on first use by _get_client and closed in teardown.
        self._client: AsyncOpenAI | None = None
        self.add_reward_func(self.answer_reward, weight=1.0)

    async def _answer_score_for_state(self, state: vf.State) -> float:
        """Score one state, converting ``vf.Error`` failures into a 0.0 score.

        A state that already carries an error is not scored; the error's repr
        is recorded under ``redsearcher_agent_error``. A ``vf.Error`` raised
        while scoring is stored on ``state["error"]``.
        """
        existing_error = state.get("error")
        if existing_error is not None:
            state["redsearcher_agent_error"] = repr(existing_error)
            return 0.0
        try:
            return await self.answer_reward(state)
        except vf.Error as exc:
            state["error"] = exc
            return 0.0

    async def score_rollout(self, state: vf.State) -> None:
        """Score one rollout and preserve judge failures as ``vf.Error`` values."""
        score = await self._answer_score_for_state(state)
        state["reward"] = score
        state["metrics"] = {"answer_reward": score}

    async def score_group(self, states: list[vf.State]) -> None:
        """Score rollouts while preserving judge failures as ``vf.Error`` values.

        All states are scored concurrently. Each state's advantage is its
        score minus the group mean; trajectory turns that are dicts receive
        the state's reward and advantage only where theirs is ``None``.
        """
        if not states:
            logger.warning("No states to score")
            return
        scores = await asyncio.gather(
            *(self._answer_score_for_state(state) for state in states)
        )
        avg_score = sum(scores) / len(scores)
        for state, score in zip(states, scores):
            state["reward"] = score
            state["advantage"] = score - avg_score
            for turn in state.get("trajectory", []):
                if isinstance(turn, dict):
                    if turn.get("advantage") is None:
                        turn["advantage"] = state["advantage"]
                    if turn.get("reward") is None:
                        turn["reward"] = state["reward"]
            state["metrics"] = {"answer_reward": score}

    async def answer_reward(self, state: vf.State, **_: Any) -> float:
        """Return 1.0 when the agent's answer matches the ground truth, else 0.0.

        Side effects on ``state``: ``redsearcher_answer``,
        ``redsearcher_answer_source``, and, depending on the path taken,
        ``redsearcher_eval_error``, ``redsearcher_ground_truth``,
        ``redsearcher_match_method`` and ``redsearcher_judge_result``.

        Raises:
            vf.SandboxError: No sandbox is attached or the file read failed.
            vf.InfraError: The task has no ground-truth answer.
            vf.Error: Any error raised by the judge call.
        """
        import shlex

        sandbox_client = state.get("sandbox_client")
        sandbox_id = state.get("sandbox_id")
        if not sandbox_client or not sandbox_id:
            raise vf.SandboxError("REDSearcher scoring requires a live sandbox")
        try:
            # Quote the path so spaces or shell metacharacters in a custom
            # answer_file cannot break or alter the command. `|| true` keeps
            # a missing file from being reported as a command failure.
            result = await sandbox_client.execute_command(
                sandbox_id,
                f"cat {shlex.quote(self.answer_file)} 2>/dev/null || true",
                working_dir=None,
            )
        except Exception as exc:
            raise vf.SandboxError(
                f"Failed to read REDSearcher answer file {self.answer_file}"
            ) from exc
        response = (result.stdout or "").strip()
        answer_source = "answer_file"
        if not response:
            response = _completion_text(state)
            answer_source = "completion_fallback" if response else "missing"
        state["redsearcher_answer"] = response
        state["redsearcher_answer_source"] = answer_source
        if not response:
            state["redsearcher_eval_error"] = "empty_answer"
            return 0.0
        info = state.get("info") or {}
        question = str(info.get("question") or info.get("problem") or "")
        answer = str(state.get("answer") or info.get("answer") or "").strip()
        if not answer:
            raise vf.InfraError(
                "REDSearcher task is missing ground-truth answer metadata"
            )
        state["redsearcher_ground_truth"] = answer
        if self.use_exact_match_shortcut and _exact_answer_match(
            response=response, answer=answer
        ):
            state["redsearcher_match_method"] = "exact_match"
            state["redsearcher_judge_result"] = {
                "correct": "yes",
                "accuracy": 1.0,
                "reasoning": "Exact normalized answer match.",
            }
            return 1.0
        score = await self._judge_answer(
            question=question,
            response=response,
            answer=answer,
            state=state,
        )
        state["redsearcher_match_method"] = "llm_judge"
        return score

    async def _judge_answer(
        self,
        *,
        question: str,
        response: str,
        answer: str,
        state: vf.State,
    ) -> float:
        """Ask the judge model for an A/B verdict and return it as 1.0/0.0.

        Up to ``max(1, judge_max_retries)`` attempts are made. An attempt is
        retried after a transient request failure or an unparseable reply.
        Non-transient failures, and a transient failure on the last attempt,
        are re-raised through ``_raise_redsearcher_judge_error``.

        Raises:
            vf.InvalidModelResponseError: No attempt produced a parseable
                verdict.
        """
        prompt = _JUDGE_PROMPT.format(
            question=question,
            response=response,
            answer=answer,
        )
        client = self._get_client()
        request_kwargs = dict(self.judge_sampling_args)
        last_content = ""
        max_attempts = max(1, self.judge_max_retries)
        for attempt in range(max_attempts):
            try:
                judge_response = await client.chat.completions.create(
                    model=self.judge_model,
                    messages=[{"role": "user", "content": prompt}],
                    **request_kwargs,
                )
            except _REDSEARCHER_JUDGE_ERROR_TYPES as exc:
                if isinstance(exc, _REDSEARCHER_TRANSIENT_JUDGE_ERROR_TYPES) and (
                    attempt + 1 < max_attempts
                ):
                    logger.warning(
                        "REDSearcher judge transient request failed on attempt %s/%s: %r",
                        attempt + 1,
                        max_attempts,
                        exc,
                    )
                    continue
                _raise_redsearcher_judge_error(exc, model=self.judge_model)
            choices = getattr(judge_response, "choices", None)
            if choices is None or len(choices) != 1:
                last_content = (
                    f"invalid choice count: {0 if choices is None else len(choices)}"
                )
            else:
                # Treat a missing message or content as an empty reply so it
                # takes the normal retry path instead of raising
                # AttributeError.
                message = getattr(choices[0], "message", None)
                last_content = getattr(message, "content", None) or ""
                parsed = _parse_judge_choice(last_content)
                if parsed is not None:
                    state["redsearcher_judge_response"] = last_content
                    state["redsearcher_judge_result"] = {
                        "correct": "yes" if parsed == 1.0 else "no",
                        "accuracy": parsed,
                    }
                    return parsed
            logger.warning(
                "Failed to parse REDSearcher judge response on attempt %s/%s: %r",
                attempt + 1,
                max_attempts,
                last_content[:200],
            )
        raise vf.InvalidModelResponseError(
            f"REDSearcher judge response was not parseable as A/B: {last_content!r}"
        )

    def _get_client(self) -> AsyncOpenAI:
        """Return the cached judge client, creating it on first use."""
        if self._client is not None:
            return self._client
        api_base_url = self.judge_base_url or "https://api.openai.com/v1"
        self._client = setup_openai_client(
            ClientConfig(
                api_key_var=self.judge_api_key_var,
                api_base_url=api_base_url,
                timeout=1200.0,
            )
        )
        return self._client

    @vf.cleanup
    async def cleanup(self, state: vf.State) -> None:
        """Delete the rollout's sandbox; failures are logged, never raised."""
        sandbox_client = state.get("sandbox_client")
        sandbox_id = state.get("sandbox_id")
        if sandbox_client and sandbox_id:
            try:
                await sandbox_client.delete(sandbox_id)
            except Exception:
                # Deletion stays best-effort, but leave a trace so leaked
                # sandboxes can be diagnosed.
                logger.warning(
                    "Failed to delete REDSearcher sandbox %s",
                    sandbox_id,
                    exc_info=True,
                )

    @vf.teardown
    async def teardown(self) -> None:
        """Close the judge client, if one was created."""
        if self._client is not None:
            try:
                await self._client.close()
            finally:
                # Drop the reference even if close() raised, so a later
                # _get_client call builds a fresh client.
                self._client = None
@@ -10,6 +10,7 @@ def make_search_taskset(backend: str = "quest", **kwargs: Any) -> TaskSet:
10
10
  factories = {
11
11
  "openseeker": make_openseeker_taskset,
12
12
  "quest": make_quest_taskset,
13
+ "redsearcher": make_redsearcher_taskset,
13
14
  }
14
15
  if backend not in factories:
15
16
  raise ValueError(
@@ -34,3 +35,12 @@ def make_openseeker_taskset(**kwargs: Any) -> TaskSet:
34
35
  )
35
36
 
36
37
  return OpenSeekerTaskSet(**kwargs)
38
+
39
+
40
def make_redsearcher_taskset(**kwargs: Any) -> TaskSet:
    """REDSearcher RL query-set deep-search TaskSet.

    All keyword arguments are forwarded unchanged to ``RedSearcherTaskSet``.
    The taskset module is imported inside the function body, not when this
    module is loaded.
    """
    from verifiers.envs.experimental.composable.tasksets.search.redsearcher import (
        RedSearcherTaskSet as taskset_cls,
    )

    return taskset_cls(**kwargs)
@@ -1,9 +0,0 @@
1
- """Composable search/research tasksets."""
2
-
3
- from .search_tasksets import (
4
- make_openseeker_taskset,
5
- make_quest_taskset,
6
- make_search_taskset,
7
- )
8
-
9
- __all__ = ["make_openseeker_taskset", "make_quest_taskset", "make_search_taskset"]