verifiers 0.1.15.dev175__tar.gz → 0.1.15.dev177__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/PKG-INFO +2 -2
  2. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/pyproject.toml +1 -1
  3. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/README.md +20 -8
  4. verifiers-0.1.15.dev177/verifiers/envs/experimental/composable/tasksets/search/quest/open_ended.py +329 -0
  5. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/taskset.py +119 -23
  6. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/harness.py +3 -1
  7. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/endpoint_utils.py +6 -1
  8. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/.gitignore +0 -0
  9. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/LICENSE +0 -0
  10. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/README.md +0 -0
  11. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/AGENTS.md +0 -0
  12. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/README.md +0 -0
  13. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/__init__.py +0 -0
  14. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/conftest.py +0 -0
  15. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_browser_env.py +0 -0
  16. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_build_script.py +0 -0
  17. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_cli_agent_env.py +0 -0
  18. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_client_auth_errors.py +0 -0
  19. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_client_config.py +0 -0
  20. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_client_multimodal_types.py +0 -0
  21. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_composable_env.py +0 -0
  22. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_context_token_metrics.py +0 -0
  23. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_decorator_ranks.py +0 -0
  24. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_endpoint_registry.py +0 -0
  25. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_env_group.py +0 -0
  26. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_env_server.py +0 -0
  27. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_environment.py +0 -0
  28. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_environment_extra.py +0 -0
  29. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_envs.py +0 -0
  30. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_error_chain.py +0 -0
  31. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_eval_cli.py +0 -0
  32. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_eval_display.py +0 -0
  33. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_eval_utils.py +0 -0
  34. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_gepa_cli.py +0 -0
  35. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_gepa_utils.py +0 -0
  36. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_gym_env.py +0 -0
  37. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_harbor_env_mcp.py +0 -0
  38. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_imports.py +0 -0
  39. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_init_script.py +0 -0
  40. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_install_utils.py +0 -0
  41. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_interception_utils.py +0 -0
  42. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
  43. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_lean_task.py +0 -0
  44. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_logging.py +0 -0
  45. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_math_rubric.py +0 -0
  46. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_maybe_think_parser.py +0 -0
  47. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_mcp_search_env.py +0 -0
  48. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_message_utils.py +0 -0
  49. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_message_utils_multimodal.py +0 -0
  50. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_multiturn_env.py +0 -0
  51. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_nemorl_client.py +0 -0
  52. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_openai_chat_completions_token_client.py +0 -0
  53. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_openai_responses_client.py +0 -0
  54. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_opencode_harbor.py +0 -0
  55. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_opencode_rlm_env.py +0 -0
  56. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_openenv_client.py +0 -0
  57. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_parser.py +0 -0
  58. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_path_utils.py +0 -0
  59. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_per_turn_timing.py +0 -0
  60. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_pricing_utils.py +0 -0
  61. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_prime_plugin.py +0 -0
  62. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_renderer_client.py +0 -0
  63. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_renderer_e2e.py +0 -0
  64. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_rlm_composable_env.py +0 -0
  65. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_rubric.py +0 -0
  66. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_rubric_group.py +0 -0
  67. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_sandbox_env.py +0 -0
  68. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_sandbox_mixin.py +0 -0
  69. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_save_utils.py +0 -0
  70. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_setup_script.py +0 -0
  71. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_singleturn_env.py +0 -0
  72. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_stateful_tool_env.py +0 -0
  73. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_think_parser.py +0 -0
  74. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_tool_env.py +0 -0
  75. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_tool_utils.py +0 -0
  76. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_trajectory_processing.py +0 -0
  77. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_tui_info_formatting.py +0 -0
  78. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_types.py +0 -0
  79. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_bfcl.py +0 -0
  80. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_config_extension.py +0 -0
  81. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_empty_completions.py +0 -0
  82. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_endpoint_protocols.py +0 -0
  83. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_example_counts.py +0 -0
  84. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_group_reward_env.py +0 -0
  85. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_harbor_cli.py +0 -0
  86. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_mini_swe_agent.py +0 -0
  87. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_nemo_gym_harness.py +0 -0
  88. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_openenv_taskset.py +0 -0
  89. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_openreward_taskset.py +0 -0
  90. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_replay_harness.py +0 -0
  91. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_rlm_swe.py +0 -0
  92. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_runtime_lifecycle.py +0 -0
  93. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_scoring_functions.py +0 -0
  94. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_taskset_bindings.py +0 -0
  95. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_taskset_utils.py +0 -0
  96. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_v1_textarena_taskset.py +0 -0
  97. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_wiki_search_v1.py +0 -0
  98. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_wordle_env.py +0 -0
  99. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_wordle_v1_env.py +0 -0
  100. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/tests/test_xml_parser.py +0 -0
  101. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/AGENTS.md +0 -0
  102. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/__init__.py +0 -0
  103. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/__init__.py +0 -0
  104. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/commands/__init__.py +0 -0
  105. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/commands/build.py +0 -0
  106. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/commands/eval.py +0 -0
  107. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/commands/gepa.py +0 -0
  108. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/commands/init.py +0 -0
  109. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/commands/install.py +0 -0
  110. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/commands/setup.py +0 -0
  111. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/plugins/__init__.py +0 -0
  112. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/plugins/prime.py +0 -0
  113. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/cli/tui.py +0 -0
  114. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/clients/__init__.py +0 -0
  115. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/clients/anthropic_messages_client.py +0 -0
  116. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/clients/client.py +0 -0
  117. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
  118. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/clients/openai_chat_completions_client.py +0 -0
  119. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
  120. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/clients/openai_completions_client.py +0 -0
  121. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/clients/openai_responses_client.py +0 -0
  122. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/clients/renderer_client.py +0 -0
  123. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/decorators.py +0 -0
  124. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/AGENTS.md +0 -0
  125. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/__init__.py +0 -0
  126. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/env_group.py +0 -0
  127. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/environment.py +0 -0
  128. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/README.md +0 -0
  129. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/__init__.py +0 -0
  130. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/cli_agent_env.py +0 -0
  131. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/README.md +0 -0
  132. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/__init__.py +0 -0
  133. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/_filter.py +0 -0
  134. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/composable_env.py +0 -0
  135. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/harness.py +0 -0
  136. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
  137. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
  138. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
  139. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  140. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
  141. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
  142. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/task.py +0 -0
  143. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  144. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  145. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
  146. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  147. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  148. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  149. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  150. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
  151. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  152. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
  153. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/README.md +0 -0
  154. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/__init__.py +0 -0
  155. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/openseeker/README.md +0 -0
  156. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/openseeker/__init__.py +0 -0
  157. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/openseeker/taskset.py +0 -0
  158. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/__init__.py +0 -0
  159. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/__init__.py +0 -0
  160. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/api_tools/__init__.py +0 -0
  161. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/api_tools/tool_pdf.py +0 -0
  162. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/eval_toolkit.py +0 -0
  163. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/evaluator.py +0 -0
  164. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/llm_client/__init__.py +0 -0
  165. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/llm_client/base_client.py +0 -0
  166. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/prompts/__init__.py +0 -0
  167. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/prompts/cache_prompts.py +0 -0
  168. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/__init__.py +0 -0
  169. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/cache_filesys.py +0 -0
  170. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/load_eval_script.py +0 -0
  171. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/misc.py +0 -0
  172. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/tool_visit.py +0 -0
  173. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/url_tools.py +0 -0
  174. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/verification_tree.py +0 -0
  175. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/redsearcher/README.md +0 -0
  176. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/redsearcher/__init__.py +0 -0
  177. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/redsearcher/taskset.py +0 -0
  178. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/search/search_tasksets.py +0 -0
  179. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/README.md +0 -0
  180. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  181. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/__init__.py +0 -0
  182. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/extract_fix_patch.sh +0 -0
  183. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/taskset.py +0 -0
  184. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/openswe/__init__.py +0 -0
  185. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/openswe/taskset.py +0 -0
  186. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/__init__.py +0 -0
  187. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/log_parser.py +0 -0
  188. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/taskset.py +0 -0
  189. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/scale_swe/__init__.py +0 -0
  190. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/scale_swe/taskset.py +0 -0
  191. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/shared/__init__.py +0 -0
  192. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/shared/test_patch.py +0 -0
  193. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench/__init__.py +0 -0
  194. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench/taskset.py +0 -0
  195. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego/__init__.py +0 -0
  196. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego/taskset.py +0 -0
  197. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/__init__.py +0 -0
  198. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/log_parsers.py +0 -0
  199. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/taskset.py +0 -0
  200. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith/__init__.py +0 -0
  201. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith/taskset.py +0 -0
  202. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
  203. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/gym_env.py +0 -0
  204. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
  205. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/harbor_env/env.py +0 -0
  206. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
  207. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/mcp_env.py +0 -0
  208. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/opencode_env.py +0 -0
  209. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  210. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
  211. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  212. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/utils/__init__.py +0 -0
  213. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/utils/file_locks.py +0 -0
  214. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
  215. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/README.md +0 -0
  216. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/__init__.py +0 -0
  217. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/browser_env/README.md +0 -0
  218. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  219. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
  220. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  221. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  222. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
  223. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
  224. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/openenv_env.py +0 -0
  225. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  226. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/integrations/textarena_env.py +0 -0
  227. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/multiturn_env.py +0 -0
  228. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/python_env.py +0 -0
  229. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/sandbox_env.py +0 -0
  230. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/singleturn_env.py +0 -0
  231. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/stateful_tool_env.py +0 -0
  232. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/envs/tool_env.py +0 -0
  233. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/errors.py +0 -0
  234. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/gepa/__init__.py +0 -0
  235. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/gepa/adapter.py +0 -0
  236. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/gepa/config.py +0 -0
  237. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/gepa/display.py +0 -0
  238. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/gepa/gepa_utils.py +0 -0
  239. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/parsers/__init__.py +0 -0
  240. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/parsers/maybe_think_parser.py +0 -0
  241. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/parsers/parser.py +0 -0
  242. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/parsers/think_parser.py +0 -0
  243. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/parsers/xml_parser.py +0 -0
  244. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/README.md +0 -0
  245. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/__init__.py +0 -0
  246. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/inference/__init__.py +0 -0
  247. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/inference/client.py +0 -0
  248. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/inference/server.py +0 -0
  249. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/trainer/__init__.py +0 -0
  250. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/trainer/config.py +0 -0
  251. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/trainer/orchestrator.py +0 -0
  252. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/trainer/trainer.py +0 -0
  253. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rl/trainer/utils.py +0 -0
  254. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rubrics/__init__.py +0 -0
  255. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  256. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rubrics/judge_rubric.py +0 -0
  257. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rubrics/math_rubric.py +0 -0
  258. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rubrics/rubric.py +0 -0
  259. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/rubrics/rubric_group.py +0 -0
  260. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/__init__.py +0 -0
  261. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/build.py +0 -0
  262. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/eval.py +0 -0
  263. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/gepa.py +0 -0
  264. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/init.py +0 -0
  265. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/install.py +0 -0
  266. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/rl.py +0 -0
  267. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/setup.py +0 -0
  268. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/train.py +0 -0
  269. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/tui.py +0 -0
  270. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/scripts/vllm.py +0 -0
  271. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/serve/__init__.py +0 -0
  272. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/serve/client/env_client.py +0 -0
  273. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/serve/client/zmq_env_client.py +0 -0
  274. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/serve/server/__init__.py +0 -0
  275. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/serve/server/env_router.py +0 -0
  276. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/serve/server/env_server.py +0 -0
  277. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/serve/server/env_worker.py +0 -0
  278. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/serve/server/zmq_env_server.py +0 -0
  279. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/serve/types.py +0 -0
  280. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/types.py +0 -0
  281. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/__init__.py +0 -0
  282. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/async_utils.py +0 -0
  283. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/client_utils.py +0 -0
  284. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/config_utils.py +0 -0
  285. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/data_utils.py +0 -0
  286. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/display_utils.py +0 -0
  287. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/env_config_utils.py +0 -0
  288. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/env_utils.py +0 -0
  289. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/error_utils.py +0 -0
  290. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/eval_display.py +0 -0
  291. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/eval_utils.py +0 -0
  292. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/heartbeat.py +0 -0
  293. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/import_utils.py +0 -0
  294. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/install_utils.py +0 -0
  295. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/interception_utils.py +0 -0
  296. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/logging_utils.py +0 -0
  297. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/message_utils.py +0 -0
  298. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/metric_utils.py +0 -0
  299. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/path_utils.py +0 -0
  300. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/pricing_utils.py +0 -0
  301. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/process_utils.py +0 -0
  302. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/response_utils.py +0 -0
  303. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/save_utils.py +0 -0
  304. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/serve_utils.py +0 -0
  305. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/thread_utils.py +0 -0
  306. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/threaded_sandbox_client.py +0 -0
  307. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/tool_utils.py +0 -0
  308. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/usage_utils.py +0 -0
  309. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/utils/version_utils.py +0 -0
  310. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +0 -0
  311. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/README.md +0 -0
  312. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/RE_MIGRATION.md +0 -0
  313. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/__init__.py +0 -0
  314. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/artifact.py +0 -0
  315. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/config.py +0 -0
  316. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/env.py +0 -0
  317. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/model.py +0 -0
  318. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/program.py +0 -0
  319. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/runtime.py +0 -0
  320. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/runtime_handles.py +0 -0
  321. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/sandbox.py +0 -0
  322. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/state.py +0 -0
  323. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/task.py +0 -0
  324. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/taskset.py +0 -0
  325. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/toolset.py +0 -0
  326. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/types.py +0 -0
  327. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/user.py +0 -0
  328. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/__init__.py +0 -0
  329. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/binding_utils.py +0 -0
  330. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/config_callable_utils.py +0 -0
  331. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/config_utils.py +0 -0
  332. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/json_utils.py +0 -0
  333. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/judge_utils.py +0 -0
  334. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/lifecycle_utils.py +0 -0
  335. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/logging_utils.py +0 -0
  336. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
  337. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/mcp_utils.py +0 -0
  338. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/object_utils.py +0 -0
  339. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/program_utils.py +0 -0
  340. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/prompt_utils.py +0 -0
  341. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/runtime_owner_utils.py +0 -0
  342. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/runtime_registry.py +0 -0
  343. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
  344. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/sandbox_python_utils.py +0 -0
  345. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/sandbox_utils.py +0 -0
  346. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/scoring_utils.py +0 -0
  347. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/serialization_utils.py +0 -0
  348. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/task_freeze_utils.py +0 -0
  349. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/taskset_utils.py +0 -0
  350. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/tool_utils.py +0 -0
  351. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/toolset_utils.py +0 -0
  352. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/trajectory_utils.py +0 -0
  353. {verifiers-0.1.15.dev175 → verifiers-0.1.15.dev177}/verifiers/v1/utils/usage_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.15.dev175
3
+ Version: 0.1.15.dev177
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -40,7 +40,7 @@ Requires-Dist: openai>=1.108.1
40
40
  Requires-Dist: pillow
41
41
  Requires-Dist: prime-pydantic-config[toml]
42
42
  Requires-Dist: prime-sandboxes>=0.2.25
43
- Requires-Dist: prime-tunnel>=0.1.6
43
+ Requires-Dist: prime-tunnel>=0.1.8
44
44
  Requires-Dist: pydantic>=2.11.9
45
45
  Requires-Dist: pymupdf
46
46
  Requires-Dist: pyzmq>=27.1.0
@@ -37,7 +37,7 @@ dependencies = [
37
37
  "nest-asyncio>=1.6.0", # for jupyter notebooks
38
38
  "openai>=1.108.1",
39
39
  "openai-agents>=0.0.7",
40
- "prime-tunnel>=0.1.6",
40
+ "prime-tunnel>=0.1.8",
41
41
  "prime-sandboxes>=0.2.25",
42
42
  "pydantic>=2.11.9",
43
43
  "requests",
@@ -1,13 +1,13 @@
1
1
  # QUEST Search Taskset
2
2
 
3
- Objective QUEST tasks ported into the composable search taskset framework.
3
+ QUEST tasks ported into the composable search taskset framework.
4
4
 
5
5
  ## Source
6
6
 
7
7
  - Dataset: [`osunlp/QUEST-RL-Data`](https://huggingface.co/datasets/osunlp/QUEST-RL-Data)
8
8
  - Upstream project: [`OSU-NLP-Group/QUEST`](https://github.com/OSU-NLP-Group/QUEST)
9
9
 
10
- The taskset loads the Hugging Face dataset, filters to `rl_task_category == "objective"` by default, and uses the dataset-provided generated evaluation scripts under `eval_scripts/*.py`.
10
+ The taskset loads the Hugging Face dataset and filters by `rl_task_category`. Objective tasks use the dataset-provided generated evaluation scripts under `eval_scripts/*.py`. Open-ended tasks use the dataset-provided reference answer and rubric criteria.
11
11
 
12
12
  ## Task Contract
13
13
 
@@ -17,25 +17,37 @@ The paired `rlm_search` environment prompts RLM to write this file and provides
17
17
 
18
18
  ## Scoring
19
19
 
20
- `QuestRubric` loads the generated eval script for the example's `task_id` and calls its async `evaluate_answer(...)` entrypoint using the vendored minimal `obj_task_eval` runtime. The rollout reward is `summary["final_score"]`, clipped to `[0.0, 1.0]`.
20
+ ### Objective
21
+
22
+ For objective tasks, `QuestRubric` loads the generated eval script for the example's `task_id` and calls its async `evaluate_answer(...)` entrypoint using the vendored minimal `obj_task_eval` runtime. The rollout reward is `summary["final_score"]`, clipped to `[0.0, 1.0]`.
21
23
 
22
24
  Generated scripts may request URL-backed verification. PDF URLs are detected and parsed with the upstream QUEST PDF parser path before falling back to generic webpage retrieval.
23
25
 
24
26
  This port intentionally preserves upstream QUEST behavior for URL-backed verification semantics. The upstream verifier generally treats invalid, irrelevant, or inaccessible cited webpages as unsupported claims, which can assign `0.0` to the affected verification node even when the immediate cause is source access such as a bot challenge, rate limit, timeout, or parser failure. Future work should consider a finer-grained source-access taxonomy so verifier infrastructure limitations can be distinguished from model-provided bad URLs or unsupported claims.
25
27
 
26
- A reward of `0.0` with no `state["error"]` means the QUEST evaluator ran and judged the answer incorrect under the upstream-compatible scoring path. Infrastructure and evaluator failures outside normal QUEST source verification are represented with `vf.Error` subclasses instead of ad hoc success metrics.
28
+ ### Open-ended
29
+
30
+ For open-ended tasks, `QuestRubric` evaluates each rubric criterion independently. Each judge call compares the candidate answer against the dataset reference answer and returns scores for both documents on the criterion. The expected nominal judge-call count is the number of rubric criteria in the example, typically about 31 calls.
31
+
32
+ The summary stores both scoring views:
33
+
34
+ - `upstream_pairwise_score`: upstream QUEST's `total_score_a / (total_score_a + total_score_b)` comparison value.
35
+ - `raw_reference_ratio`: raw `total_score_a / total_score_b` candidate-vs-reference score.
36
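+ - `final_score`: Verifiers reward, `raw_reference_ratio / 0.9` clipped to `[0.0, 1.0]`, so near-reference-quality answers can receive reward `1.0` despite noisy continuous criterion judging. If the reference total `total_score_b` is `0`, the reward instead falls back to `total_score_a / 10` clipped to `[0.0, 1.0]`.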
27
37
 
28
38
  ## Error Handling
29
39
 
40
+ A reward of `0.0` with no `state["error"]` means the QUEST evaluator ran and judged the answer incorrect or insufficient under the selected scoring path. Infrastructure and evaluator failures outside normal QUEST source verification are represented with `vf.Error` subclasses instead of ad hoc success metrics.
41
+
30
42
  QUEST uses Verifiers' framework-managed error field for non-answer failures when the failure comes from external runtime systems:
31
43
 
32
44
  - Missing live sandbox or answer-file read failure: `vf.SandboxError`.
33
45
  - Transient judge provider/network/rate-limit/server failures: retryable `vf.InfraError`.
34
46
  - Empty or invalid judge responses: retryable `vf.InvalidModelResponseError` / `vf.EmptyModelResponseError`.
35
47
  - Judge auth, model-not-found, content-filter, or invalid request failures: non-retryable `vf.ModelError`.
36
- - QUEST eval-script download/cache resolution failure: `vf.InfraError`.
48
+ - QUEST objective eval-script download/cache resolution failure: `vf.InfraError`.
37
49
 
38
- Wrong answers, empty answers, and inaccessible or irrelevant cited sources remain ordinary scored outcomes and return `0.0` without setting `state["error"]`. Generated eval-script source errors, missing task metadata, missing eval-script files, import/load failures, and unexpected evaluator runtime bugs are not converted to `vf.Error`; they raise normally so broken evaluator code fails hard.
50
+ Wrong answers, empty answers, and inaccessible or irrelevant cited sources remain ordinary scored outcomes and return `0.0` without setting `state["error"]`. Generated objective eval-script source errors, missing task metadata, missing eval-script files, import/load failures, and unexpected evaluator runtime bugs are not converted to `vf.Error`; they raise normally so broken evaluator code fails hard.
39
51
 
40
52
  ## Common Arguments
41
53
 
@@ -43,10 +55,10 @@ Wrong answers, empty answers, and inaccessible or irrelevant cited sources remai
43
55
  |---|---:|---|
44
56
  | `dataset_name` | `osunlp/QUEST-RL-Data` | Hugging Face dataset name. |
45
57
  | `split` | `train` | Dataset split. |
46
- | `category` | `objective` | Initial implementation supports objective tasks only. |
58
+ | `category` | `objective` | QUEST category: `objective`, `open-ended`, or `all`. |
47
59
  | `answer_file` | `/task/answer.txt` | Final answer path in the sandbox. |
48
60
  | `judge_model` | `openai/gpt-5.4-mini` | OpenAI-compatible model for QUEST verifier calls. |
49
61
  | `judge_base_url` | `https://api.pinference.ai/api/v1` | Judge API base URL. |
50
62
  | `judge_api_key_var` | `PRIME_API_KEY` | Env var containing the judge API key. |
51
- | `quest_eval_scripts_dir` | HF cache | Optional local directory containing `eval_scripts/*.py`. |
63
+ | `quest_eval_scripts_dir` | HF cache | Optional local directory containing `eval_scripts/*.py` for objective tasks. |
52
64
  | `quest_cache_dir` | `~/.cache/verifiers/quest` | Host cache for QUEST verifier state. |
@@ -0,0 +1,329 @@
1
+ """QUEST open-ended rubric scoring."""
2
+
3
+ import asyncio
4
+ import math
5
+ from typing import Any, Protocol
6
+
7
+ from pydantic import BaseModel
8
+
9
+
10
+ OPEN_ENDED_SYSTEM_PROMPT = """You are an expert evaluator tasked with scoring two documents (both presenting research findings in response to the user's query) on specific rubric criteria. Your evaluation must be precise, objective, and based solely on the evidence present in both documents.
11
+
12
+ ## Evaluation Framework
13
+ For each criterion, score both documents on a scale of 0-10 (continuous values). The score should reflect the quality of performance on that criterion:
14
+ * 0-2 points: Very poor performance. Almost completely fails to meet the criterion requirements.
15
+ * 2-4 points: Poor performance. Minimally meets the criterion requirements with significant deficiencies.
16
+ * 4-6 points: Average performance. Basically meets the criterion requirements, neither good nor bad.
17
+ * 6-8 points: Good performance. Largely meets the criterion requirements with notable strengths.
18
+ * 8-10 points: Excellent/outstanding performance. Fully meets or exceeds the criterion requirements.
19
+
20
+ ## Evaluation Process
21
+ 1. **Understand the Criterion**: Carefully read and interpret what the rubric is asking for.
22
+ 2. **Search for Evidence**: Systematically review both documents for relevant content that addresses the criterion.
23
+ 3. **Score Each Document**: Evaluate how each document performs against the criterion and assign a score from 0-10.
24
+ 4. **Provide Reasoning**: Explain your evaluation with specific references to both documents.
25
+
26
+ ## Important Guidelines
27
+ - Base your evaluation ONLY on what is explicitly present in both documents
28
+ - Do not make assumptions about implied or missing content
29
+ - Consider the quality, completeness, and relevance of the evidence in both documents
30
+ - Be consistent in your evaluation standards across all criteria
31
+ - Provide specific examples from both documents to support your scores"""
32
+
33
+
34
+ OPEN_ENDED_REFERENCE_QUALITY_RATIO = 0.9
35
+
36
+
37
+ OPEN_ENDED_USER_PROMPT = """## Document A (Content to Evaluate)
38
+ {document_content}
39
+
40
+ ## Document B (Reference Content)
41
+ {ref_content}
42
+
43
+ ## Original Query
44
+ {query}
45
+
46
+ ## Rubric Criterion to Evaluate
47
+ **Rubric**: {rubric_title}
48
+ **Category**: {rubric_category}
49
+ **Explanation**: {rubric_explanation}
50
+
51
+ ## Your Task
52
+ Score both Document A (content to evaluate) and Document B (reference content) on this specific rubric criterion using the 0-10 scoring scale provided in the evaluation framework.
53
+
54
+ Return a JSON object with these fields:
55
+ - reason: Detailed explanation with specific evidence from both documents evaluating their performance against the rubric.
56
+ - score_a: The score for Document A (content to evaluate), from 0 to 10.
57
+ - score_b: The score for Document B (reference content), from 0 to 10.
58
+ - confidence: Confidence from 0.0 to 1.0."""
59
+
60
+
61
class OpenEndedJudgeClient(Protocol):
    """Structural type for the judge client consumed by open-ended scoring.

    Any object that exposes a matching ``async_response`` coroutine
    satisfies this protocol; no inheritance is required.
    """

    async def async_response(self, *, count_token: bool = False, **kwargs: Any) -> Any:
        """Issue one judge request (OpenAI-compatible chat endpoint) and return its response."""
        ...
66
+
67
+
68
class OpenEndedCriterionJudgment(BaseModel):
    """Structured response for one open-ended QUEST criterion.

    Passed as ``response_format`` to the judge client, so the field names
    mirror the JSON fields requested in the user prompt.
    """

    # Judge's written justification for the two scores.
    reason: str
    # Score for Document A (the candidate answer); the prompt asks for 0-10.
    score_a: float
    # Score for Document B (the reference answer); the prompt asks for 0-10.
    score_b: float
    # Judge confidence; defaults to 1.0 when the judge omits it.
    confidence: float = 1.0
75
+
76
+
77
class OpenEndedCriterionScore(BaseModel):
    """Normalized score record for one open-ended criterion.

    Built by ``_score_one_criterion`` after the judge's raw values have been
    clamped; dumped with ``model_dump()`` into the summary's ``evaluations``.
    """

    # Criterion title taken from the rubric entry's "criterion" key.
    criterion_name: str
    # Name of the rubric dimension this criterion belongs to.
    category: str
    # Non-negative criterion weight used when averaging within a dimension.
    weight: float
    # Judge's justification, copied through unchanged.
    reason: str
    # Candidate score, clamped to [0, 10].
    score_a: float
    # Reference score, clamped to [0, 10].
    score_b: float
    # Judge confidence, clamped to [0, 1].
    confidence: float
87
+
88
+
89
+ def _finite_clamped(value: Any, *, lower: float, upper: float, default: float) -> float:
90
+ try:
91
+ numeric = float(value)
92
+ except (TypeError, ValueError):
93
+ return default
94
+ if not math.isfinite(numeric):
95
+ return default
96
+ return min(upper, max(lower, numeric))
97
+
98
+
99
+ def _extract_answer_content(text: str) -> str:
100
+ text = (text or "").strip()
101
+ if not text:
102
+ return ""
103
+ if "<answer>" not in text:
104
+ return text
105
+ start = text.find("<answer>") + len("<answer>")
106
+ end = text.find("</answer>")
107
+ if end == -1:
108
+ return text[start:].strip()
109
+ return text[start:end].strip()
110
+
111
+
112
+ def _criteria_items(criteria_list: Any) -> list[dict[str, Any]]:
113
+ if criteria_list is None:
114
+ return []
115
+ if hasattr(criteria_list, "tolist"):
116
+ criteria_list = criteria_list.tolist()
117
+ if isinstance(criteria_list, tuple):
118
+ criteria_list = list(criteria_list)
119
+ if not isinstance(criteria_list, list):
120
+ return []
121
+ return [item for item in criteria_list if isinstance(item, dict)]
122
+
123
+
124
async def _score_one_criterion(
    *,
    client: OpenEndedJudgeClient,
    model: str,
    semaphore: asyncio.Semaphore,
    document_content: str,
    ref_content: str,
    query: str,
    dimension: str,
    criterion_data: dict[str, Any],
) -> OpenEndedCriterionScore:
    """Judge one rubric criterion and return its normalized score record.

    The candidate (``document_content``) and the reference (``ref_content``)
    are scored together in a single judge call, made while holding
    ``semaphore``. The judge's scores are clamped to ``[0, 10]`` and its
    confidence to ``[0, 1]``; unusable values become ``0.0``.
    """
    title = str(criterion_data.get("criterion") or "")
    rationale = str(criterion_data.get("explanation") or "")
    # A missing or unusable weight counts as 1.0; negative weights clamp to 0.
    weight = _finite_clamped(
        criterion_data.get("weight", 1.0), lower=0.0, upper=float("inf"), default=1.0
    )
    user_prompt = OPEN_ENDED_USER_PROMPT.format(
        document_content=document_content,
        ref_content=ref_content,
        query=query,
        rubric_title=title,
        rubric_category=dimension,
        rubric_explanation=rationale,
    )
    conversation = [
        {"role": "system", "content": OPEN_ENDED_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    async with semaphore:
        verdict = await client.async_response(
            messages=conversation,
            model=model,
            response_format=OpenEndedCriterionJudgment,
        )

    def _bounded(raw: Any, ceiling: float) -> float:
        return _finite_clamped(raw, lower=0.0, upper=ceiling, default=0.0)

    return OpenEndedCriterionScore(
        criterion_name=title,
        category=dimension,
        weight=weight,
        reason=verdict.reason,
        score_a=_bounded(verdict.score_a, 10.0),
        score_b=_bounded(verdict.score_b, 10.0),
        confidence=_bounded(verdict.confidence, 1.0),
    )
171
+
172
+
173
+ def _dimension_score(scores: list[OpenEndedCriterionScore], *, document: str) -> float:
174
+ total_weight = sum(score.weight for score in scores)
175
+ if total_weight <= 0:
176
+ return 0.0
177
+ if document == "a":
178
+ weighted_sum = sum(score.score_a * score.weight for score in scores)
179
+ else:
180
+ weighted_sum = sum(score.score_b * score.weight for score in scores)
181
+ return weighted_sum / total_weight
182
+
183
+
184
def _raw_reference_ratio(total_score_a: float, total_score_b: float) -> float:
    """Return the candidate total relative to the reference total.

    With a positive reference total this is ``total_score_a / total_score_b``
    (non-negative, no upper bound). Otherwise the candidate total is rescaled
    from the 0-10 scale to ``[0, 1]`` instead.
    """
    if total_score_b > 0:
        quotient, ceiling = total_score_a / total_score_b, float("inf")
    else:
        quotient, ceiling = total_score_a / 10.0, 1.0
    return _finite_clamped(quotient, lower=0.0, upper=ceiling, default=0.0)
190
+
191
+
192
def _reference_normalized_reward(total_score_a: float, total_score_b: float) -> float:
    """Return the reward in ``[0, 1]`` derived from the raw reference ratio.

    With a positive reference total the raw ratio is divided by
    ``OPEN_ENDED_REFERENCE_QUALITY_RATIO`` and clipped to ``[0, 1]``.
    Otherwise the raw ratio is returned as is.
    """
    ratio = _raw_reference_ratio(total_score_a, total_score_b)
    if not total_score_b > 0:
        # No usable reference total: the raw ratio is the rescaled candidate score.
        return ratio
    return _finite_clamped(
        ratio / OPEN_ENDED_REFERENCE_QUALITY_RATIO,
        lower=0.0,
        upper=1.0,
        default=0.0,
    )
202
+
203
+
204
def _upstream_pairwise_score(total_score_a: float, total_score_b: float) -> float:
    """Return ``total_score_a / (total_score_a + total_score_b)`` in ``[0, 1]``.

    A non-positive combined total yields ``0.0``.
    """
    combined = total_score_a + total_score_b
    return (
        0.0
        if combined <= 0
        else _finite_clamped(
            total_score_a / combined, lower=0.0, upper=1.0, default=0.0
        )
    )
211
+
212
+
213
async def score_open_ended_answer(
    *,
    client: OpenEndedJudgeClient,
    model: str,
    semaphore: asyncio.Semaphore,
    answer: str,
    question: str,
    reward_model: dict[str, Any],
) -> dict[str, Any]:
    """Score a QUEST open-ended answer with criterion-level judge calls.

    Upstream QUEST reports ``total_score_a / (total_score_a + total_score_b)``.
    For Verifiers rewards, this returns a reference-normalized score clipped to
    ``[0, 1]`` and saturates at ``1.0`` once the candidate reaches the
    reference-quality threshold. This prevents noisy continuous rubric scores
    from making exact ``1.0`` unreachable in practice. The raw reference ratio
    and upstream pairwise value are retained in the returned summary.

    Args:
        client: Judge client; one ``async_response`` call is made per criterion.
        model: Judge model name forwarded to every call.
        semaphore: Bounds how many judge calls are in flight at once.
        answer: Candidate answer; an ``<answer>`` wrapper is stripped if present.
        question: Original query shown to the judge.
        reward_model: Row metadata whose ``ground_truth`` mapping holds
            ``criterions``, ``dimension_weight`` and ``ref_answer``.

    Returns:
        Summary dict with ``final_score`` plus totals, per-dimension scores,
        weights, per-criterion evaluations and ``criterion_count``.

    Raises:
        ValueError: If required ground-truth metadata is missing or the task
            has no rubric criteria.
    """

    ground_truth = reward_model.get("ground_truth")
    if not isinstance(ground_truth, dict):
        raise ValueError("QUEST open-ended task is missing ground_truth metadata")
    criterions = ground_truth.get("criterions")
    if not isinstance(criterions, dict):
        raise ValueError("QUEST open-ended task is missing criterion metadata")
    dimension_weights = ground_truth.get("dimension_weight")
    if not isinstance(dimension_weights, dict):
        raise ValueError("QUEST open-ended task is missing dimension weights")
    ref_answer = ground_truth.get("ref_answer")
    if not isinstance(ref_answer, str) or not ref_answer.strip():
        raise ValueError("QUEST open-ended task is missing reference answer")

    document_content = _extract_answer_content(answer)
    ref_content = _extract_answer_content(ref_answer)
    tasks: list[asyncio.Task[OpenEndedCriterionScore]] = []
    dimensions: list[str] = []
    for dimension, criteria_list in criterions.items():
        dimension_name = str(dimension)
        dimensions.append(dimension_name)
        for criterion_data in _criteria_items(criteria_list):
            tasks.append(
                asyncio.create_task(
                    _score_one_criterion(
                        client=client,
                        model=model,
                        semaphore=semaphore,
                        document_content=document_content,
                        ref_content=ref_content,
                        query=question,
                        dimension=dimension_name,
                        criterion_data=criterion_data,
                    )
                )
            )
    if not tasks:
        raise ValueError("QUEST open-ended task has no rubric criteria")

    try:
        scores = await asyncio.gather(*tasks)
    except BaseException:
        # gather() propagates the first failure but leaves the sibling tasks
        # running. Cancel and drain them so no further judge calls are made
        # for a rollout that has already failed.
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise

    # Pre-seed every dimension so ones without usable criteria still appear.
    evaluations: dict[str, list[dict[str, Any]]] = {
        dimension: [] for dimension in dimensions
    }
    grouped_scores: dict[str, list[OpenEndedCriterionScore]] = {
        dimension: [] for dimension in dimensions
    }
    for score in scores:
        grouped_scores.setdefault(score.category, []).append(score)
        evaluations.setdefault(score.category, []).append(score.model_dump())

    dimension_scores_a: dict[str, float] = {}
    dimension_scores_b: dict[str, float] = {}
    dimension_score_ratios: dict[str, float] = {}
    normalized_dimension_scores: dict[str, float] = {}
    raw_dimension_score_ratios: dict[str, float] = {}
    for dimension, dimension_scores in grouped_scores.items():
        score_a = _dimension_score(dimension_scores, document="a")
        score_b = _dimension_score(dimension_scores, document="b")
        dimension_scores_a[dimension] = score_a
        dimension_scores_b[dimension] = score_b
        dimension_score_ratios[dimension] = _upstream_pairwise_score(score_a, score_b)
        raw_dimension_score_ratios[dimension] = _raw_reference_ratio(score_a, score_b)
        normalized_dimension_scores[dimension] = _reference_normalized_reward(
            score_a, score_b
        )

    # Unusable dimension weights count as 0, so that dimension is ignored.
    normalized_weights = {
        str(dimension): _finite_clamped(
            weight, lower=0.0, upper=float("inf"), default=0.0
        )
        for dimension, weight in dimension_weights.items()
    }
    # Only dimensions listed in the weights contribute to the totals.
    total_score_a = sum(
        dimension_scores_a.get(dimension, 0.0) * weight
        for dimension, weight in normalized_weights.items()
    )
    total_score_b = sum(
        dimension_scores_b.get(dimension, 0.0) * weight
        for dimension, weight in normalized_weights.items()
    )
    raw_reference_ratio = _raw_reference_ratio(total_score_a, total_score_b)
    final_score = _reference_normalized_reward(total_score_a, total_score_b)
    upstream_final_score = _upstream_pairwise_score(total_score_a, total_score_b)
    return {
        "final_score": final_score,
        "upstream_pairwise_score": upstream_final_score,
        "raw_reference_ratio": raw_reference_ratio,
        "reference_quality_ratio": OPEN_ENDED_REFERENCE_QUALITY_RATIO,
        "total_score_a": total_score_a,
        "total_score_b": total_score_b,
        "dimension_scores_a": dimension_scores_a,
        "dimension_scores_b": dimension_scores_b,
        "dimension_scores": normalized_dimension_scores,
        "raw_dimension_score_ratios": raw_dimension_score_ratios,
        "upstream_dimension_score_ratios": dimension_score_ratios,
        "dimension_weights": normalized_weights,
        "evaluations": evaluations,
        "criterion_count": len(scores),
    }
@@ -41,6 +41,7 @@ from verifiers.utils.client_utils import setup_openai_client
41
41
 
42
42
  from .obj_task_eval.utils.cache_filesys import CacheFileSys
43
43
  from .obj_task_eval.utils.load_eval_script import load_eval_script
44
+ from .open_ended import score_open_ended_answer
44
45
 
45
46
  logger = logging.getLogger(__name__)
46
47
 
@@ -228,13 +229,42 @@ def _usage_dict(response: Any) -> dict[str, int]:
228
229
  }
229
230
 
230
231
 
232
+ def _parse_ast_literal(node: ast.AST) -> Any:
233
+ if isinstance(node, ast.Expression):
234
+ return _parse_ast_literal(node.body)
235
+ if isinstance(node, ast.Constant):
236
+ return node.value
237
+ if isinstance(node, ast.List):
238
+ return [_parse_ast_literal(item) for item in node.elts]
239
+ if isinstance(node, ast.Tuple):
240
+ return tuple(_parse_ast_literal(item) for item in node.elts)
241
+ if isinstance(node, ast.Dict):
242
+ return {
243
+ _parse_ast_literal(key): _parse_ast_literal(value)
244
+ for key, value in zip(node.keys, node.values)
245
+ }
246
+ if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub):
247
+ operand = _parse_ast_literal(node.operand)
248
+ if isinstance(operand, int | float):
249
+ return -operand
250
+ if isinstance(node, ast.Name) and node.id == "object":
251
+ return object
252
+ if isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
253
+ if node.func.id == "array" and len(node.args) == 1:
254
+ return _parse_ast_literal(node.args[0])
255
+ raise ValueError(f"Unsupported QUEST literal syntax: {ast.dump(node)}")
256
+
257
+
231
258
  def _parse_literal(value: Any) -> Any:
232
259
  if not isinstance(value, str):
233
260
  return value
234
261
  try:
235
262
  return ast.literal_eval(value)
236
263
  except Exception:
237
- return value
264
+ try:
265
+ return _parse_ast_literal(ast.parse(value, mode="eval"))
266
+ except Exception:
267
+ return value
238
268
 
239
269
 
240
270
  def _extract_question(prompt: Any, extra_info: Any) -> str:
@@ -345,7 +375,7 @@ def _resolve_eval_scripts_root(
345
375
 
346
376
 
347
377
  class QuestTaskSet(SandboxTaskSet):
348
- """QUEST objective search/research taskset."""
378
+ """QUEST search/research taskset."""
349
379
 
350
380
  default_workdir = DEFAULT_WORKDIR
351
381
 
@@ -375,10 +405,6 @@ class QuestTaskSet(SandboxTaskSet):
375
405
  raise ValueError(
376
406
  "category must be one of 'objective', 'open-ended', or 'all'"
377
407
  )
378
- if category != "objective":
379
- raise NotImplementedError(
380
- "Initial QUEST taskset implementation supports category='objective' only"
381
- )
382
408
  self.dataset_name = dataset_name
383
409
  self.split = split
384
410
  self.category = category
@@ -397,9 +423,13 @@ class QuestTaskSet(SandboxTaskSet):
397
423
  self._judge_api_key_var = judge_api_key_var
398
424
  self._judge_sampling_args = dict(judge_sampling_args or {})
399
425
  self._quest_cache_dir = quest_cache_dir
400
- self._quest_eval_scripts_root = _resolve_eval_scripts_root(
401
- dataset_name=dataset_name,
402
- eval_scripts_dir=quest_eval_scripts_dir,
426
+ self._quest_eval_scripts_root = (
427
+ None
428
+ if category == "open-ended" and quest_eval_scripts_dir is None
429
+ else _resolve_eval_scripts_root(
430
+ dataset_name=dataset_name,
431
+ eval_scripts_dir=quest_eval_scripts_dir,
432
+ )
403
433
  )
404
434
  self._quest_eval_concurrency = quest_eval_concurrency
405
435
  super().__init__(
@@ -416,15 +446,29 @@ class QuestTaskSet(SandboxTaskSet):
416
446
  num_proc=self.ds_num_proc,
417
447
  )
418
448
  rows: list[dict[str, Any]] = []
419
- for row in raw:
420
- if self.category != "all" and row.get("rl_task_category") != self.category:
449
+ for row_index, row in enumerate(raw):
450
+ row_category = row.get("rl_task_category")
451
+ if self.category != "all" and row_category != self.category:
421
452
  continue
453
+ if row_category not in {"objective", "open-ended"}:
454
+ raise ValueError(
455
+ f"Unsupported QUEST row category at dataset index {row_index}: "
456
+ f"{row_category!r}"
457
+ )
422
458
  extra_info = _parse_literal(row.get("extra_info"))
423
459
  reward_model = _parse_literal(row.get("reward_model"))
424
460
  question = _extract_question(row.get("prompt"), extra_info)
425
461
  task_id = _extract_task_id(reward_model, extra_info)
426
- if self.category == "objective" and not task_id:
427
- continue
462
+ if not task_id:
463
+ raise ValueError(
464
+ f"QUEST {row_category} row is missing task_id metadata "
465
+ f"at dataset index {row_index}"
466
+ )
467
+ if row_category == "open-ended" and not isinstance(reward_model, dict):
468
+ raise ValueError(
469
+ "QUEST open-ended row has invalid reward_model metadata "
470
+ f"at dataset index {row_index}"
471
+ )
428
472
  rows.append(
429
473
  {
430
474
  "question": question,
@@ -479,7 +523,11 @@ class QuestTaskSet(SandboxTaskSet):
479
523
  return QuestRubric(
480
524
  answer_file=self.answer_file,
481
525
  dataset_name=self.dataset_name,
482
- eval_scripts_dir=str(self._quest_eval_scripts_root),
526
+ eval_scripts_dir=(
527
+ str(self._quest_eval_scripts_root)
528
+ if self._quest_eval_scripts_root is not None
529
+ else None
530
+ ),
483
531
  cache_dir=self._quest_cache_dir,
484
532
  judge_model=self._judge_model,
485
533
  judge_base_url=self._judge_base_url,
@@ -490,7 +538,7 @@ class QuestTaskSet(SandboxTaskSet):
490
538
 
491
539
 
492
540
  class QuestRubric(vf.Rubric):
493
- """Scores QUEST objective tasks using their generated eval scripts."""
541
+ """Scores QUEST objective and open-ended tasks."""
494
542
 
495
543
  def __init__(
496
544
  self,
@@ -524,22 +572,28 @@ class QuestRubric(vf.Rubric):
524
572
  self._client: QuestOpenAIClient | None = None
525
573
  self._semaphore = asyncio.Semaphore(eval_concurrency)
526
574
  self._scripts_root: Path | None = None
527
- self.add_reward_func(self.objective_reward, weight=1.0)
575
+ self.add_reward_func(self.quest_reward, weight=1.0)
528
576
 
529
- async def _objective_score_for_state(self, state: vf.State) -> float:
577
+ async def _quest_score_for_state(self, state: vf.State) -> float:
530
578
  if state.get("error") is not None:
531
579
  return 0.0
532
580
  try:
533
- return await self.objective_reward(state)
581
+ return await self.quest_reward(state)
534
582
  except vf.Error as exc:
535
583
  state["error"] = exc
536
584
  return 0.0
537
585
 
586
def _metric_name(self, state: vf.State) -> str:
    """Return the category-specific metric key for ``state``.

    A rollout whose ``info["rl_task_category"]`` equals ``"open-ended"`` is
    reported under ``"open_ended_reward"``; any other rollout, including one
    with a missing or empty ``info``, is reported under ``"objective_reward"``.
    """
    category = (state.get("info") or {}).get("rl_task_category")
    return "open_ended_reward" if category == "open-ended" else "objective_reward"
591
+
538
592
  async def score_rollout(self, state: vf.State) -> None:
539
593
  """Score one rollout and preserve QUEST infrastructure failures as ``vf.Error`` values."""
540
- score = await self._objective_score_for_state(state)
594
+ score = await self._quest_score_for_state(state)
541
595
  state["reward"] = score
542
- state["metrics"] = {"objective_reward": score}
596
+ state["metrics"] = {"quest_reward": score, self._metric_name(state): score}
543
597
 
544
598
  async def score_group(self, states: list[vf.State]) -> None:
545
599
  """Score rollouts while preserving QUEST infrastructure failures as ``vf.Error`` values."""
@@ -547,7 +601,7 @@ class QuestRubric(vf.Rubric):
547
601
  logger.warning("No states to score")
548
602
  return
549
603
  scores = await asyncio.gather(
550
- *(self._objective_score_for_state(state) for state in states)
604
+ *(self._quest_score_for_state(state) for state in states)
551
605
  )
552
606
  avg_score = sum(scores) / len(scores)
553
607
  for state, score in zip(states, scores):
@@ -559,9 +613,15 @@ class QuestRubric(vf.Rubric):
559
613
  turn["advantage"] = state["advantage"]
560
614
  if turn.get("reward") is None:
561
615
  turn["reward"] = state["reward"]
562
- state["metrics"] = {"objective_reward": score}
616
+ state["metrics"] = {"quest_reward": score, self._metric_name(state): score}
563
617
 
564
- async def objective_reward(self, state: vf.State, **_: Any) -> float:
618
async def quest_reward(self, state: vf.State, **_: Any) -> float:
    """Score ``state`` with the scorer that matches its task category.

    Dispatches to ``self.open_ended_reward`` when
    ``info["rl_task_category"]`` equals ``"open-ended"`` and to
    ``self.objective_reward`` otherwise (including when ``info`` is missing
    or empty). Extra keyword arguments are accepted and ignored.
    """
    task_info = state.get("info") or {}
    is_open_ended = task_info.get("rl_task_category") == "open-ended"
    scorer = self.open_ended_reward if is_open_ended else self.objective_reward
    return await scorer(state)
623
+
624
+ async def _read_answer(self, state: vf.State) -> tuple[str, str]:
565
625
  sandbox_client = state.get("sandbox_client")
566
626
  sandbox_id = state.get("sandbox_id")
567
627
  if not sandbox_client or not sandbox_id:
@@ -583,6 +643,10 @@ class QuestRubric(vf.Rubric):
583
643
  answer_source = "completion_fallback" if answer else "missing"
584
644
  state["quest_answer"] = answer
585
645
  state["quest_answer_source"] = answer_source
646
+ return answer, answer_source
647
+
648
+ async def objective_reward(self, state: vf.State, **_: Any) -> float:
649
+ answer, _ = await self._read_answer(state)
586
650
  if not answer:
587
651
  state["quest_eval_error"] = "empty_answer"
588
652
  return 0.0
@@ -613,6 +677,38 @@ class QuestRubric(vf.Rubric):
613
677
  state["quest_final_score"] = final_score
614
678
  return final_score
615
679
 
680
async def open_ended_reward(self, state: vf.State, **_: Any) -> float:
    """Score an open-ended QUEST rollout and record the result on ``state``.

    The answer is obtained from ``self._read_answer``. A falsy answer sets
    ``state["quest_eval_error"]`` to ``"empty_answer"`` and scores 0.0.
    Otherwise the answer, the question and the ``reward_model`` taken from
    ``state["info"]`` are passed to ``score_open_ended_answer`` together with
    ``self.judge_model`` and ``self._semaphore``.

    Writes ``quest_task_id``, ``quest_eval_summary``, ``quest_final_score``
    and ``quest_upstream_pairwise_score`` into ``state``.

    Returns:
        The summary's ``final_score`` clamped to ``[0.0, 1.0]``; a missing,
        falsy or non-finite score becomes 0.0.

    Raises:
        ValueError: if ``info`` has no non-empty string ``task_id`` or its
            ``reward_model`` is not a dict.
    """
    # The answer source is already stored on ``state`` by ``_read_answer``;
    # bind it to its own name rather than rebinding the ``**_`` parameter.
    answer, _answer_source = await self._read_answer(state)
    if not answer:
        state["quest_eval_error"] = "empty_answer"
        return 0.0

    info = state.get("info") or {}
    task_id = info.get("task_id")
    if not (isinstance(task_id, str) and task_id):
        raise ValueError("QUEST open-ended task is missing task_id metadata")
    reward_model = info.get("reward_model")
    if not isinstance(reward_model, dict):
        raise ValueError("QUEST open-ended task is missing reward_model metadata")
    question = str(info.get("question") or "")

    state["quest_task_id"] = task_id
    summary = await score_open_ended_answer(
        client=self._get_client(),
        model=self.judge_model,
        semaphore=self._semaphore,
        answer=answer,
        question=question,
        reward_model=reward_model,
    )
    state["quest_eval_summary"] = summary

    # A None/0/missing score collapses to 0.0 before the finite check.
    raw_score = float(summary.get("final_score", 0.0) or 0.0)
    if math.isfinite(raw_score):
        final_score = max(0.0, min(1.0, raw_score))
    else:
        final_score = 0.0
    state["quest_final_score"] = final_score
    state["quest_upstream_pairwise_score"] = summary.get("upstream_pairwise_score")
    return final_score
711
+
616
712
  def _get_client(self) -> QuestOpenAIClient:
617
713
  if self._client is not None:
618
714
  return self._client