verifiers 0.1.15.dev176__tar.gz → 0.1.15.dev178__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/PKG-INFO +1 -1
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/README.md +20 -8
- verifiers-0.1.15.dev178/verifiers/envs/experimental/composable/tasksets/search/quest/open_ended.py +329 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/taskset.py +119 -23
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/sandbox_utils.py +14 -15
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/.gitignore +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/LICENSE +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/pyproject.toml +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/AGENTS.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/conftest.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_build_script.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_client_config.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_env_group.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_env_server.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_environment.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_envs.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_eval_cli.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_gepa_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_imports.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_init_script.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_lean_task.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_logging.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_mcp_search_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_openenv_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_parser.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_per_turn_timing.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_pricing_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_renderer_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_renderer_e2e.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_rubric.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_types.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_bfcl.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_config_extension.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_empty_completions.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_endpoint_protocols.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_example_counts.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_group_reward_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_harbor_cli.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_nemo_gym_harness.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_openenv_taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_openreward_taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_replay_harness.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_rlm_swe.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_runtime_lifecycle.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_scoring_functions.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_taskset_bindings.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_taskset_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_v1_textarena_taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_wiki_search_v1.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_wordle_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_wordle_v1_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/clients/openai_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/clients/openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/clients/renderer_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/decorators.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/environment.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/composable_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/task.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/openseeker/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/openseeker/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/openseeker/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/api_tools/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/api_tools/tool_pdf.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/eval_toolkit.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/evaluator.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/llm_client/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/llm_client/base_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/prompts/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/prompts/cache_prompts.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/cache_filesys.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/load_eval_script.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/misc.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/tool_visit.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/url_tools.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/verification_tree.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/redsearcher/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/redsearcher/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/redsearcher/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/search/search_tasksets.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/extract_fix_patch.sh +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/openswe/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/openswe/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/log_parser.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/scale_swe/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/scale_swe/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/shared/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/shared/test_patch.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/log_parsers.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/errors.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/eval.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/types.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/env_config_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/eval_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/interception_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/pricing_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/save_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/utils/version_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/README.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/RE_MIGRATION.md +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/artifact.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/config.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/env.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/harness.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/model.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/program.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/runtime.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/runtime_handles.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/sandbox.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/state.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/task.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/taskset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/toolset.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/types.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/user.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/binding_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/config_callable_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/endpoint_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/json_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/judge_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/lifecycle_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/logging_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/mcp_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/object_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/program_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/prompt_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/runtime_owner_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/runtime_registry.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/sandbox_python_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/scoring_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/serialization_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/task_freeze_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/taskset_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/toolset_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/trajectory_utils.py +0 -0
- {verifiers-0.1.15.dev176 → verifiers-0.1.15.dev178}/verifiers/v1/utils/usage_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.15.dev176
|
|
3
|
+
Version: 0.1.15.dev178
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
# QUEST Search Taskset
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
QUEST tasks ported into the composable search taskset framework.
|
|
4
4
|
|
|
5
5
|
## Source
|
|
6
6
|
|
|
7
7
|
- Dataset: [`osunlp/QUEST-RL-Data`](https://huggingface.co/datasets/osunlp/QUEST-RL-Data)
|
|
8
8
|
- Upstream project: [`OSU-NLP-Group/QUEST`](https://github.com/OSU-NLP-Group/QUEST)
|
|
9
9
|
|
|
10
|
-
The taskset loads the Hugging Face dataset
|
|
10
|
+
The taskset loads the Hugging Face dataset and filters by `rl_task_category`. Objective tasks use the dataset-provided generated evaluation scripts under `eval_scripts/*.py`. Open-ended tasks use the dataset-provided reference answer and rubric criteria.
|
|
11
11
|
|
|
12
12
|
## Task Contract
|
|
13
13
|
|
|
@@ -17,25 +17,37 @@ The paired `rlm_search` environment prompts RLM to write this file and provides
|
|
|
17
17
|
|
|
18
18
|
## Scoring
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
### Objective
|
|
21
|
+
|
|
22
|
+
For objective tasks, `QuestRubric` loads the generated eval script for the example's `task_id` and calls its async `evaluate_answer(...)` entrypoint using the vendored minimal `obj_task_eval` runtime. The rollout reward is `summary["final_score"]`, clipped to `[0.0, 1.0]`.
|
|
21
23
|
|
|
22
24
|
Generated scripts may request URL-backed verification. PDF URLs are detected and parsed with the upstream QUEST PDF parser path before falling back to generic webpage retrieval.
|
|
23
25
|
|
|
24
26
|
This port intentionally preserves upstream QUEST behavior for URL-backed verification semantics. The upstream verifier generally treats invalid, irrelevant, or inaccessible cited webpages as unsupported claims, which can assign `0.0` to the affected verification node even when the immediate cause is source access such as a bot challenge, rate limit, timeout, or parser failure. Future work should consider a finer-grained source-access taxonomy so verifier infrastructure limitations can be distinguished from model-provided bad URLs or unsupported claims.
|
|
25
27
|
|
|
26
|
-
|
|
28
|
+
### Open-ended
|
|
29
|
+
|
|
30
|
+
For open-ended tasks, `QuestRubric` evaluates each rubric criterion independently. Each judge call compares the candidate answer against the dataset reference answer and returns scores for both documents on the criterion. The expected nominal judge-call count is the number of rubric criteria in the example, typically about 31 calls.
|
|
31
|
+
|
|
32
|
+
The summary stores both scoring views:
|
|
33
|
+
|
|
34
|
+
- `upstream_pairwise_score`: upstream QUEST's `total_score_a / (total_score_a + total_score_b)` comparison value.
|
|
35
|
+
- `raw_reference_ratio`: raw `total_score_a / total_score_b` candidate-vs-reference score.
|
|
36
|
+
- `final_score`: Verifiers reward, `raw_reference_ratio / 0.9` clipped to `[0.0, 1.0]`, so near-reference-quality answers can receive reward `1.0` despite noisy continuous criterion judging.
|
|
27
37
|
|
|
28
38
|
## Error Handling
|
|
29
39
|
|
|
40
|
+
A reward of `0.0` with no `state["error"]` means the QUEST evaluator ran and judged the answer incorrect or insufficient under the selected scoring path. Infrastructure and evaluator failures outside normal QUEST source verification are represented with `vf.Error` subclasses instead of ad hoc success metrics.
|
|
41
|
+
|
|
30
42
|
QUEST uses Verifiers' framework-managed error field for non-answer failures when the failure comes from external runtime systems:
|
|
31
43
|
|
|
32
44
|
- Missing live sandbox or answer-file read failure: `vf.SandboxError`.
|
|
33
45
|
- Transient judge provider/network/rate-limit/server failures: retryable `vf.InfraError`.
|
|
34
46
|
- Empty or invalid judge responses: retryable `vf.InvalidModelResponseError` / `vf.EmptyModelResponseError`.
|
|
35
47
|
- Judge auth, model-not-found, content-filter, or invalid request failures: non-retryable `vf.ModelError`.
|
|
36
|
-
- QUEST eval-script download/cache resolution failure: `vf.InfraError`.
|
|
48
|
+
- QUEST objective eval-script download/cache resolution failure: `vf.InfraError`.
|
|
37
49
|
|
|
38
|
-
Wrong answers, empty answers, and inaccessible or irrelevant cited sources remain ordinary scored outcomes and return `0.0` without setting `state["error"]`. Generated eval-script source errors, missing task metadata, missing eval-script files, import/load failures, and unexpected evaluator runtime bugs are not converted to `vf.Error`; they raise normally so broken evaluator code fails hard.
|
|
50
|
+
Wrong answers, empty answers, and inaccessible or irrelevant cited sources remain ordinary scored outcomes and return `0.0` without setting `state["error"]`. Generated objective eval-script source errors, missing task metadata, missing eval-script files, import/load failures, and unexpected evaluator runtime bugs are not converted to `vf.Error`; they raise normally so broken evaluator code fails hard.
|
|
39
51
|
|
|
40
52
|
## Common Arguments
|
|
41
53
|
|
|
@@ -43,10 +55,10 @@ Wrong answers, empty answers, and inaccessible or irrelevant cited sources remai
|
|
|
43
55
|
|---|---:|---|
|
|
44
56
|
| `dataset_name` | `osunlp/QUEST-RL-Data` | Hugging Face dataset name. |
|
|
45
57
|
| `split` | `train` | Dataset split. |
|
|
46
|
-
| `category` | `objective` | QUEST category; only `objective` is supported. |
|
|
58
|
+
| `category` | `objective` | QUEST category: `objective`, `open-ended`, or `all`. |
|
|
47
59
|
| `answer_file` | `/task/answer.txt` | Final answer path in the sandbox. |
|
|
48
60
|
| `judge_model` | `openai/gpt-5.4-mini` | OpenAI-compatible model for QUEST verifier calls. |
|
|
49
61
|
| `judge_base_url` | `https://api.pinference.ai/api/v1` | Judge API base URL. |
|
|
50
62
|
| `judge_api_key_var` | `PRIME_API_KEY` | Env var containing the judge API key. |
|
|
51
|
-
| `quest_eval_scripts_dir` | HF cache | Optional local directory containing `eval_scripts/*.py`. |
|
|
63
|
+
| `quest_eval_scripts_dir` | HF cache | Optional local directory containing `eval_scripts/*.py` for objective tasks. |
|
|
52
64
|
| `quest_cache_dir` | `~/.cache/verifiers/quest` | Host cache for QUEST verifier state. |
|
verifiers-0.1.15.dev178/verifiers/envs/experimental/composable/tasksets/search/quest/open_ended.py
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
"""QUEST open-ended rubric scoring."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import math
|
|
5
|
+
from typing import Any, Protocol
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
OPEN_ENDED_SYSTEM_PROMPT = """You are an expert evaluator tasked with scoring two documents (both presenting research findings in response to the user's query) on specific rubric criteria. Your evaluation must be precise, objective, and based solely on the evidence present in both documents.
|
|
11
|
+
|
|
12
|
+
## Evaluation Framework
|
|
13
|
+
For each criterion, score both documents on a scale of 0-10 (continuous values). The score should reflect the quality of performance on that criterion:
|
|
14
|
+
* 0-2 points: Very poor performance. Almost completely fails to meet the criterion requirements.
|
|
15
|
+
* 2-4 points: Poor performance. Minimally meets the criterion requirements with significant deficiencies.
|
|
16
|
+
* 4-6 points: Average performance. Basically meets the criterion requirements, neither good nor bad.
|
|
17
|
+
* 6-8 points: Good performance. Largely meets the criterion requirements with notable strengths.
|
|
18
|
+
* 8-10 points: Excellent/outstanding performance. Fully meets or exceeds the criterion requirements.
|
|
19
|
+
|
|
20
|
+
## Evaluation Process
|
|
21
|
+
1. **Understand the Criterion**: Carefully read and interpret what the rubric is asking for.
|
|
22
|
+
2. **Search for Evidence**: Systematically review both documents for relevant content that addresses the criterion.
|
|
23
|
+
3. **Score Each Document**: Evaluate how each document performs against the criterion and assign a score from 0-10.
|
|
24
|
+
4. **Provide Reasoning**: Explain your evaluation with specific references to both documents.
|
|
25
|
+
|
|
26
|
+
## Important Guidelines
|
|
27
|
+
- Base your evaluation ONLY on what is explicitly present in both documents
|
|
28
|
+
- Do not make assumptions about implied or missing content
|
|
29
|
+
- Consider the quality, completeness, and relevance of the evidence in both documents
|
|
30
|
+
- Be consistent in your evaluation standards across all criteria
|
|
31
|
+
- Provide specific examples from both documents to support your scores"""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
OPEN_ENDED_REFERENCE_QUALITY_RATIO = 0.9
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
OPEN_ENDED_USER_PROMPT = """## Document A (Content to Evaluate)
|
|
38
|
+
{document_content}
|
|
39
|
+
|
|
40
|
+
## Document B (Reference Content)
|
|
41
|
+
{ref_content}
|
|
42
|
+
|
|
43
|
+
## Original Query
|
|
44
|
+
{query}
|
|
45
|
+
|
|
46
|
+
## Rubric Criterion to Evaluate
|
|
47
|
+
**Rubric**: {rubric_title}
|
|
48
|
+
**Category**: {rubric_category}
|
|
49
|
+
**Explanation**: {rubric_explanation}
|
|
50
|
+
|
|
51
|
+
## Your Task
|
|
52
|
+
Score both Document A (content to evaluate) and Document B (reference content) on this specific rubric criterion using the 0-10 scoring scale provided in the evaluation framework.
|
|
53
|
+
|
|
54
|
+
Return a JSON object with these fields:
|
|
55
|
+
- reason: Detailed explanation with specific evidence from both documents evaluating their performance against the rubric.
|
|
56
|
+
- score_a: The score for Document A (content to evaluate), from 0 to 10.
|
|
57
|
+
- score_b: The score for Document B (reference content), from 0 to 10.
|
|
58
|
+
- confidence: Confidence from 0.0 to 1.0."""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class OpenEndedJudgeClient(Protocol):
    """Structural type for the judge client consumed by open-ended scoring.

    Any object exposing a compatible ``async_response`` coroutine satisfies
    this protocol; no inheritance is required.
    """

    async def async_response(self, *, count_token: bool = False, **kwargs: Any) -> Any:
        """Send one judge request to an OpenAI-compatible chat endpoint.

        Args:
            count_token: Keyword-only flag, ``False`` by default.
            **kwargs: Request arguments forwarded to the endpoint.

        Returns:
            The judge response object.
        """
        ...
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class OpenEndedCriterionJudgment(BaseModel):
    """Schema of the judge's reply for a single open-ended rubric criterion."""

    # Free-text justification returned by the judge.
    reason: str
    # Score for Document A (the content being evaluated); the prompt asks for 0-10.
    score_a: float
    # Score for Document B (the reference content); the prompt asks for 0-10.
    score_b: float
    # Judge confidence; falls back to 1.0 when the reply omits the field.
    confidence: float = 1.0
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class OpenEndedCriterionScore(BaseModel):
    """Per-criterion score record after the judge values have been sanitized."""

    # Rubric criterion title (empty string when the rubric entry has none).
    criterion_name: str
    # Name of the rubric dimension the criterion belongs to.
    category: str
    # Weight of this criterion within its dimension.
    weight: float
    # Judge's justification, copied from the judgment unchanged.
    reason: str
    # Candidate (Document A) score.
    score_a: float
    # Reference (Document B) score.
    score_b: float
    # Judge confidence.
    confidence: float
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _finite_clamped(value: Any, *, lower: float, upper: float, default: float) -> float:
|
|
90
|
+
try:
|
|
91
|
+
numeric = float(value)
|
|
92
|
+
except (TypeError, ValueError):
|
|
93
|
+
return default
|
|
94
|
+
if not math.isfinite(numeric):
|
|
95
|
+
return default
|
|
96
|
+
return min(upper, max(lower, numeric))
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _extract_answer_content(text: str) -> str:
|
|
100
|
+
text = (text or "").strip()
|
|
101
|
+
if not text:
|
|
102
|
+
return ""
|
|
103
|
+
if "<answer>" not in text:
|
|
104
|
+
return text
|
|
105
|
+
start = text.find("<answer>") + len("<answer>")
|
|
106
|
+
end = text.find("</answer>")
|
|
107
|
+
if end == -1:
|
|
108
|
+
return text[start:].strip()
|
|
109
|
+
return text[start:end].strip()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _criteria_items(criteria_list: Any) -> list[dict[str, Any]]:
|
|
113
|
+
if criteria_list is None:
|
|
114
|
+
return []
|
|
115
|
+
if hasattr(criteria_list, "tolist"):
|
|
116
|
+
criteria_list = criteria_list.tolist()
|
|
117
|
+
if isinstance(criteria_list, tuple):
|
|
118
|
+
criteria_list = list(criteria_list)
|
|
119
|
+
if not isinstance(criteria_list, list):
|
|
120
|
+
return []
|
|
121
|
+
return [item for item in criteria_list if isinstance(item, dict)]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
async def _score_one_criterion(
    *,
    client: OpenEndedJudgeClient,
    model: str,
    semaphore: asyncio.Semaphore,
    document_content: str,
    ref_content: str,
    query: str,
    dimension: str,
    criterion_data: dict[str, Any],
) -> OpenEndedCriterionScore:
    """Judge one rubric criterion and return its sanitized score record.

    Args:
        client: Judge client used for the single request.
        model: Judge model name forwarded to the client.
        semaphore: Limits how many judge requests run at once.
        document_content: Candidate answer text (Document A).
        ref_content: Reference answer text (Document B).
        query: Original user query shown to the judge.
        dimension: Rubric dimension name; stored as the record's category.
        criterion_data: Rubric entry; reads ``criterion``, ``explanation`` and
            ``weight`` (missing or invalid weight falls back to 1.0).

    Returns:
        A score record with both scores clamped to ``[0, 10]`` and the
        confidence clamped to ``[0, 1]``.
    """
    title = str(criterion_data.get("criterion") or "")
    rubric_explanation = str(criterion_data.get("explanation") or "")
    raw_weight = criterion_data.get("weight", 1.0)
    criterion_weight = _finite_clamped(
        raw_weight, lower=0.0, upper=float("inf"), default=1.0
    )

    user_prompt = OPEN_ENDED_USER_PROMPT.format(
        document_content=document_content,
        ref_content=ref_content,
        query=query,
        rubric_title=title,
        rubric_category=dimension,
        rubric_explanation=rubric_explanation,
    )
    conversation = [
        {"role": "system", "content": OPEN_ENDED_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]

    # Only the judge request itself is rate-limited by the semaphore.
    async with semaphore:
        judgment = await client.async_response(
            messages=conversation,
            model=model,
            response_format=OpenEndedCriterionJudgment,
        )

    candidate_score = _finite_clamped(
        judgment.score_a, lower=0.0, upper=10.0, default=0.0
    )
    reference_score = _finite_clamped(
        judgment.score_b, lower=0.0, upper=10.0, default=0.0
    )
    judge_confidence = _finite_clamped(
        judgment.confidence, lower=0.0, upper=1.0, default=0.0
    )
    return OpenEndedCriterionScore(
        criterion_name=title,
        category=dimension,
        weight=criterion_weight,
        reason=judgment.reason,
        score_a=candidate_score,
        score_b=reference_score,
        confidence=judge_confidence,
    )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _dimension_score(scores: list[OpenEndedCriterionScore], *, document: str) -> float:
|
|
174
|
+
total_weight = sum(score.weight for score in scores)
|
|
175
|
+
if total_weight <= 0:
|
|
176
|
+
return 0.0
|
|
177
|
+
if document == "a":
|
|
178
|
+
weighted_sum = sum(score.score_a * score.weight for score in scores)
|
|
179
|
+
else:
|
|
180
|
+
weighted_sum = sum(score.score_b * score.weight for score in scores)
|
|
181
|
+
return weighted_sum / total_weight
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _raw_reference_ratio(total_score_a: float, total_score_b: float) -> float:
    """Return the candidate score relative to the reference score.

    With a positive reference score this is ``total_score_a / total_score_b``,
    floored at 0 and not capped above. Otherwise it falls back to
    ``total_score_a / 10`` clamped to ``[0, 1]``.
    """
    if not total_score_b > 0:
        # No usable reference score to divide by: rate the candidate on the
        # absolute 0-10 scale instead.
        return _finite_clamped(
            total_score_a / 10.0, lower=0.0, upper=1.0, default=0.0
        )
    ratio = total_score_a / total_score_b
    return _finite_clamped(ratio, lower=0.0, upper=float("inf"), default=0.0)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _reference_normalized_reward(total_score_a: float, total_score_b: float) -> float:
    """Convert candidate and reference totals into a reward in ``[0, 1]``.

    With a positive reference total, the raw candidate/reference ratio is
    divided by ``OPEN_ENDED_REFERENCE_QUALITY_RATIO`` and clipped to
    ``[0, 1]``. Without one, the raw ratio is returned unchanged.
    """
    ratio = _raw_reference_ratio(total_score_a, total_score_b)
    if not total_score_b > 0:
        return ratio
    scaled = ratio / OPEN_ENDED_REFERENCE_QUALITY_RATIO
    return _finite_clamped(scaled, lower=0.0, upper=1.0, default=0.0)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _upstream_pairwise_score(total_score_a: float, total_score_b: float) -> float:
    """Return the candidate's share ``a / (a + b)`` of the combined score.

    The result is ``0.0`` when the combined score is zero or negative, and is
    otherwise clamped to ``[0, 1]``.
    """
    combined = total_score_a + total_score_b
    if combined <= 0:
        return 0.0
    share = total_score_a / combined
    return _finite_clamped(share, lower=0.0, upper=1.0, default=0.0)
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
async def score_open_ended_answer(
    *,
    client: OpenEndedJudgeClient,
    model: str,
    semaphore: asyncio.Semaphore,
    answer: str,
    question: str,
    reward_model: dict[str, Any],
) -> dict[str, Any]:
    """Score a QUEST open-ended answer with criterion-level judge calls.

    Upstream QUEST reports ``total_score_a / (total_score_a + total_score_b)``.
    For Verifiers rewards, this returns a reference-normalized score clipped to
    ``[0, 1]`` that saturates at ``1.0`` once the candidate reaches the
    reference-quality threshold, so noisy continuous rubric scores do not make
    an exact ``1.0`` unreachable in practice. The raw reference ratio and the
    upstream pairwise value are retained in the returned summary.

    Args:
        client: Judge client; one request is issued per rubric criterion.
        model: Judge model name.
        semaphore: Limits how many judge requests run at once.
        answer: Candidate answer text (optionally wrapped in ``<answer>`` tags).
        question: Original task question shown to the judge.
        reward_model: Task metadata. ``reward_model["ground_truth"]`` must be a
            dict providing ``criterions`` (dimension -> criteria),
            ``dimension_weight`` (dimension -> weight) and a non-empty
            ``ref_answer`` string.

    Returns:
        Summary dict with ``final_score``, ``upstream_pairwise_score``,
        ``raw_reference_ratio``, totals, per-dimension scores and ratios, the
        dimension weights used, per-criterion ``evaluations`` and
        ``criterion_count``.

    Raises:
        ValueError: If required metadata is missing or no criterion exists.
    """

    ground_truth = reward_model.get("ground_truth")
    if not isinstance(ground_truth, dict):
        raise ValueError("QUEST open-ended task is missing ground_truth metadata")
    criterions = ground_truth.get("criterions")
    if not isinstance(criterions, dict):
        raise ValueError("QUEST open-ended task is missing criterion metadata")
    dimension_weights = ground_truth.get("dimension_weight")
    if not isinstance(dimension_weights, dict):
        raise ValueError("QUEST open-ended task is missing dimension weights")
    ref_answer = ground_truth.get("ref_answer")
    if not isinstance(ref_answer, str) or not ref_answer.strip():
        raise ValueError("QUEST open-ended task is missing reference answer")

    document_content = _extract_answer_content(answer)
    ref_content = _extract_answer_content(ref_answer)
    tasks: list[asyncio.Task[OpenEndedCriterionScore]] = []
    dimensions: list[str] = []
    for dimension, criteria_list in criterions.items():
        dimension_name = str(dimension)
        dimensions.append(dimension_name)
        for criterion_data in _criteria_items(criteria_list):
            tasks.append(
                asyncio.create_task(
                    _score_one_criterion(
                        client=client,
                        model=model,
                        semaphore=semaphore,
                        document_content=document_content,
                        ref_content=ref_content,
                        query=question,
                        dimension=dimension_name,
                        criterion_data=criterion_data,
                    )
                )
            )
    if not tasks:
        raise ValueError("QUEST open-ended task has no rubric criteria")

    try:
        scores = await asyncio.gather(*tasks)
    except BaseException:
        # gather() propagates the first failure but leaves the sibling tasks
        # running. Their results would be discarded anyway, so cancel them to
        # stop wasted judge calls and release semaphore slots, then drain them
        # so nothing is left pending.
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise

    # Pre-seed every dimension so dimensions without valid criteria still appear.
    evaluations: dict[str, list[dict[str, Any]]] = {
        dimension: [] for dimension in dimensions
    }
    grouped_scores: dict[str, list[OpenEndedCriterionScore]] = {
        dimension: [] for dimension in dimensions
    }
    for score in scores:
        grouped_scores.setdefault(score.category, []).append(score)
        evaluations.setdefault(score.category, []).append(score.model_dump())

    dimension_scores_a: dict[str, float] = {}
    dimension_scores_b: dict[str, float] = {}
    dimension_score_ratios: dict[str, float] = {}
    normalized_dimension_scores: dict[str, float] = {}
    raw_dimension_score_ratios: dict[str, float] = {}
    for dimension, dimension_scores in grouped_scores.items():
        score_a = _dimension_score(dimension_scores, document="a")
        score_b = _dimension_score(dimension_scores, document="b")
        dimension_scores_a[dimension] = score_a
        dimension_scores_b[dimension] = score_b
        dimension_score_ratios[dimension] = _upstream_pairwise_score(score_a, score_b)
        raw_dimension_score_ratios[dimension] = _raw_reference_ratio(score_a, score_b)
        normalized_dimension_scores[dimension] = _reference_normalized_reward(
            score_a, score_b
        )

    # Sanitized (finite, non-negative) weights; they are not rescaled to sum to 1.
    normalized_weights = {
        str(dimension): _finite_clamped(
            weight, lower=0.0, upper=float("inf"), default=0.0
        )
        for dimension, weight in dimension_weights.items()
    }
    # Totals follow the weight table: a dimension absent from it adds nothing.
    total_score_a = sum(
        dimension_scores_a.get(dimension, 0.0) * weight
        for dimension, weight in normalized_weights.items()
    )
    total_score_b = sum(
        dimension_scores_b.get(dimension, 0.0) * weight
        for dimension, weight in normalized_weights.items()
    )
    raw_reference_ratio = _raw_reference_ratio(total_score_a, total_score_b)
    final_score = _reference_normalized_reward(total_score_a, total_score_b)
    upstream_final_score = _upstream_pairwise_score(total_score_a, total_score_b)
    return {
        "final_score": final_score,
        "upstream_pairwise_score": upstream_final_score,
        "raw_reference_ratio": raw_reference_ratio,
        "reference_quality_ratio": OPEN_ENDED_REFERENCE_QUALITY_RATIO,
        "total_score_a": total_score_a,
        "total_score_b": total_score_b,
        "dimension_scores_a": dimension_scores_a,
        "dimension_scores_b": dimension_scores_b,
        "dimension_scores": normalized_dimension_scores,
        "raw_dimension_score_ratios": raw_dimension_score_ratios,
        "upstream_dimension_score_ratios": dimension_score_ratios,
        "dimension_weights": normalized_weights,
        "evaluations": evaluations,
        "criterion_count": len(scores),
    }
|
|
@@ -41,6 +41,7 @@ from verifiers.utils.client_utils import setup_openai_client
|
|
|
41
41
|
|
|
42
42
|
from .obj_task_eval.utils.cache_filesys import CacheFileSys
|
|
43
43
|
from .obj_task_eval.utils.load_eval_script import load_eval_script
|
|
44
|
+
from .open_ended import score_open_ended_answer
|
|
44
45
|
|
|
45
46
|
logger = logging.getLogger(__name__)
|
|
46
47
|
|
|
@@ -228,13 +229,42 @@ def _usage_dict(response: Any) -> dict[str, int]:
|
|
|
228
229
|
}
|
|
229
230
|
|
|
230
231
|
|
|
232
|
+
def _parse_ast_literal(node: ast.AST) -> Any:
|
|
233
|
+
if isinstance(node, ast.Expression):
|
|
234
|
+
return _parse_ast_literal(node.body)
|
|
235
|
+
if isinstance(node, ast.Constant):
|
|
236
|
+
return node.value
|
|
237
|
+
if isinstance(node, ast.List):
|
|
238
|
+
return [_parse_ast_literal(item) for item in node.elts]
|
|
239
|
+
if isinstance(node, ast.Tuple):
|
|
240
|
+
return tuple(_parse_ast_literal(item) for item in node.elts)
|
|
241
|
+
if isinstance(node, ast.Dict):
|
|
242
|
+
return {
|
|
243
|
+
_parse_ast_literal(key): _parse_ast_literal(value)
|
|
244
|
+
for key, value in zip(node.keys, node.values)
|
|
245
|
+
}
|
|
246
|
+
if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub):
|
|
247
|
+
operand = _parse_ast_literal(node.operand)
|
|
248
|
+
if isinstance(operand, int | float):
|
|
249
|
+
return -operand
|
|
250
|
+
if isinstance(node, ast.Name) and node.id == "object":
|
|
251
|
+
return object
|
|
252
|
+
if isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
|
|
253
|
+
if node.func.id == "array" and len(node.args) == 1:
|
|
254
|
+
return _parse_ast_literal(node.args[0])
|
|
255
|
+
raise ValueError(f"Unsupported QUEST literal syntax: {ast.dump(node)}")
|
|
256
|
+
|
|
257
|
+
|
|
231
258
|
def _parse_literal(value: Any) -> Any:
|
|
232
259
|
if not isinstance(value, str):
|
|
233
260
|
return value
|
|
234
261
|
try:
|
|
235
262
|
return ast.literal_eval(value)
|
|
236
263
|
except Exception:
|
|
237
|
-
|
|
264
|
+
try:
|
|
265
|
+
return _parse_ast_literal(ast.parse(value, mode="eval"))
|
|
266
|
+
except Exception:
|
|
267
|
+
return value
|
|
238
268
|
|
|
239
269
|
|
|
240
270
|
def _extract_question(prompt: Any, extra_info: Any) -> str:
|
|
@@ -345,7 +375,7 @@ def _resolve_eval_scripts_root(
|
|
|
345
375
|
|
|
346
376
|
|
|
347
377
|
class QuestTaskSet(SandboxTaskSet):
|
|
348
|
-
"""QUEST
|
|
378
|
+
"""QUEST search/research taskset."""
|
|
349
379
|
|
|
350
380
|
default_workdir = DEFAULT_WORKDIR
|
|
351
381
|
|
|
@@ -375,10 +405,6 @@ class QuestTaskSet(SandboxTaskSet):
|
|
|
375
405
|
raise ValueError(
|
|
376
406
|
"category must be one of 'objective', 'open-ended', or 'all'"
|
|
377
407
|
)
|
|
378
|
-
if category != "objective":
|
|
379
|
-
raise NotImplementedError(
|
|
380
|
-
"Initial QUEST taskset implementation supports category='objective' only"
|
|
381
|
-
)
|
|
382
408
|
self.dataset_name = dataset_name
|
|
383
409
|
self.split = split
|
|
384
410
|
self.category = category
|
|
@@ -397,9 +423,13 @@ class QuestTaskSet(SandboxTaskSet):
|
|
|
397
423
|
self._judge_api_key_var = judge_api_key_var
|
|
398
424
|
self._judge_sampling_args = dict(judge_sampling_args or {})
|
|
399
425
|
self._quest_cache_dir = quest_cache_dir
|
|
400
|
-
self._quest_eval_scripts_root =
|
|
401
|
-
|
|
402
|
-
|
|
426
|
+
self._quest_eval_scripts_root = (
|
|
427
|
+
None
|
|
428
|
+
if category == "open-ended" and quest_eval_scripts_dir is None
|
|
429
|
+
else _resolve_eval_scripts_root(
|
|
430
|
+
dataset_name=dataset_name,
|
|
431
|
+
eval_scripts_dir=quest_eval_scripts_dir,
|
|
432
|
+
)
|
|
403
433
|
)
|
|
404
434
|
self._quest_eval_concurrency = quest_eval_concurrency
|
|
405
435
|
super().__init__(
|
|
@@ -416,15 +446,29 @@ class QuestTaskSet(SandboxTaskSet):
|
|
|
416
446
|
num_proc=self.ds_num_proc,
|
|
417
447
|
)
|
|
418
448
|
rows: list[dict[str, Any]] = []
|
|
419
|
-
for row in raw:
|
|
420
|
-
|
|
449
|
+
for row_index, row in enumerate(raw):
|
|
450
|
+
row_category = row.get("rl_task_category")
|
|
451
|
+
if self.category != "all" and row_category != self.category:
|
|
421
452
|
continue
|
|
453
|
+
if row_category not in {"objective", "open-ended"}:
|
|
454
|
+
raise ValueError(
|
|
455
|
+
f"Unsupported QUEST row category at dataset index {row_index}: "
|
|
456
|
+
f"{row_category!r}"
|
|
457
|
+
)
|
|
422
458
|
extra_info = _parse_literal(row.get("extra_info"))
|
|
423
459
|
reward_model = _parse_literal(row.get("reward_model"))
|
|
424
460
|
question = _extract_question(row.get("prompt"), extra_info)
|
|
425
461
|
task_id = _extract_task_id(reward_model, extra_info)
|
|
426
|
-
if
|
|
427
|
-
|
|
462
|
+
if not task_id:
|
|
463
|
+
raise ValueError(
|
|
464
|
+
f"QUEST {row_category} row is missing task_id metadata "
|
|
465
|
+
f"at dataset index {row_index}"
|
|
466
|
+
)
|
|
467
|
+
if row_category == "open-ended" and not isinstance(reward_model, dict):
|
|
468
|
+
raise ValueError(
|
|
469
|
+
"QUEST open-ended row has invalid reward_model metadata "
|
|
470
|
+
f"at dataset index {row_index}"
|
|
471
|
+
)
|
|
428
472
|
rows.append(
|
|
429
473
|
{
|
|
430
474
|
"question": question,
|
|
@@ -479,7 +523,11 @@ class QuestTaskSet(SandboxTaskSet):
|
|
|
479
523
|
return QuestRubric(
|
|
480
524
|
answer_file=self.answer_file,
|
|
481
525
|
dataset_name=self.dataset_name,
|
|
482
|
-
eval_scripts_dir=
|
|
526
|
+
eval_scripts_dir=(
|
|
527
|
+
str(self._quest_eval_scripts_root)
|
|
528
|
+
if self._quest_eval_scripts_root is not None
|
|
529
|
+
else None
|
|
530
|
+
),
|
|
483
531
|
cache_dir=self._quest_cache_dir,
|
|
484
532
|
judge_model=self._judge_model,
|
|
485
533
|
judge_base_url=self._judge_base_url,
|
|
@@ -490,7 +538,7 @@ class QuestTaskSet(SandboxTaskSet):
|
|
|
490
538
|
|
|
491
539
|
|
|
492
540
|
class QuestRubric(vf.Rubric):
|
|
493
|
-
"""Scores QUEST objective
|
|
541
|
+
"""Scores QUEST objective and open-ended tasks."""
|
|
494
542
|
|
|
495
543
|
def __init__(
|
|
496
544
|
self,
|
|
@@ -524,22 +572,28 @@ class QuestRubric(vf.Rubric):
|
|
|
524
572
|
self._client: QuestOpenAIClient | None = None
|
|
525
573
|
self._semaphore = asyncio.Semaphore(eval_concurrency)
|
|
526
574
|
self._scripts_root: Path | None = None
|
|
527
|
-
self.add_reward_func(self.
|
|
575
|
+
self.add_reward_func(self.quest_reward, weight=1.0)
|
|
528
576
|
|
|
529
|
-
async def
|
|
577
|
+
async def _quest_score_for_state(self, state: vf.State) -> float:
    """Compute the QUEST score for one state, recording ``vf.Error`` failures.

    Returns ``0.0`` without scoring when the state already carries an error.
    A ``vf.Error`` raised while scoring is stored in ``state["error"]`` and
    also yields ``0.0``. Any other exception propagates.
    """
    if state.get("error") is None:
        try:
            return await self.quest_reward(state)
        except vf.Error as exc:
            state["error"] = exc
    return 0.0
|
|
537
585
|
|
|
586
|
+
def _metric_name(self, state: vf.State) -> str:
|
|
587
|
+
info = state.get("info") or {}
|
|
588
|
+
if info.get("rl_task_category") == "open-ended":
|
|
589
|
+
return "open_ended_reward"
|
|
590
|
+
return "objective_reward"
|
|
591
|
+
|
|
538
592
|
async def score_rollout(self, state: vf.State) -> None:
    """Score a single rollout and write the reward and metrics onto the state.

    Sets ``state["reward"]`` to the score, then replaces ``state["metrics"]``
    with the score stored under ``"quest_reward"`` and under the
    category-specific metric name. QUEST ``vf.Error`` failures are kept in
    ``state["error"]`` by the scoring helper instead of being raised.
    """
    reward = await self._quest_score_for_state(state)
    state["reward"] = reward
    category_metric = self._metric_name(state)
    state["metrics"] = {"quest_reward": reward, category_metric: reward}
|
|
543
597
|
|
|
544
598
|
async def score_group(self, states: list[vf.State]) -> None:
|
|
545
599
|
"""Score rollouts while preserving QUEST infrastructure failures as ``vf.Error`` values."""
|
|
@@ -547,7 +601,7 @@ class QuestRubric(vf.Rubric):
|
|
|
547
601
|
logger.warning("No states to score")
|
|
548
602
|
return
|
|
549
603
|
scores = await asyncio.gather(
|
|
550
|
-
*(self.
|
|
604
|
+
*(self._quest_score_for_state(state) for state in states)
|
|
551
605
|
)
|
|
552
606
|
avg_score = sum(scores) / len(scores)
|
|
553
607
|
for state, score in zip(states, scores):
|
|
@@ -559,9 +613,15 @@ class QuestRubric(vf.Rubric):
|
|
|
559
613
|
turn["advantage"] = state["advantage"]
|
|
560
614
|
if turn.get("reward") is None:
|
|
561
615
|
turn["reward"] = state["reward"]
|
|
562
|
-
state["metrics"] = {"
|
|
616
|
+
state["metrics"] = {"quest_reward": score, self._metric_name(state): score}
|
|
563
617
|
|
|
564
|
-
async def
|
|
618
|
+
async def quest_reward(self, state: vf.State, **_: Any) -> float:
    """Dispatch to the scorer matching the task category in the state's info.

    ``"open-ended"`` tasks go to ``open_ended_reward``; everything else,
    including missing category info, goes to ``objective_reward``. Extra
    keyword arguments are accepted and ignored.
    """
    category = (state.get("info") or {}).get("rl_task_category")
    scorer = (
        self.open_ended_reward if category == "open-ended" else self.objective_reward
    )
    return await scorer(state)
|
|
623
|
+
|
|
624
|
+
async def _read_answer(self, state: vf.State) -> tuple[str, str]:
|
|
565
625
|
sandbox_client = state.get("sandbox_client")
|
|
566
626
|
sandbox_id = state.get("sandbox_id")
|
|
567
627
|
if not sandbox_client or not sandbox_id:
|
|
@@ -583,6 +643,10 @@ class QuestRubric(vf.Rubric):
|
|
|
583
643
|
answer_source = "completion_fallback" if answer else "missing"
|
|
584
644
|
state["quest_answer"] = answer
|
|
585
645
|
state["quest_answer_source"] = answer_source
|
|
646
|
+
return answer, answer_source
|
|
647
|
+
|
|
648
|
+
async def objective_reward(self, state: vf.State, **_: Any) -> float:
|
|
649
|
+
answer, _ = await self._read_answer(state)
|
|
586
650
|
if not answer:
|
|
587
651
|
state["quest_eval_error"] = "empty_answer"
|
|
588
652
|
return 0.0
|
|
@@ -613,6 +677,38 @@ class QuestRubric(vf.Rubric):
|
|
|
613
677
|
state["quest_final_score"] = final_score
|
|
614
678
|
return final_score
|
|
615
679
|
|
|
680
|
+
async def open_ended_reward(self, state: vf.State, **_: Any) -> float:
    """Score an open-ended QUEST rollout against its reference answer and rubric.

    An empty answer records ``state["quest_eval_error"] = "empty_answer"`` and
    scores ``0.0``. Otherwise the judge summary is stored in
    ``state["quest_eval_summary"]``, and the clipped reward and the upstream
    pairwise score are stored alongside it.

    Returns:
        The final score as a finite float in ``[0.0, 1.0]``.

    Raises:
        ValueError: If ``task_id`` or ``reward_model`` metadata is missing
            from ``state["info"]``.
    """
    answer, _source = await self._read_answer(state)
    if not answer:
        state["quest_eval_error"] = "empty_answer"
        return 0.0

    metadata = state.get("info") or {}
    task_id = metadata.get("task_id")
    if not (isinstance(task_id, str) and task_id):
        raise ValueError("QUEST open-ended task is missing task_id metadata")
    reward_model = metadata.get("reward_model")
    if not isinstance(reward_model, dict):
        raise ValueError("QUEST open-ended task is missing reward_model metadata")
    question = str(metadata.get("question") or "")
    state["quest_task_id"] = task_id

    summary = await score_open_ended_answer(
        client=self._get_client(),
        model=self.judge_model,
        semaphore=self._semaphore,
        answer=answer,
        question=question,
        reward_model=reward_model,
    )
    state["quest_eval_summary"] = summary

    # Defensive normalization: a missing, falsy or non-finite score counts as 0.
    reward = float(summary.get("final_score", 0.0) or 0.0)
    if math.isfinite(reward):
        reward = max(0.0, min(1.0, reward))
    else:
        reward = 0.0
    state["quest_final_score"] = reward
    state["quest_upstream_pairwise_score"] = summary.get("upstream_pairwise_score")
    return reward
|
|
711
|
+
|
|
616
712
|
def _get_client(self) -> QuestOpenAIClient:
|
|
617
713
|
if self._client is not None:
|
|
618
714
|
return self._client
|