verifiers 0.1.15.dev167__tar.gz → 0.1.15.dev169__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/PKG-INFO +5 -1
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/pyproject.toml +4 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_environment.py +24 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_trajectory_processing.py +43 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_runtime_lifecycle.py +180 -3
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/anthropic_messages_client.py +6 -6
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/openai_responses_client.py +2 -2
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/README.md +35 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/__init__.py +5 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/README.md +52 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/__init__.py +5 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/__init__.py +17 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/api_tools/__init__.py +5 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/api_tools/tool_pdf.py +275 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/eval_toolkit.py +1119 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/evaluator.py +1271 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/llm_client/__init__.py +5 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/llm_client/base_client.py +15 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/prompts/__init__.py +4 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/prompts/cache_prompts.py +15 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/__init__.py +7 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/cache_filesys.py +45 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/load_eval_script.py +107 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/misc.py +106 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/tool_visit.py +69 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/utils/url_tools.py +27 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/obj_task_eval/verification_tree.py +153 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/taskset.py +667 -0
- verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/search_tasksets.py +26 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/types.py +35 -1
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/async_utils.py +14 -15
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/interception_utils.py +38 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/harness.py +9 -2
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/endpoint_utils.py +131 -25
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/sandbox_program_utils.py +40 -240
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/sandbox_utils.py +58 -17
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/.gitignore +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/LICENSE +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/README.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/AGENTS.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/README.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/conftest.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_build_script.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_client_config.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_env_group.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_env_server.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_envs.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_eval_cli.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_gepa_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_imports.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_init_script.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_lean_task.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_logging.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_mcp_search_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_openenv_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_parser.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_per_turn_timing.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_pricing_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_renderer_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_renderer_e2e.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_rubric.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_types.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_bfcl.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_config_extension.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_empty_completions.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_endpoint_protocols.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_example_counts.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_group_reward_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_harbor_cli.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_nemo_gym_harness.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_openenv_taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_openreward_taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_replay_harness.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_rlm_swe.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_scoring_functions.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_taskset_bindings.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_taskset_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_v1_textarena_taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_wiki_search_v1.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_wordle_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_wordle_v1_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/openai_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/renderer_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/decorators.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/environment.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/composable_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/task.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/README.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/extract_fix_patch.sh +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe/taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/openswe/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/openswe/taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/log_parser.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym/taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/scale_swe/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/scale_swe/taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/shared/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/shared/test_patch.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench/taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego/taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/log_parsers.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2/taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith/taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/errors.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/eval.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/env_config_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/eval_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/pricing_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/save_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/utils/version_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/README.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/RE_MIGRATION.md +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/artifact.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/config.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/env.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/model.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/program.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/runtime.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/runtime_handles.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/sandbox.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/state.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/task.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/taskset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/toolset.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/types.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/user.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/binding_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/config_callable_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/json_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/judge_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/lifecycle_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/logging_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/mcp_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/object_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/program_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/prompt_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/runtime_owner_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/runtime_registry.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/sandbox_python_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/scoring_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/serialization_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/task_freeze_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/taskset_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/toolset_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/trajectory_utils.py +0 -0
- {verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/v1/utils/usage_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.15.dev167
|
|
3
|
+
Version: 0.1.15.dev169
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -22,8 +22,10 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
23
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
24
|
Requires-Python: <3.14,>=3.10
|
|
25
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
25
26
|
Requires-Dist: aiolimiter>=1.2.1
|
|
26
27
|
Requires-Dist: anthropic>=0.78.0
|
|
28
|
+
Requires-Dist: certifi
|
|
27
29
|
Requires-Dist: datasets<4.7.0,>=3.0.0
|
|
28
30
|
Requires-Dist: gepa
|
|
29
31
|
Requires-Dist: httpx>=0.27.0
|
|
@@ -35,10 +37,12 @@ Requires-Dist: nest-asyncio>=1.6.0
|
|
|
35
37
|
Requires-Dist: numpy
|
|
36
38
|
Requires-Dist: openai-agents>=0.0.7
|
|
37
39
|
Requires-Dist: openai>=1.108.1
|
|
40
|
+
Requires-Dist: pillow
|
|
38
41
|
Requires-Dist: prime-pydantic-config[toml]
|
|
39
42
|
Requires-Dist: prime-sandboxes>=0.2.25
|
|
40
43
|
Requires-Dist: prime-tunnel>=0.1.6
|
|
41
44
|
Requires-Dist: pydantic>=2.11.9
|
|
45
|
+
Requires-Dist: pymupdf
|
|
42
46
|
Requires-Dist: pyzmq>=27.1.0
|
|
43
47
|
Requires-Dist: regex<2026.4.4
|
|
44
48
|
Requires-Dist: requests
|
|
@@ -53,6 +53,10 @@ dependencies = [
|
|
|
53
53
|
"setproctitle>=1.3.0",
|
|
54
54
|
"regex<2026.4.4",
|
|
55
55
|
"httpx>=0.27.0",
|
|
56
|
+
"aiohttp>=3.9.0",
|
|
57
|
+
"pymupdf",
|
|
58
|
+
"pillow",
|
|
59
|
+
"certifi",
|
|
56
60
|
"prime-pydantic-config[toml]",
|
|
57
61
|
"uvloop>=0.21.0; sys_platform != 'win32' and sys_platform != 'cygwin' and platform_python_implementation != 'PyPy'",
|
|
58
62
|
]
|
|
@@ -697,6 +697,30 @@ class TestMaybeRetry:
|
|
|
697
697
|
error_data = rollout_outputs[0]["error"]
|
|
698
698
|
assert "InfraError" == error_data["error"]
|
|
699
699
|
|
|
700
|
+
@pytest.mark.asyncio
|
|
701
|
+
async def test_retries_serialized_infra_error_subclass(self):
|
|
702
|
+
"""A serialized InfraError subclass (e.g. SandboxError) in returned state
|
|
703
|
+
must trigger retry.
|
|
704
|
+
|
|
705
|
+
The v1 harness serializes state["error"] to ErrorData before maybe_retry
|
|
706
|
+
inspects it, so matching must be subclass-aware (rebuild concrete error +
|
|
707
|
+
isinstance) — base-name substring matching missed SandboxError, which is
|
|
708
|
+
an InfraError and should be retried.
|
|
709
|
+
"""
|
|
710
|
+
from verifiers.utils.async_utils import maybe_retry
|
|
711
|
+
from verifiers.utils.error_utils import error_data
|
|
712
|
+
|
|
713
|
+
serialized = error_data(vf.SandboxError("Program file upload failed"))
|
|
714
|
+
calls = {"n": 0}
|
|
715
|
+
|
|
716
|
+
async def attempt():
|
|
717
|
+
calls["n"] += 1
|
|
718
|
+
return {"error": serialized}
|
|
719
|
+
|
|
720
|
+
result = await maybe_retry(attempt, max_retries=2, initial=0.0, max_wait=0.0)()
|
|
721
|
+
assert calls["n"] == 3 # 1 initial + 2 retries (InfraError is retryable)
|
|
722
|
+
assert result["error"] == serialized # last result returned after exhaustion
|
|
723
|
+
|
|
700
724
|
|
|
701
725
|
class TestEmptyModelResponseErrors:
|
|
702
726
|
"""Test cases for empty and invalid model response error handling."""
|
|
@@ -282,6 +282,49 @@ async def test_parsed_prompt_attribution_survives_v1_assert_serializable():
|
|
|
282
282
|
State({"trajectory": [step]}).assert_serializable()
|
|
283
283
|
|
|
284
284
|
|
|
285
|
+
def test_assert_serializable_accepts_msgpack_sidecars_rejects_unknown():
|
|
286
|
+
"""The ``assert_serializable`` json.dumps gate must accept exactly what the
|
|
287
|
+
trainer transport (msgpack) accepts, while staying strict otherwise.
|
|
288
|
+
|
|
289
|
+
Trajectory token steps carry sidecars that are non-JSON by design and reach
|
|
290
|
+
the trainer via msgpack, not JSON: the renderer ``MultiModalData`` (a
|
|
291
|
+
dataclass holding numpy pixel arrays) and ``routed_experts`` (a raw
|
|
292
|
+
``memoryview`` buffer). Both must clear the gate; any other
|
|
293
|
+
non-serializable object must still raise.
|
|
294
|
+
"""
|
|
295
|
+
import dataclasses
|
|
296
|
+
|
|
297
|
+
import numpy as np
|
|
298
|
+
|
|
299
|
+
@dataclasses.dataclass
|
|
300
|
+
class _FakeMultiModalData:
|
|
301
|
+
mm_hashes: dict
|
|
302
|
+
mm_items: dict
|
|
303
|
+
mm_placeholders: dict
|
|
304
|
+
|
|
305
|
+
mm = _FakeMultiModalData(
|
|
306
|
+
mm_hashes={"image": ["h1"]},
|
|
307
|
+
mm_items={"image": [np.zeros((2, 2), dtype=np.uint8)]},
|
|
308
|
+
mm_placeholders={"image": [{"offset": 0, "length": 4}]},
|
|
309
|
+
)
|
|
310
|
+
step = {
|
|
311
|
+
"tokens": {
|
|
312
|
+
"prompt_ids": [1, 2],
|
|
313
|
+
"multi_modal_data": mm,
|
|
314
|
+
"routed_experts": {"data": memoryview(b"abc"), "shape": [3], "start": 0},
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
# Must not raise: both sidecars are msgpack-transported, not JSON.
|
|
318
|
+
State({"trajectory": [step]}).assert_serializable()
|
|
319
|
+
|
|
320
|
+
# A genuinely non-serializable object must still be rejected.
|
|
321
|
+
class _Unknown:
|
|
322
|
+
pass
|
|
323
|
+
|
|
324
|
+
with pytest.raises(TypeError):
|
|
325
|
+
State({"trajectory": [{"tokens": _Unknown()}]}).assert_serializable()
|
|
326
|
+
|
|
327
|
+
|
|
285
328
|
def test_process_trajectory_steps_for_training(make_input):
|
|
286
329
|
"""Test processing trajectory steps into training examples."""
|
|
287
330
|
state1 = State(
|
|
@@ -110,6 +110,15 @@ class BlockingModelClient(CapturingModelClient):
|
|
|
110
110
|
return await super().get_response(**kwargs)
|
|
111
111
|
|
|
112
112
|
|
|
113
|
+
class RaisingModelClient:
|
|
114
|
+
def __init__(self, error: vf.Error):
|
|
115
|
+
self.error = error
|
|
116
|
+
|
|
117
|
+
async def get_response(self, **kwargs: object) -> Response:
|
|
118
|
+
_ = kwargs
|
|
119
|
+
raise self.error
|
|
120
|
+
|
|
121
|
+
|
|
113
122
|
class FakeCreateSandboxRequest:
|
|
114
123
|
def __init__(self, **kwargs: object):
|
|
115
124
|
self.kwargs = kwargs
|
|
@@ -517,6 +526,31 @@ async def endpoint_program(task, state):
|
|
|
517
526
|
}
|
|
518
527
|
|
|
519
528
|
|
|
529
|
+
async def endpoint_model_error_program(task, state):
|
|
530
|
+
_ = task
|
|
531
|
+
root = state["endpoint_root_url"].rstrip("/")
|
|
532
|
+
endpoint_client = cast(OpenAI, state.get_client(api="chat", sync=True))
|
|
533
|
+
auth_headers = {"Authorization": f"Bearer {endpoint_client.api_key}"}
|
|
534
|
+
endpoint_client.close()
|
|
535
|
+
|
|
536
|
+
def post_model() -> None:
|
|
537
|
+
request = urllib.request.Request(
|
|
538
|
+
f"{root}/vf/model",
|
|
539
|
+
data=json.dumps(
|
|
540
|
+
{"messages": [{"role": "user", "content": "too long"}]}
|
|
541
|
+
).encode(),
|
|
542
|
+
headers={"content-type": "application/json", **auth_headers},
|
|
543
|
+
)
|
|
544
|
+
with urllib.request.urlopen(request):
|
|
545
|
+
pass
|
|
546
|
+
|
|
547
|
+
try:
|
|
548
|
+
await asyncio.to_thread(post_model)
|
|
549
|
+
except Exception as exc:
|
|
550
|
+
raise vf.SandboxError("Sandbox command failed") from exc
|
|
551
|
+
raise AssertionError("Expected /vf/model to fail")
|
|
552
|
+
|
|
553
|
+
|
|
520
554
|
async def endpoint_trajectory_program(task, state):
|
|
521
555
|
_ = task
|
|
522
556
|
root = state["endpoint_root_url"].rstrip("/")
|
|
@@ -725,6 +759,7 @@ for _name, _value in {
|
|
|
725
759
|
"initialize_from_taskset": initialize_from_taskset,
|
|
726
760
|
"child_reads_program_sandbox": child_reads_program_sandbox,
|
|
727
761
|
"endpoint_program": endpoint_program,
|
|
762
|
+
"endpoint_model_error_program": endpoint_model_error_program,
|
|
728
763
|
"endpoint_trajectory_program": endpoint_trajectory_program,
|
|
729
764
|
"concurrent_endpoint_program": concurrent_endpoint_program,
|
|
730
765
|
"mcp_proxy_program": mcp_proxy_program,
|
|
@@ -827,6 +862,41 @@ async def test_endpoint_exposes_tool_user_and_stop_surfaces() -> None:
|
|
|
827
862
|
assert "endpoint_root_url" not in state
|
|
828
863
|
|
|
829
864
|
|
|
865
|
+
@pytest.mark.asyncio
|
|
866
|
+
async def test_vf_model_bridge_preserves_overlong_prompt_error() -> None:
|
|
867
|
+
harness = make_harness(
|
|
868
|
+
program={"fn": program_ref("endpoint_model_error_program")},
|
|
869
|
+
model="test-model",
|
|
870
|
+
client=RaisingModelClient(vf.OverlongPromptError("too long")),
|
|
871
|
+
)
|
|
872
|
+
task = vf.Task({"prompt": [{"role": "user", "content": "hi"}]}).freeze()
|
|
873
|
+
|
|
874
|
+
state = await harness.run(task)
|
|
875
|
+
await harness.teardown()
|
|
876
|
+
|
|
877
|
+
assert state["prompt_too_long"] is True
|
|
878
|
+
assert state["is_truncated"] is True
|
|
879
|
+
assert state["stop_condition"] == "prompt_too_long"
|
|
880
|
+
assert state.get("error") is None
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
@pytest.mark.asyncio
|
|
884
|
+
async def test_vf_model_bridge_preserves_model_error() -> None:
|
|
885
|
+
harness = make_harness(
|
|
886
|
+
program={"fn": program_ref("endpoint_model_error_program")},
|
|
887
|
+
model="test-model",
|
|
888
|
+
client=RaisingModelClient(vf.ModelError("model failed")),
|
|
889
|
+
)
|
|
890
|
+
task = vf.Task({"prompt": [{"role": "user", "content": "hi"}]}).freeze()
|
|
891
|
+
|
|
892
|
+
state = await harness.run(task)
|
|
893
|
+
await harness.teardown()
|
|
894
|
+
|
|
895
|
+
assert state["stop_condition"] == "has_error"
|
|
896
|
+
assert state["error"]["error"] == "ModelError"
|
|
897
|
+
assert "SandboxError" not in state["error"]["error_chain_str"]
|
|
898
|
+
|
|
899
|
+
|
|
830
900
|
@pytest.mark.asyncio
|
|
831
901
|
async def test_endpoint_request_can_hide_internal_model_call_from_trajectory() -> None:
|
|
832
902
|
client = FakeModelClient([fake_response("hidden"), fake_response("shown")])
|
|
@@ -1462,6 +1532,70 @@ async def test_create_sandbox_cleans_up_wait_failure_with_retry(
|
|
|
1462
1532
|
assert client.delete_calls == 2
|
|
1463
1533
|
|
|
1464
1534
|
|
|
1535
|
+
@pytest.mark.asyncio
|
|
1536
|
+
async def test_upload_program_files_retries_transient_transfer_error(
|
|
1537
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
1538
|
+
) -> None:
|
|
1539
|
+
install_fake_sandboxes(monkeypatch)
|
|
1540
|
+
disable_sandbox_retry_sleep(monkeypatch)
|
|
1541
|
+
|
|
1542
|
+
class FlakyUploadClient:
|
|
1543
|
+
calls = 0
|
|
1544
|
+
|
|
1545
|
+
async def upload_bytes(self, *args: object, **kwargs: object) -> None:
|
|
1546
|
+
_ = args, kwargs
|
|
1547
|
+
self.calls += 1
|
|
1548
|
+
if self.calls == 1:
|
|
1549
|
+
raise FakeAPIError("Upload failed: ")
|
|
1550
|
+
|
|
1551
|
+
client = FlakyUploadClient()
|
|
1552
|
+
task = vf.Task({"prompt": [{"role": "user", "content": "hi"}]}).freeze()
|
|
1553
|
+
state = vf.State.for_task(task)
|
|
1554
|
+
|
|
1555
|
+
await sandbox_utils.upload_program_files(
|
|
1556
|
+
cast(sandbox_utils.SandboxClient, client),
|
|
1557
|
+
"sbx-upload",
|
|
1558
|
+
{"files": {"/tmp/file.txt": "content"}},
|
|
1559
|
+
task,
|
|
1560
|
+
state,
|
|
1561
|
+
Runtime(),
|
|
1562
|
+
)
|
|
1563
|
+
|
|
1564
|
+
assert client.calls == 2
|
|
1565
|
+
|
|
1566
|
+
|
|
1567
|
+
@pytest.mark.asyncio
|
|
1568
|
+
async def test_upload_program_files_does_not_retry_non_transient_api_error(
|
|
1569
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
1570
|
+
) -> None:
|
|
1571
|
+
install_fake_sandboxes(monkeypatch)
|
|
1572
|
+
disable_sandbox_retry_sleep(monkeypatch)
|
|
1573
|
+
|
|
1574
|
+
class FailingUploadClient:
|
|
1575
|
+
calls = 0
|
|
1576
|
+
|
|
1577
|
+
async def upload_bytes(self, *args: object, **kwargs: object) -> None:
|
|
1578
|
+
_ = args, kwargs
|
|
1579
|
+
self.calls += 1
|
|
1580
|
+
raise FakeAPIError("Upload failed: HTTP 400: bad request")
|
|
1581
|
+
|
|
1582
|
+
client = FailingUploadClient()
|
|
1583
|
+
task = vf.Task({"prompt": [{"role": "user", "content": "hi"}]}).freeze()
|
|
1584
|
+
state = vf.State.for_task(task)
|
|
1585
|
+
|
|
1586
|
+
with pytest.raises(vf.SandboxError, match="HTTP 400"):
|
|
1587
|
+
await sandbox_utils.upload_program_files(
|
|
1588
|
+
cast(sandbox_utils.SandboxClient, client),
|
|
1589
|
+
"sbx-upload",
|
|
1590
|
+
{"files": {"/tmp/file.txt": "content"}},
|
|
1591
|
+
task,
|
|
1592
|
+
state,
|
|
1593
|
+
Runtime(),
|
|
1594
|
+
)
|
|
1595
|
+
|
|
1596
|
+
assert client.calls == 1
|
|
1597
|
+
|
|
1598
|
+
|
|
1465
1599
|
@pytest.mark.asyncio
|
|
1466
1600
|
async def test_create_sandbox_cancellation_deletes_late_provider_result(
|
|
1467
1601
|
monkeypatch: pytest.MonkeyPatch,
|
|
@@ -1603,8 +1737,8 @@ async def test_sandbox_base_program_max_turns_zero_is_unbounded(
|
|
|
1603
1737
|
config_path.write_text(json.dumps({"max_turns": 0}))
|
|
1604
1738
|
namespace["RUNNER_CONFIG_PATH"] = str(config_path)
|
|
1605
1739
|
|
|
1606
|
-
async def create_model_message(state, messages
|
|
1607
|
-
_ = state, messages
|
|
1740
|
+
async def create_model_message(state, messages):
|
|
1741
|
+
_ = state, messages
|
|
1608
1742
|
return {"role": "assistant", "content": "done"}
|
|
1609
1743
|
|
|
1610
1744
|
async def call_user(state, messages):
|
|
@@ -1621,12 +1755,55 @@ async def test_sandbox_base_program_max_turns_zero_is_unbounded(
|
|
|
1621
1755
|
|
|
1622
1756
|
state = {"prompt": [{"role": "user", "content": "hi"}], "runtime": {}}
|
|
1623
1757
|
run_base = cast(Any, namespace["run_base"])
|
|
1624
|
-
result = await run_base({}, state
|
|
1758
|
+
result = await run_base({}, state)
|
|
1625
1759
|
|
|
1626
1760
|
assert result["completion"] == [{"role": "assistant", "content": "done"}]
|
|
1627
1761
|
assert result["stop_condition"] == "no_tools"
|
|
1628
1762
|
|
|
1629
1763
|
|
|
1764
|
+
@pytest.mark.asyncio
|
|
1765
|
+
async def test_sandbox_base_program_model_call_uses_vf_model_bridge() -> None:
|
|
1766
|
+
namespace: dict[str, object] = {}
|
|
1767
|
+
source = runner_source().rsplit("asyncio.run(main())", 1)[0]
|
|
1768
|
+
exec(source, namespace)
|
|
1769
|
+
|
|
1770
|
+
posted: list[tuple[str, Any, object]] = []
|
|
1771
|
+
|
|
1772
|
+
async def vf_post(state, path, payload, timeout=None):
|
|
1773
|
+
_ = state
|
|
1774
|
+
posted.append((path, payload, timeout))
|
|
1775
|
+
return {"message": {"role": "assistant", "content": "ok"}}
|
|
1776
|
+
|
|
1777
|
+
namespace["vf_post"] = vf_post
|
|
1778
|
+
create_model_message = cast(Any, namespace["create_model_message"])
|
|
1779
|
+
|
|
1780
|
+
# Canonical Messages (incl. an image content part) are sent unchanged over the
|
|
1781
|
+
# /vf/model bridge; the host owns client resolution + tokenization and returns
|
|
1782
|
+
# the assistant message.
|
|
1783
|
+
messages = [
|
|
1784
|
+
{"role": "user", "content": "hi"},
|
|
1785
|
+
{
|
|
1786
|
+
"role": "tool",
|
|
1787
|
+
"tool_call_id": "call_1",
|
|
1788
|
+
"content": [
|
|
1789
|
+
{"type": "text", "text": "shot"},
|
|
1790
|
+
{
|
|
1791
|
+
"type": "image_url",
|
|
1792
|
+
"image_url": {"url": "data:image/png;base64,AAA"},
|
|
1793
|
+
},
|
|
1794
|
+
],
|
|
1795
|
+
},
|
|
1796
|
+
]
|
|
1797
|
+
message = await create_model_message({"runtime": {}}, messages)
|
|
1798
|
+
|
|
1799
|
+
assert message == {"role": "assistant", "content": "ok"}
|
|
1800
|
+
assert len(posted) == 1
|
|
1801
|
+
path, payload, timeout = posted[0]
|
|
1802
|
+
assert path == "model"
|
|
1803
|
+
assert payload["messages"] == messages # image part preserved verbatim
|
|
1804
|
+
assert timeout is None
|
|
1805
|
+
|
|
1806
|
+
|
|
1630
1807
|
def test_sandbox_program_patch_cannot_set_lifecycle_fields() -> None:
|
|
1631
1808
|
state = vf.State.for_task(vf.Task({"prompt": []}).freeze())
|
|
1632
1809
|
|
{verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/anthropic_messages_client.py
RENAMED
|
@@ -214,15 +214,15 @@ class AnthropicMessagesClient(
|
|
|
214
214
|
return {}
|
|
215
215
|
|
|
216
216
|
def build_tool_result_block(message: ToolMessage) -> ToolResultBlockParam:
|
|
217
|
+
if isinstance(message.content, str):
|
|
218
|
+
result_content: Any = message.content
|
|
219
|
+
else:
|
|
220
|
+
# Keep images: image_url parts -> Anthropic image blocks (not "[image]" text).
|
|
221
|
+
result_content = normalize_anthropic_content(message.content)
|
|
217
222
|
return ToolResultBlockParam(
|
|
218
223
|
type="tool_result",
|
|
219
224
|
tool_use_id=message.tool_call_id,
|
|
220
|
-
content=cast(
|
|
221
|
-
Any,
|
|
222
|
-
message.content
|
|
223
|
-
if isinstance(message.content, str)
|
|
224
|
-
else " ".join(content_to_text_chunks(message.content)),
|
|
225
|
-
),
|
|
225
|
+
content=cast(Any, result_content),
|
|
226
226
|
)
|
|
227
227
|
|
|
228
228
|
def from_chat_message(message: Message) -> AnthropicMessageParam | None:
|
{verifiers-0.1.15.dev167 → verifiers-0.1.15.dev169}/verifiers/clients/openai_responses_client.py
RENAMED
|
@@ -156,8 +156,8 @@ class OpenAIResponsesClient(
|
|
|
156
156
|
if isinstance(message, ToolMessage):
|
|
157
157
|
output = message.content
|
|
158
158
|
if not isinstance(output, str):
|
|
159
|
-
|
|
160
|
-
output =
|
|
159
|
+
# Keep images: image_url parts -> Responses input_image (not text).
|
|
160
|
+
output = normalize_message_content(output)
|
|
161
161
|
return [
|
|
162
162
|
{
|
|
163
163
|
"type": "function_call_output",
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Search Tasksets
|
|
2
|
+
|
|
3
|
+
Composable search/research tasksets for agents that solve live information-seeking tasks in a sandbox.
|
|
4
|
+
|
|
5
|
+
The search family is intentionally backend-oriented, mirroring the SWE taskset pattern while keeping the task contract research-centric: each task expects a single final answer rather than a code patch. Agents may use web/search tools, browser helpers, or other sandbox resources provided by the paired environment.
|
|
6
|
+
|
|
7
|
+
## Backends
|
|
8
|
+
|
|
9
|
+
| Backend | Source | Default dataset | Status |
|
|
10
|
+
|---|---|---|---|
|
|
11
|
+
| `quest` | [OSU-NLP-Group/QUEST](https://github.com/OSU-NLP-Group/QUEST) | [`osunlp/QUEST-RL-Data`](https://huggingface.co/datasets/osunlp/QUEST-RL-Data) | Objective tasks supported |
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from verifiers.envs.experimental.composable.tasksets.search import make_search_taskset
|
|
17
|
+
|
|
18
|
+
taskset = make_search_taskset(backend="quest", category="objective")
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
`make_search_taskset()` dispatches by backend name. Unknown backends raise `ValueError` with the available backend list.
|
|
22
|
+
|
|
23
|
+
## Output Contract
|
|
24
|
+
|
|
25
|
+
Search tasksets should define their own output contract. The initial `quest` backend expects the agent to write one final researched response to `/task/answer.txt`, including supporting URLs/citations when available. Scratch reasoning, tool traces, and logs should not be written as the final answer.
|
|
26
|
+
|
|
27
|
+
## Error Handling
|
|
28
|
+
|
|
29
|
+
Search tasksets should use the framework error taxonomy for infrastructure failures:
|
|
30
|
+
|
|
31
|
+
- `vf.SandboxError` for sandbox setup, command, or lifecycle failures.
|
|
32
|
+
- `vf.ModelError` for judge/model provider failures.
|
|
33
|
+
- `vf.InfraError` for dataset, evaluator, or external runtime failures.
|
|
34
|
+
|
|
35
|
+
Incorrect answers should not set `state["error"]`; they should score normally, often as `0.0`.
|
verifiers-0.1.15.dev169/verifiers/envs/experimental/composable/tasksets/search/quest/README.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# QUEST Search Taskset
|
|
2
|
+
|
|
3
|
+
Objective QUEST tasks ported into the composable search taskset framework.
|
|
4
|
+
|
|
5
|
+
## Source
|
|
6
|
+
|
|
7
|
+
- Dataset: [`osunlp/QUEST-RL-Data`](https://huggingface.co/datasets/osunlp/QUEST-RL-Data)
|
|
8
|
+
- Upstream project: [`OSU-NLP-Group/QUEST`](https://github.com/OSU-NLP-Group/QUEST)
|
|
9
|
+
|
|
10
|
+
The taskset loads the Hugging Face dataset, filters to `rl_task_category == "objective"` by default, and uses the dataset-provided generated evaluation scripts under `eval_scripts/*.py`.
|
|
11
|
+
|
|
12
|
+
## Task Contract
|
|
13
|
+
|
|
14
|
+
Each example is a live research question. The agent should produce one final answer in `/task/answer.txt`.
|
|
15
|
+
|
|
16
|
+
The paired `rlm_search` environment prompts RLM to write this file and provides web search/open-page skills. The rubric can fall back to the final assistant text if the answer file is empty, but agents should still write the file directly.
|
|
17
|
+
|
|
18
|
+
## Scoring
|
|
19
|
+
|
|
20
|
+
`QuestRubric` loads the generated eval script for the example's `task_id` and calls its async `evaluate_answer(...)` entrypoint using the vendored minimal `obj_task_eval` runtime. The rollout reward is `summary["final_score"]`, clipped to `[0.0, 1.0]`.
|
|
21
|
+
|
|
22
|
+
Generated scripts may request URL-backed verification. PDF URLs are detected and parsed with the upstream QUEST PDF parser path before falling back to generic webpage retrieval.
|
|
23
|
+
|
|
24
|
+
This port intentionally preserves upstream QUEST behavior for URL-backed verification semantics. The upstream verifier generally treats invalid, irrelevant, or inaccessible cited webpages as unsupported claims, which can assign `0.0` to the affected verification node even when the immediate cause is source access such as a bot challenge, rate limit, timeout, or parser failure. Future work should consider a finer-grained source-access taxonomy so verifier infrastructure limitations can be distinguished from model-provided bad URLs or unsupported claims.
|
|
25
|
+
|
|
26
|
+
A reward of `0.0` with no `state["error"]` means the QUEST evaluator ran and judged the answer incorrect under the upstream-compatible scoring path. Infrastructure and evaluator failures outside normal QUEST source verification are represented with `vf.Error` subclasses instead of ad hoc success metrics.
|
|
27
|
+
|
|
28
|
+
## Error Handling
|
|
29
|
+
|
|
30
|
+
QUEST uses Verifiers' framework-managed error field for non-answer failures when the failure comes from external runtime systems:
|
|
31
|
+
|
|
32
|
+
- Missing live sandbox or answer-file read failure: `vf.SandboxError`.
|
|
33
|
+
- Transient judge provider/network/rate-limit/server failures: retryable `vf.InfraError`.
|
|
34
|
+
- Empty or invalid judge responses: retryable `vf.InvalidModelResponseError` / `vf.EmptyModelResponseError`.
|
|
35
|
+
- Judge auth, model-not-found, content-filter, or invalid request failures: non-retryable `vf.ModelError`.
|
|
36
|
+
- QUEST eval-script download/cache resolution failure: `vf.InfraError`.
|
|
37
|
+
|
|
38
|
+
Wrong answers, empty answers, and inaccessible or irrelevant cited sources remain ordinary scored outcomes and return `0.0` without setting `state["error"]`. Generated eval-script source errors, missing task metadata, missing eval-script files, import/load failures, and unexpected evaluator runtime bugs are not converted to `vf.Error`; they raise normally so broken evaluator code fails hard.
|
|
39
|
+
|
|
40
|
+
## Common Arguments
|
|
41
|
+
|
|
42
|
+
| Argument | Default | Description |
|
|
43
|
+
|---|---:|---|
|
|
44
|
+
| `dataset_name` | `osunlp/QUEST-RL-Data` | Hugging Face dataset name. |
|
|
45
|
+
| `split` | `train` | Dataset split. |
|
|
46
|
+
| `category` | `objective` | Initial implementation supports objective tasks only. |
|
|
47
|
+
| `answer_file` | `/task/answer.txt` | Final answer path in the sandbox. |
|
|
48
|
+
| `judge_model` | `openai/gpt-5.4-mini` | OpenAI-compatible model for QUEST verifier calls. |
|
|
49
|
+
| `judge_base_url` | `https://api.pinference.ai/api/v1` | Judge API base URL. |
|
|
50
|
+
| `judge_api_key_var` | `PRIME_API_KEY` | Env var containing the judge API key. |
|
|
51
|
+
| `quest_eval_scripts_dir` | HF cache | Optional local directory containing `eval_scripts/*.py`. |
|
|
52
|
+
| `quest_cache_dir` | `~/.cache/verifiers/quest` | Host cache for QUEST verifier state. |
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Vendored QUEST objective evaluation runtime."""
|
|
2
|
+
|
|
3
|
+
from .eval_toolkit import BinaryEvalResult, Extractor, Verifier, create_evaluator
|
|
4
|
+
from .evaluator import Evaluator
|
|
5
|
+
from .utils import CacheFileSys
|
|
6
|
+
from .verification_tree import AggregationStrategy, VerificationNode
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"AggregationStrategy",
|
|
10
|
+
"BinaryEvalResult",
|
|
11
|
+
"CacheFileSys",
|
|
12
|
+
"Evaluator",
|
|
13
|
+
"Extractor",
|
|
14
|
+
"Verifier",
|
|
15
|
+
"VerificationNode",
|
|
16
|
+
"create_evaluator",
|
|
17
|
+
]
|