verifiers 0.1.15.dev7__tar.gz → 0.1.15.dev9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/PKG-INFO +33 -25
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/README.md +30 -23
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/pyproject.toml +5 -3
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_client_auth_errors.py +3 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_envs.py +43 -9
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_eval_cli.py +138 -2
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_imports.py +20 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_mcp_search_env.py +13 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_openai_chat_completions_token_client.py +21 -2
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_opencode_harbor.py +9 -9
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_renderer_client.py +45 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_rlm_env.py +1 -40
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_save_utils.py +31 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_tool_utils.py +11 -6
- verifiers-0.1.15.dev9/tests/test_tui_info_formatting.py +9 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_bfcl.py +8 -13
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_config_extension.py +917 -334
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_group_reward_env.py +1 -4
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_harbor_cli.py +83 -34
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_mini_swe_agent.py +7 -2
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_rlm_swe.py +36 -14
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_runtime_lifecycle.py +131 -53
- verifiers-0.1.15.dev9/tests/test_v1_taskset_bindings.py +347 -0
- verifiers-0.1.15.dev9/tests/test_v1_textarena_taskset.py +219 -0
- verifiers-0.1.15.dev9/tests/test_wiki_search_v1.py +136 -0
- verifiers-0.1.15.dev9/tests/test_wordle_v1_env.py +118 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/AGENTS.md +2 -3
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/__init__.py +22 -1
- verifiers-0.1.15.dev9/verifiers/cli/tui.py +9 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/clients/openai_chat_completions_client.py +22 -20
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/clients/openai_chat_completions_token_client.py +12 -10
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/clients/openai_completions_client.py +0 -4
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/clients/renderer_client.py +52 -36
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/env_group.py +0 -16
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/environment.py +0 -11
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +1 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +1 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +1 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +1 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +1 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +1 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +1 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/sandbox_env.py +1 -5
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/gepa/display.py +2 -2
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rubrics/rubric.py +0 -21
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/eval.py +11 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/init.py +65 -27
- verifiers-0.1.15.dev9/verifiers/scripts/tui.py +11 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/types.py +8 -2
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/async_utils.py +0 -8
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/client_utils.py +28 -5
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/data_utils.py +0 -52
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/display_utils.py +1 -23
- verifiers-0.1.15.dev9/verifiers/utils/env_utils.py +259 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/error_utils.py +0 -10
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/eval_utils.py +107 -1
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/message_utils.py +0 -6
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/response_utils.py +12 -25
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/save_utils.py +39 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/thread_utils.py +0 -12
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/tool_utils.py +1 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +101 -62
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/README.md +281 -221
- verifiers-0.1.15.dev9/verifiers/v1/RE_MIGRATION.md +472 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/__init__.py +21 -0
- verifiers-0.1.15.dev9/verifiers/v1/config.py +360 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/env.py +53 -7
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/harness.py +60 -111
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/packages/harnesses/__init__.py +10 -1
- verifiers-0.1.15.dev9/verifiers/v1/packages/harnesses/command.py +164 -0
- verifiers-0.1.15.dev9/verifiers/v1/packages/harnesses/configs.py +168 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/packages/harnesses/mini_swe_agent.py +72 -104
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/packages/harnesses/opencode.py +58 -145
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/packages/harnesses/pi.py +57 -74
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/packages/harnesses/rlm.py +23 -79
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/packages/harnesses/terminus_2.py +65 -94
- verifiers-0.1.15.dev9/verifiers/v1/packages/tasksets/__init__.py +17 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/packages/tasksets/harbor.py +50 -102
- verifiers-0.1.15.dev9/verifiers/v1/packages/tasksets/textarena.py +153 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/runtime.py +44 -11
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/taskset.py +29 -133
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/toolset.py +74 -30
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/types.py +2 -7
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/user.py +12 -8
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/binding_utils.py +4 -2
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/config_callable_utils.py +4 -0
- verifiers-0.1.15.dev9/verifiers/v1/utils/config_utils.py +119 -0
- verifiers-0.1.15.dev9/verifiers/v1/utils/object_utils.py +59 -0
- verifiers-0.1.15.dev9/verifiers/v1/utils/runtime_owner_utils.py +124 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/scoring_utils.py +0 -13
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/taskset_utils.py +36 -2
- verifiers-0.1.15.dev7/tests/test_tui_info_formatting.py +0 -1524
- verifiers-0.1.15.dev7/tests/test_v1_taskset_bindings.py +0 -188
- verifiers-0.1.15.dev7/verifiers/cli/tui.py +0 -9
- verifiers-0.1.15.dev7/verifiers/scripts/tui.py +0 -5928
- verifiers-0.1.15.dev7/verifiers/utils/env_utils.py +0 -145
- verifiers-0.1.15.dev7/verifiers/utils/tunnel_utils.py +0 -266
- verifiers-0.1.15.dev7/verifiers/v1/RE_MIGRATION.md +0 -825
- verifiers-0.1.15.dev7/verifiers/v1/config.py +0 -425
- verifiers-0.1.15.dev7/verifiers/v1/packages/harnesses/command.py +0 -116
- verifiers-0.1.15.dev7/verifiers/v1/packages/harnesses/configs.py +0 -102
- verifiers-0.1.15.dev7/verifiers/v1/packages/tasksets/__init__.py +0 -3
- verifiers-0.1.15.dev7/verifiers/v1/utils/config_utils.py +0 -200
- verifiers-0.1.15.dev7/verifiers/v1/utils/object_utils.py +0 -32
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/.gitignore +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/LICENSE +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/AGENTS.md +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/README.md +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/conftest.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_build_script.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_client_config.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_env_group.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_env_server.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_environment.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_gepa_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_lean_task.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_logging.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_openenv_client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_parser.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_per_turn_timing.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_pricing_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_renderer_e2e.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_rubric.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_types.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_empty_completions.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_endpoint_protocols.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_example_counts.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_v1_scoring_functions.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_wordle_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/clients/openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/decorators.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/composable_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/task.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/rlm_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/errors.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/env_config_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/interception_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/pricing_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/utils/version_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/packages/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/state.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/task.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/artifact_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/endpoint_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/json_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/judge_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/lifecycle_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/mcp_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/program_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/prompt_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/runtime_registry.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/sandbox_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/serialization_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/task_freeze_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/timing_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/trajectory_utils.py +0 -0
- {verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/verifiers/v1/utils/usage_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.15.dev7
|
|
3
|
+
Version: 0.1.15.dev9
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -35,6 +35,7 @@ Requires-Dist: nest-asyncio>=1.6.0
|
|
|
35
35
|
Requires-Dist: numpy
|
|
36
36
|
Requires-Dist: openai-agents>=0.0.7
|
|
37
37
|
Requires-Dist: openai>=1.108.1
|
|
38
|
+
Requires-Dist: prime-pydantic-config[toml]
|
|
38
39
|
Requires-Dist: prime-sandboxes>=0.2.25
|
|
39
40
|
Requires-Dist: prime-tunnel>=0.1.6
|
|
40
41
|
Requires-Dist: pydantic>=2.11.9
|
|
@@ -54,7 +55,7 @@ Requires-Dist: stagehand>=3.0.0; extra == 'browser'
|
|
|
54
55
|
Provides-Extra: openenv
|
|
55
56
|
Requires-Dist: openenv-core>=0.3.0; extra == 'openenv'
|
|
56
57
|
Provides-Extra: renderers
|
|
57
|
-
Requires-Dist: renderers>=0.1.8.
|
|
58
|
+
Requires-Dist: renderers>=0.1.8.dev4; extra == 'renderers'
|
|
58
59
|
Provides-Extra: rg
|
|
59
60
|
Requires-Dist: reasoning-gym; extra == 'rg'
|
|
60
61
|
Provides-Extra: rl
|
|
@@ -218,22 +219,39 @@ custom harnesses, use the v1 Taskset/Harness path:
|
|
|
218
219
|
# my_env.py
|
|
219
220
|
import verifiers as vf
|
|
220
221
|
|
|
221
|
-
def source():
|
|
222
|
-
yield {
|
|
223
|
-
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
224
|
-
"answer": "cba",
|
|
225
|
-
"max_turns": 1,
|
|
226
|
-
}
|
|
227
|
-
|
|
228
222
|
@vf.reward(weight=1.0)
|
|
229
223
|
async def contains_answer(task, state) -> float:
|
|
230
224
|
return float(task["answer"] in str(state.get("completion") or ""))
|
|
231
225
|
|
|
232
|
-
|
|
233
|
-
|
|
226
|
+
class MyTasksetConfig(vf.TasksetConfig):
|
|
227
|
+
split: str = "train"
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class MyTaskset(vf.Taskset):
|
|
231
|
+
config: MyTasksetConfig
|
|
232
|
+
_default_rewards = (contains_answer,)
|
|
233
|
+
|
|
234
|
+
def rows(self) -> list[dict[str, object]]:
|
|
235
|
+
rows = [
|
|
236
|
+
{
|
|
237
|
+
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
238
|
+
"answer": "cba",
|
|
239
|
+
"split": "train",
|
|
240
|
+
"max_turns": 1,
|
|
241
|
+
}
|
|
242
|
+
]
|
|
243
|
+
return [row for row in rows if row["split"] == self.config.split]
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def load_taskset(config: MyTasksetConfig) -> MyTaskset:
|
|
247
|
+
assert isinstance(config, MyTasksetConfig)
|
|
248
|
+
return MyTaskset(config=config)
|
|
249
|
+
|
|
234
250
|
|
|
235
251
|
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
236
|
-
|
|
252
|
+
taskset_config = config.taskset
|
|
253
|
+
assert isinstance(taskset_config, MyTasksetConfig)
|
|
254
|
+
return vf.Env(taskset=load_taskset(taskset_config))
|
|
237
255
|
```
|
|
238
256
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
239
257
|
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
@@ -244,8 +262,8 @@ harness with:
|
|
|
244
262
|
|
|
245
263
|
```python
|
|
246
264
|
env = vf.Env(
|
|
247
|
-
taskset=vf.HarborTaskset(),
|
|
248
|
-
harness=vf.OpenCode(),
|
|
265
|
+
taskset=vf.HarborTaskset(config=vf.HarborTasksetConfig()),
|
|
266
|
+
harness=vf.OpenCode(config=vf.OpenCodeConfig()),
|
|
249
267
|
)
|
|
250
268
|
```
|
|
251
269
|
|
|
@@ -282,16 +300,6 @@ prime env install my-env
|
|
|
282
300
|
|
|
283
301
|
For self-managed training launch commands, use the `prime-rl` documentation.
|
|
284
302
|
|
|
285
|
-
To install the environment module into your project, do:
|
|
286
|
-
```bash
|
|
287
|
-
prime env install my-env # installs from ./environments/my_env
|
|
288
|
-
```
|
|
289
|
-
|
|
290
|
-
To install an environment from the Environments Hub into your project, do:
|
|
291
|
-
```bash
|
|
292
|
-
prime env install primeintellect/math-python
|
|
293
|
-
```
|
|
294
|
-
|
|
295
303
|
To run a local evaluation with any OpenAI-compatible model, do:
|
|
296
304
|
```bash
|
|
297
305
|
prime eval run my-env -m openai/gpt-5-nano # run and save eval results locally
|
|
@@ -300,7 +308,7 @@ Evaluations use [Prime Inference](https://docs.primeintellect.ai/inference/overv
|
|
|
300
308
|
|
|
301
309
|
View local evaluation results in the terminal UI:
|
|
302
310
|
```bash
|
|
303
|
-
prime eval
|
|
311
|
+
prime eval view
|
|
304
312
|
```
|
|
305
313
|
|
|
306
314
|
To publish the environment to the [Environments Hub](https://app.primeintellect.ai/dashboard/environments?ex_sort=most_stars), do:
|
|
@@ -143,22 +143,39 @@ custom harnesses, use the v1 Taskset/Harness path:
|
|
|
143
143
|
# my_env.py
|
|
144
144
|
import verifiers as vf
|
|
145
145
|
|
|
146
|
-
def source():
|
|
147
|
-
yield {
|
|
148
|
-
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
149
|
-
"answer": "cba",
|
|
150
|
-
"max_turns": 1,
|
|
151
|
-
}
|
|
152
|
-
|
|
153
146
|
@vf.reward(weight=1.0)
|
|
154
147
|
async def contains_answer(task, state) -> float:
|
|
155
148
|
return float(task["answer"] in str(state.get("completion") or ""))
|
|
156
149
|
|
|
157
|
-
|
|
158
|
-
|
|
150
|
+
class MyTasksetConfig(vf.TasksetConfig):
|
|
151
|
+
split: str = "train"
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class MyTaskset(vf.Taskset):
|
|
155
|
+
config: MyTasksetConfig
|
|
156
|
+
_default_rewards = (contains_answer,)
|
|
157
|
+
|
|
158
|
+
def rows(self) -> list[dict[str, object]]:
|
|
159
|
+
rows = [
|
|
160
|
+
{
|
|
161
|
+
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
162
|
+
"answer": "cba",
|
|
163
|
+
"split": "train",
|
|
164
|
+
"max_turns": 1,
|
|
165
|
+
}
|
|
166
|
+
]
|
|
167
|
+
return [row for row in rows if row["split"] == self.config.split]
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def load_taskset(config: MyTasksetConfig) -> MyTaskset:
|
|
171
|
+
assert isinstance(config, MyTasksetConfig)
|
|
172
|
+
return MyTaskset(config=config)
|
|
173
|
+
|
|
159
174
|
|
|
160
175
|
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
161
|
-
|
|
176
|
+
taskset_config = config.taskset
|
|
177
|
+
assert isinstance(taskset_config, MyTasksetConfig)
|
|
178
|
+
return vf.Env(taskset=load_taskset(taskset_config))
|
|
162
179
|
```
|
|
163
180
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
164
181
|
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
@@ -169,8 +186,8 @@ harness with:
|
|
|
169
186
|
|
|
170
187
|
```python
|
|
171
188
|
env = vf.Env(
|
|
172
|
-
taskset=vf.HarborTaskset(),
|
|
173
|
-
harness=vf.OpenCode(),
|
|
189
|
+
taskset=vf.HarborTaskset(config=vf.HarborTasksetConfig()),
|
|
190
|
+
harness=vf.OpenCode(config=vf.OpenCodeConfig()),
|
|
174
191
|
)
|
|
175
192
|
```
|
|
176
193
|
|
|
@@ -207,16 +224,6 @@ prime env install my-env
|
|
|
207
224
|
|
|
208
225
|
For self-managed training launch commands, use the `prime-rl` documentation.
|
|
209
226
|
|
|
210
|
-
To install the environment module into your project, do:
|
|
211
|
-
```bash
|
|
212
|
-
prime env install my-env # installs from ./environments/my_env
|
|
213
|
-
```
|
|
214
|
-
|
|
215
|
-
To install an environment from the Environments Hub into your project, do:
|
|
216
|
-
```bash
|
|
217
|
-
prime env install primeintellect/math-python
|
|
218
|
-
```
|
|
219
|
-
|
|
220
227
|
To run a local evaluation with any OpenAI-compatible model, do:
|
|
221
228
|
```bash
|
|
222
229
|
prime eval run my-env -m openai/gpt-5-nano # run and save eval results locally
|
|
@@ -225,7 +232,7 @@ Evaluations use [Prime Inference](https://docs.primeintellect.ai/inference/overv
|
|
|
225
232
|
|
|
226
233
|
View local evaluation results in the terminal UI:
|
|
227
234
|
```bash
|
|
228
|
-
prime eval
|
|
235
|
+
prime eval view
|
|
229
236
|
```
|
|
230
237
|
|
|
231
238
|
To publish the environment to the [Environments Hub](https://app.primeintellect.ai/dashboard/environments?ex_sort=most_stars), do:
|
|
@@ -53,6 +53,7 @@ dependencies = [
|
|
|
53
53
|
"setproctitle>=1.3.0",
|
|
54
54
|
"regex<2026.4.4",
|
|
55
55
|
"httpx>=0.27.0",
|
|
56
|
+
"prime-pydantic-config[toml]",
|
|
56
57
|
]
|
|
57
58
|
|
|
58
59
|
[dependency-groups]
|
|
@@ -72,7 +73,7 @@ dev = [
|
|
|
72
73
|
"aiohttp>=3.9.0",
|
|
73
74
|
"python-dotenv>=1.0.0",
|
|
74
75
|
"nltk",
|
|
75
|
-
"renderers>=0.1.8.
|
|
76
|
+
"renderers>=0.1.8.dev4",
|
|
76
77
|
]
|
|
77
78
|
policy = [
|
|
78
79
|
"semgrep>=1.150.0",
|
|
@@ -95,7 +96,7 @@ openenv = [
|
|
|
95
96
|
"openenv-core>=0.3.0",
|
|
96
97
|
]
|
|
97
98
|
renderers = [
|
|
98
|
-
"renderers>=0.1.8.
|
|
99
|
+
"renderers>=0.1.8.dev4",
|
|
99
100
|
]
|
|
100
101
|
rl = [
|
|
101
102
|
"torch>=2.8.0,<2.9.0",
|
|
@@ -113,6 +114,7 @@ rl = [
|
|
|
113
114
|
[tool.uv]
|
|
114
115
|
preview = true
|
|
115
116
|
required-version = ">=0.11.1"
|
|
117
|
+
exclude-newer = "7 days"
|
|
116
118
|
conflicts = [
|
|
117
119
|
[
|
|
118
120
|
{ extra = "openenv" },
|
|
@@ -123,12 +125,12 @@ conflicts = [
|
|
|
123
125
|
name = "pypi"
|
|
124
126
|
url = "https://pypi.org/simple"
|
|
125
127
|
default = true
|
|
126
|
-
exclude-newer = "7 days"
|
|
127
128
|
|
|
128
129
|
[tool.uv.exclude-newer-package]
|
|
129
130
|
# PrimeIntellect-published on PyPI (trusted publisher)
|
|
130
131
|
prime-tunnel = false
|
|
131
132
|
prime-sandboxes = false
|
|
133
|
+
prime-pydantic-config = false
|
|
132
134
|
renderers = false
|
|
133
135
|
openenv-core = false
|
|
134
136
|
|
|
@@ -130,6 +130,9 @@ class _OverlongOpenAIChatClient:
|
|
|
130
130
|
def __init__(self, message: str) -> None:
|
|
131
131
|
self.chat = self._Chat(message)
|
|
132
132
|
|
|
133
|
+
async def post(self, *args, **kwargs): # noqa: ANN002, ANN003
|
|
134
|
+
return await self.chat.completions.create(*args, **kwargs)
|
|
135
|
+
|
|
133
136
|
|
|
134
137
|
@pytest.mark.parametrize(
|
|
135
138
|
"error_message",
|
|
@@ -101,18 +101,44 @@ def test_alphabet_sort_v1_validates_parameters():
|
|
|
101
101
|
spec.loader.exec_module(module)
|
|
102
102
|
|
|
103
103
|
with pytest.raises(ValueError, match="min_turns must be at least 1"):
|
|
104
|
-
module.
|
|
104
|
+
module.AlphabetSortTaskset(config=module.AlphabetSortTasksetConfig(min_turns=0))
|
|
105
105
|
with pytest.raises(
|
|
106
106
|
ValueError, match="min_turns must be less than or equal to max_turns"
|
|
107
107
|
):
|
|
108
|
-
module.
|
|
108
|
+
module.AlphabetSortTaskset(
|
|
109
|
+
config=module.AlphabetSortTasksetConfig(min_turns=3, max_turns=2)
|
|
110
|
+
)
|
|
109
111
|
with pytest.raises(ValueError, match="min_names_per_turn must be at least 1"):
|
|
110
|
-
module.
|
|
112
|
+
module.AlphabetSortTaskset(
|
|
113
|
+
config=module.AlphabetSortTasksetConfig(min_names_per_turn=0)
|
|
114
|
+
)
|
|
111
115
|
with pytest.raises(
|
|
112
116
|
ValueError,
|
|
113
117
|
match="min_names_per_turn must be less than or equal to max_names_per_turn",
|
|
114
118
|
):
|
|
115
|
-
module.
|
|
119
|
+
module.AlphabetSortTaskset(
|
|
120
|
+
config=module.AlphabetSortTasksetConfig(
|
|
121
|
+
min_names_per_turn=3,
|
|
122
|
+
max_names_per_turn=2,
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@pytest.mark.parametrize("env_name", ["alphabet_sort", "math_python"])
|
|
128
|
+
def test_v1_wrapper_rejects_unknown_kwargs(env_name: str):
|
|
129
|
+
module_path = Path("environments") / env_name / f"{env_name}.py"
|
|
130
|
+
spec = importlib.util.spec_from_file_location(
|
|
131
|
+
f"{env_name}_wrapper_test", module_path
|
|
132
|
+
)
|
|
133
|
+
assert spec is not None and spec.loader is not None
|
|
134
|
+
module = importlib.util.module_from_spec(spec)
|
|
135
|
+
sys.modules[spec.name] = module
|
|
136
|
+
spec.loader.exec_module(module)
|
|
137
|
+
|
|
138
|
+
with pytest.raises(
|
|
139
|
+
TypeError, match="Unsupported v1 load_environment kwargs: extra"
|
|
140
|
+
):
|
|
141
|
+
module.load_environment(v1=True, extra=True)
|
|
116
142
|
|
|
117
143
|
|
|
118
144
|
@pytest.mark.slow
|
|
@@ -127,8 +153,12 @@ def test_env(env_dir: Path, tmp_path_factory: pytest.TempPathFactory):
|
|
|
127
153
|
repo_root = Path(__file__).parent.parent
|
|
128
154
|
cmd = (
|
|
129
155
|
f"cd {tmp_venv_dir} && uv venv --clear && source .venv/bin/activate && "
|
|
130
|
-
|
|
131
|
-
|
|
156
|
+
"uv pip install "
|
|
157
|
+
"--exclude-newer-package prime-pydantic-config=2026-05-20T00:00:00Z "
|
|
158
|
+
f"{repo_root.as_posix()} && "
|
|
159
|
+
"uv pip install "
|
|
160
|
+
"--exclude-newer-package prime-pydantic-config=2026-05-20T00:00:00Z "
|
|
161
|
+
f"{env_dir.absolute().as_posix()}"
|
|
132
162
|
)
|
|
133
163
|
try:
|
|
134
164
|
process = subprocess.run(
|
|
@@ -186,10 +216,14 @@ def help_test_can_load_env(tmp_venv_dir: Path, env_dir: Path):
|
|
|
186
216
|
|
|
187
217
|
def help_test_can_eval_env(tmp_venv_dir: Path, env_dir: Path):
|
|
188
218
|
"""Test that the environment can be run via vf-eval."""
|
|
189
|
-
if os.getenv("
|
|
190
|
-
|
|
191
|
-
|
|
219
|
+
if env_dir.name == "tau2_bench_v1" and not os.getenv("PRIME_API_KEY"):
|
|
220
|
+
pytest.skip(
|
|
221
|
+
"Skipping tau2 default eval because PRIME_API_KEY is not configured"
|
|
222
|
+
)
|
|
223
|
+
if os.getenv("PRIME_API_KEY"):
|
|
192
224
|
model_flags = "-m openai/gpt-4.1-mini -b https://api.pinference.ai/api/v1 -k PRIME_API_KEY"
|
|
225
|
+
elif os.getenv("OPENAI_API_KEY"):
|
|
226
|
+
model_flags = "-m gpt-4.1-mini -b https://api.openai.com/v1 -k OPENAI_API_KEY"
|
|
193
227
|
else:
|
|
194
228
|
pytest.skip("Skipping vf-eval smoke test because no API key is configured")
|
|
195
229
|
|
|
@@ -749,6 +749,92 @@ def test_load_toml_config_with_env_args():
|
|
|
749
749
|
assert result[0]["env_args"]["max_examples"] == 100
|
|
750
750
|
|
|
751
751
|
|
|
752
|
+
def test_load_toml_config_sampling_section_mirrors_chat_template_kwargs():
|
|
753
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
754
|
+
f.write(
|
|
755
|
+
"[sampling]\n"
|
|
756
|
+
"max_tokens = 1024\n"
|
|
757
|
+
'reasoning_effort = "medium"\n'
|
|
758
|
+
"enable_thinking = false\n\n"
|
|
759
|
+
"[sampling.extra_body]\n"
|
|
760
|
+
'custom = "value"\n\n'
|
|
761
|
+
"[sampling.extra_body.chat_template_kwargs]\n"
|
|
762
|
+
"clear_thinking = true\n\n"
|
|
763
|
+
"[[eval]]\n"
|
|
764
|
+
'env_id = "env1"\n'
|
|
765
|
+
)
|
|
766
|
+
f.flush()
|
|
767
|
+
result = load_toml_config(Path(f.name))
|
|
768
|
+
|
|
769
|
+
assert result[0]["sampling_args"] == {
|
|
770
|
+
"max_tokens": 1024,
|
|
771
|
+
"reasoning_effort": "medium",
|
|
772
|
+
"enable_thinking": False,
|
|
773
|
+
"extra_body": {
|
|
774
|
+
"custom": "value",
|
|
775
|
+
"chat_template_kwargs": {
|
|
776
|
+
"clear_thinking": True,
|
|
777
|
+
"reasoning_effort": "medium",
|
|
778
|
+
"enable_thinking": False,
|
|
779
|
+
},
|
|
780
|
+
},
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
def test_load_toml_config_sampling_args_mirrors_chat_template_kwargs():
|
|
785
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
786
|
+
f.write(
|
|
787
|
+
"[[eval]]\n"
|
|
788
|
+
'env_id = "env1"\n'
|
|
789
|
+
'sampling_args = { max_tokens = 256, reasoning_effort = "high", enable_thinking = true }\n'
|
|
790
|
+
)
|
|
791
|
+
f.flush()
|
|
792
|
+
result = load_toml_config(Path(f.name))
|
|
793
|
+
|
|
794
|
+
assert result[0]["sampling_args"] == {
|
|
795
|
+
"max_tokens": 256,
|
|
796
|
+
"reasoning_effort": "high",
|
|
797
|
+
"enable_thinking": True,
|
|
798
|
+
"extra_body": {
|
|
799
|
+
"chat_template_kwargs": {
|
|
800
|
+
"reasoning_effort": "high",
|
|
801
|
+
"enable_thinking": True,
|
|
802
|
+
}
|
|
803
|
+
},
|
|
804
|
+
}
|
|
805
|
+
|
|
806
|
+
|
|
807
|
+
def test_cli_toml_eval_sampling_section_pipes_thinking_args(monkeypatch, run_cli):
|
|
808
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
809
|
+
f.write(
|
|
810
|
+
"[[eval]]\n"
|
|
811
|
+
'env_id = "env1"\n\n'
|
|
812
|
+
"[eval.sampling]\n"
|
|
813
|
+
"max_tokens = 512\n"
|
|
814
|
+
'reasoning_effort = "low"\n'
|
|
815
|
+
"enable_thinking = true\n"
|
|
816
|
+
)
|
|
817
|
+
f.flush()
|
|
818
|
+
captured = run_cli(
|
|
819
|
+
monkeypatch,
|
|
820
|
+
{
|
|
821
|
+
"env_id_or_config": f.name,
|
|
822
|
+
},
|
|
823
|
+
)
|
|
824
|
+
|
|
825
|
+
assert captured["sampling_args"] == {
|
|
826
|
+
"max_tokens": 512,
|
|
827
|
+
"reasoning_effort": "low",
|
|
828
|
+
"enable_thinking": True,
|
|
829
|
+
"extra_body": {
|
|
830
|
+
"chat_template_kwargs": {
|
|
831
|
+
"reasoning_effort": "low",
|
|
832
|
+
"enable_thinking": True,
|
|
833
|
+
}
|
|
834
|
+
},
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
|
|
752
838
|
def test_load_toml_config_with_args_taskset_harness():
|
|
753
839
|
"""args/taskset/harness sections normalize into load_environment kwargs."""
|
|
754
840
|
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
@@ -757,8 +843,10 @@ def test_load_toml_config_with_args_taskset_harness():
|
|
|
757
843
|
"[eval.args]\n"
|
|
758
844
|
'split = "train"\n\n'
|
|
759
845
|
"[eval.taskset]\n"
|
|
846
|
+
'id = "user/taskset-package"\n'
|
|
760
847
|
"num_examples = 10\n\n"
|
|
761
848
|
"[eval.harness]\n"
|
|
849
|
+
'id = "user/harness-package"\n'
|
|
762
850
|
"max_turns = 5\n"
|
|
763
851
|
)
|
|
764
852
|
f.flush()
|
|
@@ -769,8 +857,8 @@ def test_load_toml_config_with_args_taskset_harness():
|
|
|
769
857
|
assert result[0]["env_args"] == {
|
|
770
858
|
"split": "train",
|
|
771
859
|
"config": {
|
|
772
|
-
"taskset": {"num_examples": 10},
|
|
773
|
-
"harness": {"max_turns": 5},
|
|
860
|
+
"taskset": {"id": "user/taskset-package", "num_examples": 10},
|
|
861
|
+
"harness": {"id": "user/harness-package", "max_turns": 5},
|
|
774
862
|
},
|
|
775
863
|
}
|
|
776
864
|
assert "args" not in result[0]
|
|
@@ -889,6 +977,16 @@ def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
|
|
|
889
977
|
assert config.rollouts_per_example == 3 # DEFAULT_ROLLOUTS_PER_EXAMPLE
|
|
890
978
|
assert config.max_concurrent == 32 # default
|
|
891
979
|
assert config.sampling_args["max_tokens"] is None # default
|
|
980
|
+
assert config.save_results is True
|
|
981
|
+
|
|
982
|
+
|
|
983
|
+
def test_cli_toml_respects_save_results_false(monkeypatch, run_cli):
|
|
984
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
985
|
+
f.write('[[eval]]\nenv_id = "env1"\nsave_results = false\n')
|
|
986
|
+
f.flush()
|
|
987
|
+
captured = run_cli(monkeypatch, {"env_id_or_config": f.name})
|
|
988
|
+
|
|
989
|
+
assert captured["configs"][0].save_results is False
|
|
892
990
|
|
|
893
991
|
|
|
894
992
|
def test_cli_toml_per_env_num_examples(monkeypatch, run_cli):
|
|
@@ -1263,6 +1361,44 @@ def test_ablation_global_defaults_apply():
|
|
|
1263
1361
|
assert all(c["num_examples"] == 100 for c in configs)
|
|
1264
1362
|
|
|
1265
1363
|
|
|
1364
|
+
def test_ablation_sampling_sweep_merges_with_global_sampling_defaults():
|
|
1365
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
1366
|
+
f.write(
|
|
1367
|
+
"[sampling]\n"
|
|
1368
|
+
"max_tokens = 1024\n"
|
|
1369
|
+
'reasoning_effort = "medium"\n\n'
|
|
1370
|
+
'[[ablation]]\nenv_id = "my-env"\n\n'
|
|
1371
|
+
"[ablation.sweep]\n"
|
|
1372
|
+
"sampling = [{ temperature = 0.0 }, { temperature = 1.0, enable_thinking = false }]\n"
|
|
1373
|
+
)
|
|
1374
|
+
f.flush()
|
|
1375
|
+
configs = load_toml_config(Path(f.name))
|
|
1376
|
+
|
|
1377
|
+
assert len(configs) == 2
|
|
1378
|
+
assert configs[0]["sampling_args"] == {
|
|
1379
|
+
"max_tokens": 1024,
|
|
1380
|
+
"reasoning_effort": "medium",
|
|
1381
|
+
"temperature": 0.0,
|
|
1382
|
+
"extra_body": {
|
|
1383
|
+
"chat_template_kwargs": {
|
|
1384
|
+
"reasoning_effort": "medium",
|
|
1385
|
+
}
|
|
1386
|
+
},
|
|
1387
|
+
}
|
|
1388
|
+
assert configs[1]["sampling_args"] == {
|
|
1389
|
+
"max_tokens": 1024,
|
|
1390
|
+
"reasoning_effort": "medium",
|
|
1391
|
+
"temperature": 1.0,
|
|
1392
|
+
"enable_thinking": False,
|
|
1393
|
+
"extra_body": {
|
|
1394
|
+
"chat_template_kwargs": {
|
|
1395
|
+
"reasoning_effort": "medium",
|
|
1396
|
+
"enable_thinking": False,
|
|
1397
|
+
}
|
|
1398
|
+
},
|
|
1399
|
+
}
|
|
1400
|
+
|
|
1401
|
+
|
|
1266
1402
|
def test_ablation_endpoint_id_override_removes_global_model():
|
|
1267
1403
|
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
1268
1404
|
f.write(
|
|
@@ -1,6 +1,26 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import sys
|
|
3
|
+
|
|
1
4
|
import verifiers
|
|
2
5
|
|
|
3
6
|
|
|
7
|
+
def test_v1_taskset_imports_do_not_import_textarena():
|
|
8
|
+
textarena_module = "verifiers.v1.packages.tasksets.textarena"
|
|
9
|
+
sys.modules.pop(textarena_module, None)
|
|
10
|
+
|
|
11
|
+
tasksets = importlib.import_module("verifiers.v1.packages.tasksets")
|
|
12
|
+
tasksets.__dict__.pop("TextArenaTaskset", None)
|
|
13
|
+
tasksets.__dict__.pop("TextArenaTasksetConfig", None)
|
|
14
|
+
importlib.reload(tasksets)
|
|
15
|
+
assert textarena_module not in sys.modules
|
|
16
|
+
|
|
17
|
+
v1 = importlib.import_module("verifiers.v1")
|
|
18
|
+
v1.__dict__.pop("TextArenaTaskset", None)
|
|
19
|
+
v1.__dict__.pop("TextArenaTasksetConfig", None)
|
|
20
|
+
importlib.reload(v1)
|
|
21
|
+
assert textarena_module not in sys.modules
|
|
22
|
+
|
|
23
|
+
|
|
4
24
|
class TestImports:
|
|
5
25
|
"""Test that all public API imports work correctly.
|
|
6
26
|
This was inspired by issue #349.
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import importlib.util
|
|
2
2
|
import inspect
|
|
3
|
+
import sys
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import Any
|
|
5
6
|
|
|
@@ -19,6 +20,7 @@ def _load_mcp_search_module() -> Any:
|
|
|
19
20
|
assert spec.loader is not None
|
|
20
21
|
|
|
21
22
|
module = importlib.util.module_from_spec(spec)
|
|
23
|
+
sys.modules[spec.name] = module
|
|
22
24
|
spec.loader.exec_module(module)
|
|
23
25
|
return module
|
|
24
26
|
|
|
@@ -39,10 +41,20 @@ def test_mcp_search_env_is_v1_only() -> None:
|
|
|
39
41
|
assert env.taskset.config.max_turns == 4
|
|
40
42
|
|
|
41
43
|
|
|
44
|
+
def test_mcp_search_env_preserves_harness_config() -> None:
|
|
45
|
+
module = _load_mcp_search_module()
|
|
46
|
+
|
|
47
|
+
env = module.load_environment(
|
|
48
|
+
config=module.MCPSearchEnvConfig(harness={"max_turns": 7})
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
assert env.harness.config.max_turns == 7
|
|
52
|
+
|
|
53
|
+
|
|
42
54
|
def test_mcp_search_default_taskset_has_stable_non_doc_fixture() -> None:
|
|
43
55
|
module = _load_mcp_search_module()
|
|
44
56
|
|
|
45
|
-
rows = module.
|
|
57
|
+
rows = module.MCPSearchTaskset(config=module.MCPSearchTasksetConfig()).rows()
|
|
46
58
|
|
|
47
59
|
assert len(rows) >= 10
|
|
48
60
|
assert len({row["answer"] for row in rows}) == len(rows)
|
{verifiers-0.1.15.dev7 → verifiers-0.1.15.dev9}/tests/test_openai_chat_completions_token_client.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from typing import Any, cast
|
|
2
2
|
|
|
3
|
+
import httpx
|
|
3
4
|
import pytest
|
|
4
5
|
|
|
5
6
|
from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
|
|
@@ -24,7 +25,25 @@ class _RecordingClient(_NoopClient):
|
|
|
24
25
|
self, path: str, body: dict[str, Any], cast_to: type, **kwargs: Any
|
|
25
26
|
) -> Any:
|
|
26
27
|
self.calls.append({"path": path, "body": body, "cast_to": cast_to})
|
|
27
|
-
return
|
|
28
|
+
return httpx.Response(
|
|
29
|
+
200,
|
|
30
|
+
json={
|
|
31
|
+
"id": path,
|
|
32
|
+
"object": "chat.completion",
|
|
33
|
+
"created": 1,
|
|
34
|
+
"model": body["model"],
|
|
35
|
+
"choices": [
|
|
36
|
+
{
|
|
37
|
+
"index": 0,
|
|
38
|
+
"message": {"role": "assistant", "content": "ok"},
|
|
39
|
+
"finish_reason": "stop",
|
|
40
|
+
}
|
|
41
|
+
],
|
|
42
|
+
"ok": True,
|
|
43
|
+
"path": path,
|
|
44
|
+
"body": body,
|
|
45
|
+
},
|
|
46
|
+
)
|
|
28
47
|
|
|
29
48
|
|
|
30
49
|
class _PromptIdTestClient(OpenAIChatCompletionsTokenClient):
|
|
@@ -270,7 +289,7 @@ async def test_get_native_response_uses_token_route_when_prompt_ids_available(
|
|
|
270
289
|
state=state,
|
|
271
290
|
)
|
|
272
291
|
|
|
273
|
-
assert response["ok"] is True
|
|
292
|
+
assert response.model_extra["ok"] is True
|
|
274
293
|
assert len(recording_client.calls) == 1
|
|
275
294
|
assert recording_client.calls[0]["path"] == "/chat/completions/tokens"
|
|
276
295
|
assert recording_client.calls[0]["body"]["tokens"] == [10, 20]
|
|
@@ -53,15 +53,15 @@ def test_load_environment_accepts_v1_taskset_and_harness_config() -> None:
|
|
|
53
53
|
|
|
54
54
|
env = module.load_environment(
|
|
55
55
|
config=module.OpenCodeHarborEnvConfig(
|
|
56
|
-
taskset=
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
harness=
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
56
|
+
taskset=module.vf.HarborTasksetConfig(
|
|
57
|
+
task_names=["task-a"],
|
|
58
|
+
cpu_cores=1.5,
|
|
59
|
+
),
|
|
60
|
+
harness=module.vf.OpenCodeConfig(
|
|
61
|
+
agent_workdir="/workspace",
|
|
62
|
+
disabled_tools=["webfetch"],
|
|
63
|
+
max_turns=2,
|
|
64
|
+
),
|
|
65
65
|
)
|
|
66
66
|
)
|
|
67
67
|
|