verifiers 0.1.15.dev6__tar.gz → 0.1.15.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/PKG-INFO +30 -26
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/README.md +27 -24
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/pyproject.toml +5 -3
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_client_auth_errors.py +3 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_envs.py +36 -6
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_eval_cli.py +189 -2
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_eval_display.py +16 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_eval_utils.py +16 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_mcp_search_env.py +13 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_openai_chat_completions_token_client.py +21 -2
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_opencode_harbor.py +9 -9
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_path_utils.py +14 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_renderer_client.py +45 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_save_utils.py +31 -0
- verifiers-0.1.15.dev8/tests/test_tui_info_formatting.py +9 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_bfcl.py +8 -13
- verifiers-0.1.15.dev8/tests/test_v1_config_extension.py +3090 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_group_reward_env.py +1 -4
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_harbor_cli.py +78 -34
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_mini_swe_agent.py +7 -2
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_rlm_swe.py +36 -14
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_runtime_lifecycle.py +131 -53
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_taskset_bindings.py +104 -27
- verifiers-0.1.15.dev8/tests/test_wiki_search_v1.py +136 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/AGENTS.md +2 -3
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/__init__.py +22 -1
- verifiers-0.1.15.dev8/verifiers/cli/tui.py +9 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/clients/openai_chat_completions_client.py +22 -20
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/clients/openai_chat_completions_token_client.py +12 -10
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/clients/openai_completions_client.py +0 -4
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/clients/renderer_client.py +52 -23
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +1 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +1 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +1 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +1 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +1 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +1 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +1 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/eval.py +16 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/init.py +83 -36
- verifiers-0.1.15.dev8/verifiers/scripts/tui.py +11 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/types.py +10 -2
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/client_utils.py +30 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/env_utils.py +63 -13
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/eval_display.py +25 -9
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/eval_utils.py +137 -17
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/path_utils.py +9 -3
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/response_utils.py +12 -25
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/save_utils.py +39 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +75 -37
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/README.md +290 -222
- verifiers-0.1.15.dev8/verifiers/v1/RE_MIGRATION.md +465 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/__init__.py +13 -1
- verifiers-0.1.15.dev8/verifiers/v1/config.py +484 -0
- verifiers-0.1.15.dev8/verifiers/v1/env.py +351 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/harness.py +66 -115
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/packages/harnesses/__init__.py +10 -1
- verifiers-0.1.15.dev8/verifiers/v1/packages/harnesses/command.py +164 -0
- verifiers-0.1.15.dev8/verifiers/v1/packages/harnesses/configs.py +168 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/packages/harnesses/mini_swe_agent.py +75 -107
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/packages/harnesses/opencode.py +58 -145
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/packages/harnesses/pi.py +59 -76
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/packages/harnesses/rlm.py +23 -80
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/packages/harnesses/terminus_2.py +67 -96
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/packages/tasksets/harbor.py +51 -105
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/runtime.py +2 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/taskset.py +31 -137
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/toolset.py +74 -30
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/types.py +2 -7
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/user.py +12 -8
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/binding_utils.py +4 -2
- verifiers-0.1.15.dev8/verifiers/v1/utils/component_utils.py +136 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/config_callable_utils.py +4 -0
- verifiers-0.1.15.dev8/verifiers/v1/utils/config_utils.py +185 -0
- verifiers-0.1.15.dev8/verifiers/v1/utils/object_utils.py +52 -0
- verifiers-0.1.15.dev8/verifiers/v1/utils/runtime_owner_utils.py +129 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/taskset_utils.py +36 -2
- verifiers-0.1.15.dev6/tests/test_tui_info_formatting.py +0 -1524
- verifiers-0.1.15.dev6/tests/test_v1_config_extension.py +0 -1989
- verifiers-0.1.15.dev6/verifiers/cli/tui.py +0 -9
- verifiers-0.1.15.dev6/verifiers/scripts/tui.py +0 -5928
- verifiers-0.1.15.dev6/verifiers/v1/RE_MIGRATION.md +0 -825
- verifiers-0.1.15.dev6/verifiers/v1/config.py +0 -425
- verifiers-0.1.15.dev6/verifiers/v1/env.py +0 -134
- verifiers-0.1.15.dev6/verifiers/v1/packages/harnesses/command.py +0 -116
- verifiers-0.1.15.dev6/verifiers/v1/packages/harnesses/configs.py +0 -102
- verifiers-0.1.15.dev6/verifiers/v1/utils/config_utils.py +0 -200
- verifiers-0.1.15.dev6/verifiers/v1/utils/object_utils.py +0 -32
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/.gitignore +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/LICENSE +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/AGENTS.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/conftest.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_build_script.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_client_config.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_env_group.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_env_server.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_environment.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_gepa_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_imports.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_lean_task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_logging.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_openenv_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_per_turn_timing.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_pricing_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_renderer_e2e.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_rlm_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_types.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_empty_completions.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_endpoint_protocols.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_example_counts.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_v1_scoring_functions.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_wordle_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/clients/openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/decorators.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/environment.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/composable_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/rlm_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/errors.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/env_config_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/interception_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/pricing_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/utils/version_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/packages/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/packages/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/state.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/artifact_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/endpoint_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/json_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/judge_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/lifecycle_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/mcp_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/program_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/prompt_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/runtime_registry.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/sandbox_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/scoring_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/serialization_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/task_freeze_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/timing_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/trajectory_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev8}/verifiers/v1/utils/usage_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.15.dev6
|
|
3
|
+
Version: 0.1.15.dev8
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -35,6 +35,7 @@ Requires-Dist: nest-asyncio>=1.6.0
|
|
|
35
35
|
Requires-Dist: numpy
|
|
36
36
|
Requires-Dist: openai-agents>=0.0.7
|
|
37
37
|
Requires-Dist: openai>=1.108.1
|
|
38
|
+
Requires-Dist: prime-pydantic-config[toml]
|
|
38
39
|
Requires-Dist: prime-sandboxes>=0.2.25
|
|
39
40
|
Requires-Dist: prime-tunnel>=0.1.6
|
|
40
41
|
Requires-Dist: pydantic>=2.11.9
|
|
@@ -54,7 +55,7 @@ Requires-Dist: stagehand>=3.0.0; extra == 'browser'
|
|
|
54
55
|
Provides-Extra: openenv
|
|
55
56
|
Requires-Dist: openenv-core>=0.3.0; extra == 'openenv'
|
|
56
57
|
Provides-Extra: renderers
|
|
57
|
-
Requires-Dist: renderers>=0.1.8.
|
|
58
|
+
Requires-Dist: renderers>=0.1.8.dev4; extra == 'renderers'
|
|
58
59
|
Provides-Extra: rg
|
|
59
60
|
Requires-Dist: reasoning-gym; extra == 'rg'
|
|
60
61
|
Provides-Extra: rl
|
|
@@ -218,22 +219,35 @@ custom harnesses, use the v1 Taskset/Harness path:
|
|
|
218
219
|
# my_env.py
|
|
219
220
|
import verifiers as vf
|
|
220
221
|
|
|
221
|
-
def source():
|
|
222
|
-
yield {
|
|
223
|
-
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
224
|
-
"answer": "cba",
|
|
225
|
-
"max_turns": 1,
|
|
226
|
-
}
|
|
227
|
-
|
|
228
222
|
@vf.reward(weight=1.0)
|
|
229
223
|
async def contains_answer(task, state) -> float:
|
|
230
224
|
return float(task["answer"] in str(state.get("completion") or ""))
|
|
231
225
|
|
|
232
|
-
|
|
233
|
-
|
|
226
|
+
class MyTasksetConfig(vf.TasksetConfig):
|
|
227
|
+
split: str = "train"
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class MyTaskset(vf.Taskset[MyTasksetConfig]):
|
|
231
|
+
_default_rewards = (contains_answer,)
|
|
232
|
+
|
|
233
|
+
def rows(self) -> list[dict[str, object]]:
|
|
234
|
+
rows = [
|
|
235
|
+
{
|
|
236
|
+
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
237
|
+
"answer": "cba",
|
|
238
|
+
"split": "train",
|
|
239
|
+
"max_turns": 1,
|
|
240
|
+
}
|
|
241
|
+
]
|
|
242
|
+
return [row for row in rows if row["split"] == self.config.split]
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class MyEnvConfig(vf.EnvConfig):
|
|
246
|
+
taskset: MyTasksetConfig = MyTasksetConfig()
|
|
247
|
+
|
|
234
248
|
|
|
235
|
-
def load_environment(config:
|
|
236
|
-
return vf.Env(taskset=
|
|
249
|
+
def load_environment(config: MyEnvConfig) -> vf.Env:
|
|
250
|
+
return vf.Env(taskset=MyTaskset(config=config.taskset))
|
|
237
251
|
```
|
|
238
252
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
239
253
|
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
@@ -244,8 +258,8 @@ harness with:
|
|
|
244
258
|
|
|
245
259
|
```python
|
|
246
260
|
env = vf.Env(
|
|
247
|
-
taskset=vf.HarborTaskset(),
|
|
248
|
-
harness=vf.OpenCode(),
|
|
261
|
+
taskset=vf.HarborTaskset(config=vf.HarborTasksetConfig()),
|
|
262
|
+
harness=vf.OpenCode(config=vf.OpenCodeConfig()),
|
|
249
263
|
)
|
|
250
264
|
```
|
|
251
265
|
|
|
@@ -282,16 +296,6 @@ prime env install my-env
|
|
|
282
296
|
|
|
283
297
|
For self-managed training launch commands, use the `prime-rl` documentation.
|
|
284
298
|
|
|
285
|
-
To install the environment module into your project, do:
|
|
286
|
-
```bash
|
|
287
|
-
prime env install my-env # installs from ./environments/my_env
|
|
288
|
-
```
|
|
289
|
-
|
|
290
|
-
To install an environment from the Environments Hub into your project, do:
|
|
291
|
-
```bash
|
|
292
|
-
prime env install primeintellect/math-python
|
|
293
|
-
```
|
|
294
|
-
|
|
295
299
|
To run a local evaluation with any OpenAI-compatible model, do:
|
|
296
300
|
```bash
|
|
297
301
|
prime eval run my-env -m openai/gpt-5-nano # run and save eval results locally
|
|
@@ -300,7 +304,7 @@ Evaluations use [Prime Inference](https://docs.primeintellect.ai/inference/overv
|
|
|
300
304
|
|
|
301
305
|
View local evaluation results in the terminal UI:
|
|
302
306
|
```bash
|
|
303
|
-
prime eval
|
|
307
|
+
prime eval view
|
|
304
308
|
```
|
|
305
309
|
|
|
306
310
|
To publish the environment to the [Environments Hub](https://app.primeintellect.ai/dashboard/environments?ex_sort=most_stars), do:
|
|
@@ -143,22 +143,35 @@ custom harnesses, use the v1 Taskset/Harness path:
|
|
|
143
143
|
# my_env.py
|
|
144
144
|
import verifiers as vf
|
|
145
145
|
|
|
146
|
-
def source():
|
|
147
|
-
yield {
|
|
148
|
-
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
149
|
-
"answer": "cba",
|
|
150
|
-
"max_turns": 1,
|
|
151
|
-
}
|
|
152
|
-
|
|
153
146
|
@vf.reward(weight=1.0)
|
|
154
147
|
async def contains_answer(task, state) -> float:
|
|
155
148
|
return float(task["answer"] in str(state.get("completion") or ""))
|
|
156
149
|
|
|
157
|
-
|
|
158
|
-
|
|
150
|
+
class MyTasksetConfig(vf.TasksetConfig):
|
|
151
|
+
split: str = "train"
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class MyTaskset(vf.Taskset[MyTasksetConfig]):
|
|
155
|
+
_default_rewards = (contains_answer,)
|
|
156
|
+
|
|
157
|
+
def rows(self) -> list[dict[str, object]]:
|
|
158
|
+
rows = [
|
|
159
|
+
{
|
|
160
|
+
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
161
|
+
"answer": "cba",
|
|
162
|
+
"split": "train",
|
|
163
|
+
"max_turns": 1,
|
|
164
|
+
}
|
|
165
|
+
]
|
|
166
|
+
return [row for row in rows if row["split"] == self.config.split]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class MyEnvConfig(vf.EnvConfig):
|
|
170
|
+
taskset: MyTasksetConfig = MyTasksetConfig()
|
|
171
|
+
|
|
159
172
|
|
|
160
|
-
def load_environment(config:
|
|
161
|
-
return vf.Env(taskset=
|
|
173
|
+
def load_environment(config: MyEnvConfig) -> vf.Env:
|
|
174
|
+
return vf.Env(taskset=MyTaskset(config=config.taskset))
|
|
162
175
|
```
|
|
163
176
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
164
177
|
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
@@ -169,8 +182,8 @@ harness with:
|
|
|
169
182
|
|
|
170
183
|
```python
|
|
171
184
|
env = vf.Env(
|
|
172
|
-
taskset=vf.HarborTaskset(),
|
|
173
|
-
harness=vf.OpenCode(),
|
|
185
|
+
taskset=vf.HarborTaskset(config=vf.HarborTasksetConfig()),
|
|
186
|
+
harness=vf.OpenCode(config=vf.OpenCodeConfig()),
|
|
174
187
|
)
|
|
175
188
|
```
|
|
176
189
|
|
|
@@ -207,16 +220,6 @@ prime env install my-env
|
|
|
207
220
|
|
|
208
221
|
For self-managed training launch commands, use the `prime-rl` documentation.
|
|
209
222
|
|
|
210
|
-
To install the environment module into your project, do:
|
|
211
|
-
```bash
|
|
212
|
-
prime env install my-env # installs from ./environments/my_env
|
|
213
|
-
```
|
|
214
|
-
|
|
215
|
-
To install an environment from the Environments Hub into your project, do:
|
|
216
|
-
```bash
|
|
217
|
-
prime env install primeintellect/math-python
|
|
218
|
-
```
|
|
219
|
-
|
|
220
223
|
To run a local evaluation with any OpenAI-compatible model, do:
|
|
221
224
|
```bash
|
|
222
225
|
prime eval run my-env -m openai/gpt-5-nano # run and save eval results locally
|
|
@@ -225,7 +228,7 @@ Evaluations use [Prime Inference](https://docs.primeintellect.ai/inference/overv
|
|
|
225
228
|
|
|
226
229
|
View local evaluation results in the terminal UI:
|
|
227
230
|
```bash
|
|
228
|
-
prime eval
|
|
231
|
+
prime eval view
|
|
229
232
|
```
|
|
230
233
|
|
|
231
234
|
To publish the environment to the [Environments Hub](https://app.primeintellect.ai/dashboard/environments?ex_sort=most_stars), do:
|
|
@@ -53,6 +53,7 @@ dependencies = [
|
|
|
53
53
|
"setproctitle>=1.3.0",
|
|
54
54
|
"regex<2026.4.4",
|
|
55
55
|
"httpx>=0.27.0",
|
|
56
|
+
"prime-pydantic-config[toml]",
|
|
56
57
|
]
|
|
57
58
|
|
|
58
59
|
[dependency-groups]
|
|
@@ -72,7 +73,7 @@ dev = [
|
|
|
72
73
|
"aiohttp>=3.9.0",
|
|
73
74
|
"python-dotenv>=1.0.0",
|
|
74
75
|
"nltk",
|
|
75
|
-
"renderers>=0.1.8.
|
|
76
|
+
"renderers>=0.1.8.dev4",
|
|
76
77
|
]
|
|
77
78
|
policy = [
|
|
78
79
|
"semgrep>=1.150.0",
|
|
@@ -95,7 +96,7 @@ openenv = [
|
|
|
95
96
|
"openenv-core>=0.3.0",
|
|
96
97
|
]
|
|
97
98
|
renderers = [
|
|
98
|
-
"renderers>=0.1.8.
|
|
99
|
+
"renderers>=0.1.8.dev4",
|
|
99
100
|
]
|
|
100
101
|
rl = [
|
|
101
102
|
"torch>=2.8.0,<2.9.0",
|
|
@@ -113,6 +114,7 @@ rl = [
|
|
|
113
114
|
[tool.uv]
|
|
114
115
|
preview = true
|
|
115
116
|
required-version = ">=0.11.1"
|
|
117
|
+
exclude-newer = "7 days"
|
|
116
118
|
conflicts = [
|
|
117
119
|
[
|
|
118
120
|
{ extra = "openenv" },
|
|
@@ -123,12 +125,12 @@ conflicts = [
|
|
|
123
125
|
name = "pypi"
|
|
124
126
|
url = "https://pypi.org/simple"
|
|
125
127
|
default = true
|
|
126
|
-
exclude-newer = "7 days"
|
|
127
128
|
|
|
128
129
|
[tool.uv.exclude-newer-package]
|
|
129
130
|
# PrimeIntellect-published on PyPI (trusted publisher)
|
|
130
131
|
prime-tunnel = false
|
|
131
132
|
prime-sandboxes = false
|
|
133
|
+
prime-pydantic-config = false
|
|
132
134
|
renderers = false
|
|
133
135
|
openenv-core = false
|
|
134
136
|
|
|
@@ -130,6 +130,9 @@ class _OverlongOpenAIChatClient:
|
|
|
130
130
|
def __init__(self, message: str) -> None:
|
|
131
131
|
self.chat = self._Chat(message)
|
|
132
132
|
|
|
133
|
+
async def post(self, *args, **kwargs): # noqa: ANN002, ANN003
|
|
134
|
+
return await self.chat.completions.create(*args, **kwargs)
|
|
135
|
+
|
|
133
136
|
|
|
134
137
|
@pytest.mark.parametrize(
|
|
135
138
|
"error_message",
|
|
@@ -101,18 +101,44 @@ def test_alphabet_sort_v1_validates_parameters():
|
|
|
101
101
|
spec.loader.exec_module(module)
|
|
102
102
|
|
|
103
103
|
with pytest.raises(ValueError, match="min_turns must be at least 1"):
|
|
104
|
-
module.
|
|
104
|
+
module.AlphabetSortTaskset(config=module.AlphabetSortTasksetConfig(min_turns=0))
|
|
105
105
|
with pytest.raises(
|
|
106
106
|
ValueError, match="min_turns must be less than or equal to max_turns"
|
|
107
107
|
):
|
|
108
|
-
module.
|
|
108
|
+
module.AlphabetSortTaskset(
|
|
109
|
+
config=module.AlphabetSortTasksetConfig(min_turns=3, max_turns=2)
|
|
110
|
+
)
|
|
109
111
|
with pytest.raises(ValueError, match="min_names_per_turn must be at least 1"):
|
|
110
|
-
module.
|
|
112
|
+
module.AlphabetSortTaskset(
|
|
113
|
+
config=module.AlphabetSortTasksetConfig(min_names_per_turn=0)
|
|
114
|
+
)
|
|
111
115
|
with pytest.raises(
|
|
112
116
|
ValueError,
|
|
113
117
|
match="min_names_per_turn must be less than or equal to max_names_per_turn",
|
|
114
118
|
):
|
|
115
|
-
module.
|
|
119
|
+
module.AlphabetSortTaskset(
|
|
120
|
+
config=module.AlphabetSortTasksetConfig(
|
|
121
|
+
min_names_per_turn=3,
|
|
122
|
+
max_names_per_turn=2,
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@pytest.mark.parametrize("env_name", ["alphabet_sort", "math_python"])
|
|
128
|
+
def test_v1_wrapper_rejects_unknown_kwargs(env_name: str):
|
|
129
|
+
module_path = Path("environments") / env_name / f"{env_name}.py"
|
|
130
|
+
spec = importlib.util.spec_from_file_location(
|
|
131
|
+
f"{env_name}_wrapper_test", module_path
|
|
132
|
+
)
|
|
133
|
+
assert spec is not None and spec.loader is not None
|
|
134
|
+
module = importlib.util.module_from_spec(spec)
|
|
135
|
+
sys.modules[spec.name] = module
|
|
136
|
+
spec.loader.exec_module(module)
|
|
137
|
+
|
|
138
|
+
with pytest.raises(
|
|
139
|
+
TypeError, match="Unsupported v1 load_environment kwargs: extra"
|
|
140
|
+
):
|
|
141
|
+
module.load_environment(v1=True, extra=True)
|
|
116
142
|
|
|
117
143
|
|
|
118
144
|
@pytest.mark.slow
|
|
@@ -127,8 +153,12 @@ def test_env(env_dir: Path, tmp_path_factory: pytest.TempPathFactory):
|
|
|
127
153
|
repo_root = Path(__file__).parent.parent
|
|
128
154
|
cmd = (
|
|
129
155
|
f"cd {tmp_venv_dir} && uv venv --clear && source .venv/bin/activate && "
|
|
130
|
-
|
|
131
|
-
|
|
156
|
+
"uv pip install "
|
|
157
|
+
"--exclude-newer-package prime-pydantic-config=2026-05-20T00:00:00Z "
|
|
158
|
+
f"{repo_root.as_posix()} && "
|
|
159
|
+
"uv pip install "
|
|
160
|
+
"--exclude-newer-package prime-pydantic-config=2026-05-20T00:00:00Z "
|
|
161
|
+
f"{env_dir.absolute().as_posix()}"
|
|
132
162
|
)
|
|
133
163
|
try:
|
|
134
164
|
process = subprocess.run(
|
|
@@ -13,6 +13,7 @@ import verifiers.scripts.eval as vf_eval
|
|
|
13
13
|
import verifiers.utils.eval_utils
|
|
14
14
|
from verifiers.types import GenerateOutputs
|
|
15
15
|
from verifiers.utils.eval_utils import load_toml_config
|
|
16
|
+
from verifiers.utils.path_utils import get_eval_results_path
|
|
16
17
|
from verifiers.utils.save_utils import states_to_outputs
|
|
17
18
|
|
|
18
19
|
|
|
@@ -706,6 +707,34 @@ def test_load_toml_config_multi_env():
|
|
|
706
707
|
assert result[1]["env_id"] == "env2"
|
|
707
708
|
|
|
708
709
|
|
|
710
|
+
def test_load_toml_config_duplicate_envs_accept_names():
|
|
711
|
+
"""Duplicate env ids can be labeled and configured independently."""
|
|
712
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
713
|
+
f.write(
|
|
714
|
+
'[[eval]]\nid = "env1"\nname = "env1-short"\n'
|
|
715
|
+
"[eval.args]\n"
|
|
716
|
+
'split = "short"\n\n'
|
|
717
|
+
'[[eval]]\nid = "env1"\nname = "env1-long"\n'
|
|
718
|
+
"[eval.args]\n"
|
|
719
|
+
'split = "long"\n'
|
|
720
|
+
)
|
|
721
|
+
f.flush()
|
|
722
|
+
result = load_toml_config(Path(f.name))
|
|
723
|
+
|
|
724
|
+
assert len(result) == 2
|
|
725
|
+
assert [config["env_id"] for config in result] == ["env1", "env1"]
|
|
726
|
+
assert [config["name"] for config in result] == ["env1-short", "env1-long"]
|
|
727
|
+
assert [config["env_args"]["split"] for config in result] == ["short", "long"]
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
def test_load_toml_config_rejects_global_name():
|
|
731
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
732
|
+
f.write('name = "shared-name"\n\n[[eval]]\nid = "env1"\n')
|
|
733
|
+
f.flush()
|
|
734
|
+
with pytest.raises(ValueError, match="Invalid global field"):
|
|
735
|
+
load_toml_config(Path(f.name))
|
|
736
|
+
|
|
737
|
+
|
|
709
738
|
def test_load_toml_config_with_env_args():
|
|
710
739
|
"""Multiple sections with env_args field loads correctly."""
|
|
711
740
|
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
@@ -720,6 +749,92 @@ def test_load_toml_config_with_env_args():
|
|
|
720
749
|
assert result[0]["env_args"]["max_examples"] == 100
|
|
721
750
|
|
|
722
751
|
|
|
752
|
+
def test_load_toml_config_sampling_section_mirrors_chat_template_kwargs():
|
|
753
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
754
|
+
f.write(
|
|
755
|
+
"[sampling]\n"
|
|
756
|
+
"max_tokens = 1024\n"
|
|
757
|
+
'reasoning_effort = "medium"\n'
|
|
758
|
+
"enable_thinking = false\n\n"
|
|
759
|
+
"[sampling.extra_body]\n"
|
|
760
|
+
'custom = "value"\n\n'
|
|
761
|
+
"[sampling.extra_body.chat_template_kwargs]\n"
|
|
762
|
+
"clear_thinking = true\n\n"
|
|
763
|
+
"[[eval]]\n"
|
|
764
|
+
'env_id = "env1"\n'
|
|
765
|
+
)
|
|
766
|
+
f.flush()
|
|
767
|
+
result = load_toml_config(Path(f.name))
|
|
768
|
+
|
|
769
|
+
assert result[0]["sampling_args"] == {
|
|
770
|
+
"max_tokens": 1024,
|
|
771
|
+
"reasoning_effort": "medium",
|
|
772
|
+
"enable_thinking": False,
|
|
773
|
+
"extra_body": {
|
|
774
|
+
"custom": "value",
|
|
775
|
+
"chat_template_kwargs": {
|
|
776
|
+
"clear_thinking": True,
|
|
777
|
+
"reasoning_effort": "medium",
|
|
778
|
+
"enable_thinking": False,
|
|
779
|
+
},
|
|
780
|
+
},
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
def test_load_toml_config_sampling_args_mirrors_chat_template_kwargs():
|
|
785
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
786
|
+
f.write(
|
|
787
|
+
"[[eval]]\n"
|
|
788
|
+
'env_id = "env1"\n'
|
|
789
|
+
'sampling_args = { max_tokens = 256, reasoning_effort = "high", enable_thinking = true }\n'
|
|
790
|
+
)
|
|
791
|
+
f.flush()
|
|
792
|
+
result = load_toml_config(Path(f.name))
|
|
793
|
+
|
|
794
|
+
assert result[0]["sampling_args"] == {
|
|
795
|
+
"max_tokens": 256,
|
|
796
|
+
"reasoning_effort": "high",
|
|
797
|
+
"enable_thinking": True,
|
|
798
|
+
"extra_body": {
|
|
799
|
+
"chat_template_kwargs": {
|
|
800
|
+
"reasoning_effort": "high",
|
|
801
|
+
"enable_thinking": True,
|
|
802
|
+
}
|
|
803
|
+
},
|
|
804
|
+
}
|
|
805
|
+
|
|
806
|
+
|
|
807
|
+
def test_cli_toml_eval_sampling_section_pipes_thinking_args(monkeypatch, run_cli):
|
|
808
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
809
|
+
f.write(
|
|
810
|
+
"[[eval]]\n"
|
|
811
|
+
'env_id = "env1"\n\n'
|
|
812
|
+
"[eval.sampling]\n"
|
|
813
|
+
"max_tokens = 512\n"
|
|
814
|
+
'reasoning_effort = "low"\n'
|
|
815
|
+
"enable_thinking = true\n"
|
|
816
|
+
)
|
|
817
|
+
f.flush()
|
|
818
|
+
captured = run_cli(
|
|
819
|
+
monkeypatch,
|
|
820
|
+
{
|
|
821
|
+
"env_id_or_config": f.name,
|
|
822
|
+
},
|
|
823
|
+
)
|
|
824
|
+
|
|
825
|
+
assert captured["sampling_args"] == {
|
|
826
|
+
"max_tokens": 512,
|
|
827
|
+
"reasoning_effort": "low",
|
|
828
|
+
"enable_thinking": True,
|
|
829
|
+
"extra_body": {
|
|
830
|
+
"chat_template_kwargs": {
|
|
831
|
+
"reasoning_effort": "low",
|
|
832
|
+
"enable_thinking": True,
|
|
833
|
+
}
|
|
834
|
+
},
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
|
|
723
838
|
def test_load_toml_config_with_args_taskset_harness():
|
|
724
839
|
"""args/taskset/harness sections normalize into load_environment kwargs."""
|
|
725
840
|
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
@@ -728,8 +843,10 @@ def test_load_toml_config_with_args_taskset_harness():
|
|
|
728
843
|
"[eval.args]\n"
|
|
729
844
|
'split = "train"\n\n'
|
|
730
845
|
"[eval.taskset]\n"
|
|
846
|
+
'id = "user/taskset-package"\n'
|
|
731
847
|
"num_examples = 10\n\n"
|
|
732
848
|
"[eval.harness]\n"
|
|
849
|
+
'id = "user/harness-package"\n'
|
|
733
850
|
"max_turns = 5\n"
|
|
734
851
|
)
|
|
735
852
|
f.flush()
|
|
@@ -740,8 +857,8 @@ def test_load_toml_config_with_args_taskset_harness():
|
|
|
740
857
|
assert result[0]["env_args"] == {
|
|
741
858
|
"split": "train",
|
|
742
859
|
"config": {
|
|
743
|
-
"taskset": {"num_examples": 10},
|
|
744
|
-
"harness": {"max_turns": 5},
|
|
860
|
+
"taskset": {"id": "user/taskset-package", "num_examples": 10},
|
|
861
|
+
"harness": {"id": "user/harness-package", "max_turns": 5},
|
|
745
862
|
},
|
|
746
863
|
}
|
|
747
864
|
assert "args" not in result[0]
|
|
@@ -815,6 +932,28 @@ def test_cli_multi_env_via_toml_config(monkeypatch, run_cli):
|
|
|
815
932
|
assert configs[1].env_id == "env2"
|
|
816
933
|
|
|
817
934
|
|
|
935
|
+
def test_cli_duplicate_env_names_disambiguate_result_paths(monkeypatch, run_cli):
|
|
936
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
937
|
+
f.write(
|
|
938
|
+
'[[eval]]\nid = "env1"\nname = "env1-short"\n'
|
|
939
|
+
"[eval.args]\n"
|
|
940
|
+
'split = "short"\n\n'
|
|
941
|
+
'[[eval]]\nid = "env1"\nname = "env1-long"\n'
|
|
942
|
+
"[eval.args]\n"
|
|
943
|
+
'split = "long"\n'
|
|
944
|
+
)
|
|
945
|
+
f.flush()
|
|
946
|
+
captured = run_cli(monkeypatch, {"env_id_or_config": f.name})
|
|
947
|
+
|
|
948
|
+
configs = captured["configs"]
|
|
949
|
+
assert len(configs) == 2
|
|
950
|
+
assert [config.env_id for config in configs] == ["env1", "env1"]
|
|
951
|
+
assert [config.name for config in configs] == ["env1-short", "env1-long"]
|
|
952
|
+
assert [config.env_args["split"] for config in configs] == ["short", "long"]
|
|
953
|
+
assert get_eval_results_path(configs[0]).parent.name.startswith("env1-short--")
|
|
954
|
+
assert get_eval_results_path(configs[1]).parent.name.startswith("env1-long--")
|
|
955
|
+
|
|
956
|
+
|
|
818
957
|
def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
|
|
819
958
|
"""TOML config ignores CLI args, uses defaults for unspecified values."""
|
|
820
959
|
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
@@ -838,6 +977,16 @@ def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
|
|
|
838
977
|
assert config.rollouts_per_example == 3 # DEFAULT_ROLLOUTS_PER_EXAMPLE
|
|
839
978
|
assert config.max_concurrent == 32 # default
|
|
840
979
|
assert config.sampling_args["max_tokens"] is None # default
|
|
980
|
+
assert config.save_results is True
|
|
981
|
+
|
|
982
|
+
|
|
983
|
+
def test_cli_toml_respects_save_results_false(monkeypatch, run_cli):
|
|
984
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
985
|
+
f.write('[[eval]]\nenv_id = "env1"\nsave_results = false\n')
|
|
986
|
+
f.flush()
|
|
987
|
+
captured = run_cli(monkeypatch, {"env_id_or_config": f.name})
|
|
988
|
+
|
|
989
|
+
assert captured["configs"][0].save_results is False
|
|
841
990
|
|
|
842
991
|
|
|
843
992
|
def test_cli_toml_per_env_num_examples(monkeypatch, run_cli):
|
|
@@ -1212,6 +1361,44 @@ def test_ablation_global_defaults_apply():
|
|
|
1212
1361
|
assert all(c["num_examples"] == 100 for c in configs)
|
|
1213
1362
|
|
|
1214
1363
|
|
|
1364
|
+
def test_ablation_sampling_sweep_merges_with_global_sampling_defaults():
|
|
1365
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
1366
|
+
f.write(
|
|
1367
|
+
"[sampling]\n"
|
|
1368
|
+
"max_tokens = 1024\n"
|
|
1369
|
+
'reasoning_effort = "medium"\n\n'
|
|
1370
|
+
'[[ablation]]\nenv_id = "my-env"\n\n'
|
|
1371
|
+
"[ablation.sweep]\n"
|
|
1372
|
+
"sampling = [{ temperature = 0.0 }, { temperature = 1.0, enable_thinking = false }]\n"
|
|
1373
|
+
)
|
|
1374
|
+
f.flush()
|
|
1375
|
+
configs = load_toml_config(Path(f.name))
|
|
1376
|
+
|
|
1377
|
+
assert len(configs) == 2
|
|
1378
|
+
assert configs[0]["sampling_args"] == {
|
|
1379
|
+
"max_tokens": 1024,
|
|
1380
|
+
"reasoning_effort": "medium",
|
|
1381
|
+
"temperature": 0.0,
|
|
1382
|
+
"extra_body": {
|
|
1383
|
+
"chat_template_kwargs": {
|
|
1384
|
+
"reasoning_effort": "medium",
|
|
1385
|
+
}
|
|
1386
|
+
},
|
|
1387
|
+
}
|
|
1388
|
+
assert configs[1]["sampling_args"] == {
|
|
1389
|
+
"max_tokens": 1024,
|
|
1390
|
+
"reasoning_effort": "medium",
|
|
1391
|
+
"temperature": 1.0,
|
|
1392
|
+
"enable_thinking": False,
|
|
1393
|
+
"extra_body": {
|
|
1394
|
+
"chat_template_kwargs": {
|
|
1395
|
+
"reasoning_effort": "medium",
|
|
1396
|
+
"enable_thinking": False,
|
|
1397
|
+
}
|
|
1398
|
+
},
|
|
1399
|
+
}
|
|
1400
|
+
|
|
1401
|
+
|
|
1215
1402
|
def test_ablation_endpoint_id_override_removes_global_model():
|
|
1216
1403
|
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
1217
1404
|
f.write(
|
|
@@ -11,9 +11,11 @@ def make_config(
|
|
|
11
11
|
independent_scoring: bool = False,
|
|
12
12
|
endpoint_id: str | None = None,
|
|
13
13
|
client_config: ClientConfig | None = None,
|
|
14
|
+
name: str | None = None,
|
|
14
15
|
) -> EvalConfig:
|
|
15
16
|
return EvalConfig(
|
|
16
17
|
env_id="dummy-env",
|
|
18
|
+
name=name,
|
|
17
19
|
env_args={},
|
|
18
20
|
env_dir_path="./environments",
|
|
19
21
|
endpoint_id=endpoint_id,
|
|
@@ -82,6 +84,20 @@ def test_format_client_target_uses_single_resolved_base_url() -> None:
|
|
|
82
84
|
assert EvalDisplay._format_client_target(config) == "http://localhost:8001/v1"
|
|
83
85
|
|
|
84
86
|
|
|
87
|
+
def test_display_uses_eval_name_for_duplicate_env_labels() -> None:
|
|
88
|
+
display = EvalDisplay(
|
|
89
|
+
[
|
|
90
|
+
make_config(max_concurrent=1, name="dummy-env-short"),
|
|
91
|
+
make_config(max_concurrent=1, name="dummy-env-long"),
|
|
92
|
+
]
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
rendered = render_plain(display._make_compact_env_row(0))
|
|
96
|
+
|
|
97
|
+
assert "dummy-env-short" in rendered
|
|
98
|
+
assert "dummy-env-long" not in rendered
|
|
99
|
+
|
|
100
|
+
|
|
85
101
|
def render_plain(renderable) -> str:
|
|
86
102
|
console = Console(width=100, record=True)
|
|
87
103
|
console.print(renderable)
|
|
@@ -87,6 +87,22 @@ def test_print_results_single_rollout(capsys, make_metadata, make_state, make_in
|
|
|
87
87
|
assert "r1: [0.1, 0.2, 0.3]" in captured.out
|
|
88
88
|
|
|
89
89
|
|
|
90
|
+
def test_print_results_includes_eval_name(capsys, make_metadata, make_output):
|
|
91
|
+
from verifiers.utils.eval_utils import print_results
|
|
92
|
+
|
|
93
|
+
metadata = make_metadata(env_id="env1")
|
|
94
|
+
metadata["name"] = "env1-short"
|
|
95
|
+
results = GenerateOutputs(
|
|
96
|
+
outputs=[make_output(example_id=0, reward=1.0)],
|
|
97
|
+
metadata=metadata,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
print_results(results)
|
|
101
|
+
captured = capsys.readouterr()
|
|
102
|
+
|
|
103
|
+
assert "Environment: env1-short (env1)" in captured.out
|
|
104
|
+
|
|
105
|
+
|
|
90
106
|
def test_print_results_three_rollouts(capsys, make_metadata, make_state, make_input):
|
|
91
107
|
"""Test print_results with three rollouts per example."""
|
|
92
108
|
from verifiers.utils.eval_utils import print_results
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import importlib.util
|
|
2
2
|
import inspect
|
|
3
|
+
import sys
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import Any
|
|
5
6
|
|
|
@@ -19,6 +20,7 @@ def _load_mcp_search_module() -> Any:
|
|
|
19
20
|
assert spec.loader is not None
|
|
20
21
|
|
|
21
22
|
module = importlib.util.module_from_spec(spec)
|
|
23
|
+
sys.modules[spec.name] = module
|
|
22
24
|
spec.loader.exec_module(module)
|
|
23
25
|
return module
|
|
24
26
|
|
|
@@ -39,10 +41,20 @@ def test_mcp_search_env_is_v1_only() -> None:
|
|
|
39
41
|
assert env.taskset.config.max_turns == 4
|
|
40
42
|
|
|
41
43
|
|
|
44
|
+
def test_mcp_search_env_preserves_harness_config() -> None:
|
|
45
|
+
module = _load_mcp_search_module()
|
|
46
|
+
|
|
47
|
+
env = module.load_environment(
|
|
48
|
+
config=module.MCPSearchEnvConfig(harness={"max_turns": 7})
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
assert env.harness.config.max_turns == 7
|
|
52
|
+
|
|
53
|
+
|
|
42
54
|
def test_mcp_search_default_taskset_has_stable_non_doc_fixture() -> None:
|
|
43
55
|
module = _load_mcp_search_module()
|
|
44
56
|
|
|
45
|
-
rows = module.
|
|
57
|
+
rows = module.MCPSearchTaskset(config=module.MCPSearchTasksetConfig()).rows()
|
|
46
58
|
|
|
47
59
|
assert len(rows) >= 10
|
|
48
60
|
assert len({row["answer"] for row in rows}) == len(rows)
|