verifiers 0.1.15.dev9__tar.gz → 0.1.15.dev11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/PKG-INFO +17 -19
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/README.md +14 -17
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/pyproject.toml +3 -2
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_client_multimodal_types.py +25 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_eval_cli.py +19 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_harbor_env_mcp.py +43 -89
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_imports.py +31 -0
- verifiers-0.1.15.dev11/tests/test_init_script.py +83 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_langchain_deep_agents_wikispeedia.py +5 -5
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_lean_task.py +10 -8
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_mcp_search_env.py +3 -3
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_opencode_harbor.py +9 -7
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_opencode_rlm_env.py +35 -44
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_openenv_client.py +89 -31
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_prime_plugin.py +5 -5
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_renderer_client.py +45 -14
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_renderer_e2e.py +28 -18
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_rlm_env.py +0 -24
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_bfcl.py +6 -5
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_config_extension.py +845 -271
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_example_counts.py +10 -10
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_harbor_cli.py +58 -30
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_mini_swe_agent.py +11 -10
- verifiers-0.1.15.dev11/tests/test_v1_rlm_swe.py +775 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_runtime_lifecycle.py +228 -145
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_scoring_functions.py +1 -1
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_taskset_bindings.py +65 -64
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_textarena_taskset.py +29 -11
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_wiki_search_v1.py +3 -3
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_wordle_v1_env.py +11 -2
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/__init__.py +19 -47
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/plugins/prime.py +1 -5
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/clients/anthropic_messages_client.py +27 -44
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/clients/client.py +12 -14
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/clients/openai_chat_completions_client.py +1 -6
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/clients/openai_chat_completions_token_client.py +14 -17
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/clients/openai_responses_client.py +13 -18
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/clients/renderer_client.py +42 -81
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/environment.py +14 -16
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/composable_env.py +13 -21
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/harnesses/rlm.py +7 -8
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/swe_debug_env.py +12 -19
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/task.py +9 -18
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +5 -18
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +1 -10
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +3 -7
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +2 -2
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +24 -34
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +34 -44
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/gym_env.py +22 -19
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/harbor_env/mcp.py +17 -28
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/mcp_env.py +6 -13
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/opencode_rlm_env.py +9 -16
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/rlm_env.py +40 -62
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/utils/git_checkout_cache.py +13 -31
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/openenv_env.py +75 -126
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/multiturn_env.py +1 -5
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/gepa/gepa_utils.py +6 -14
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rubrics/rubric.py +7 -12
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/build.py +17 -29
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/eval.py +3 -3
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/init.py +98 -67
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/serve/server/env_server.py +17 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/serve/server/env_worker.py +19 -4
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/types.py +18 -5
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/client_utils.py +19 -31
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/data_utils.py +10 -17
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/display_utils.py +2 -6
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/env_utils.py +96 -21
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/eval_utils.py +21 -38
- verifiers-0.1.15.dev11/verifiers/utils/import_utils.py +11 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/install_utils.py +10 -11
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/interception_utils.py +9 -11
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/logging_utils.py +11 -17
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/message_utils.py +9 -14
- verifiers-0.1.15.dev11/verifiers/utils/response_utils.py +102 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/save_utils.py +13 -21
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/thread_utils.py +2 -15
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/threaded_sandbox_client.py +2 -2
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +29 -45
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/README.md +84 -94
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/RE_MIGRATION.md +53 -46
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/__init__.py +10 -35
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/config.py +30 -5
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/env.py +4 -26
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/harness.py +37 -36
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/harnesses/command.py +17 -21
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/harnesses/opencode.py +1 -1
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/harnesses/pi.py +6 -10
- verifiers-0.1.15.dev11/verifiers/v1/packages/harnesses/rlm.py +601 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/harnesses/terminus_2.py +8 -13
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/tasksets/harbor.py +126 -113
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/tasksets/textarena.py +74 -50
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/runtime.py +24 -37
- verifiers-0.1.15.dev11/verifiers/v1/taskset.py +207 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/toolset.py +2 -1
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/types.py +4 -3
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/config_utils.py +52 -3
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/program_utils.py +2 -1
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/prompt_utils.py +91 -2
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/runtime_owner_utils.py +26 -45
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/sandbox_utils.py +2 -0
- verifiers-0.1.15.dev11/verifiers/v1/utils/taskset_registry_utils.py +115 -0
- verifiers-0.1.15.dev11/verifiers/v1/utils/taskset_utils.py +78 -0
- verifiers-0.1.15.dev9/tests/test_v1_rlm_swe.py +0 -390
- verifiers-0.1.15.dev9/verifiers/utils/import_utils.py +0 -16
- verifiers-0.1.15.dev9/verifiers/utils/response_utils.py +0 -94
- verifiers-0.1.15.dev9/verifiers/v1/packages/harnesses/rlm.py +0 -291
- verifiers-0.1.15.dev9/verifiers/v1/taskset.py +0 -190
- verifiers-0.1.15.dev9/verifiers/v1/utils/taskset_utils.py +0 -90
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/.gitignore +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/LICENSE +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/AGENTS.md +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/README.md +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/conftest.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_build_script.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_client_config.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_env_group.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_env_server.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_environment.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_envs.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_gepa_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_logging.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_parser.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_per_turn_timing.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_pricing_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_rubric.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_types.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_empty_completions.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_endpoint_protocols.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_v1_group_reward_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_wordle_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/decorators.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/errors.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/env_config_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/pricing_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/utils/version_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/harnesses/configs.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/packages/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/state.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/task.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/user.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/artifact_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/binding_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/config_callable_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/endpoint_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/json_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/judge_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/lifecycle_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/mcp_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/object_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/runtime_registry.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/scoring_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/serialization_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/task_freeze_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/timing_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/trajectory_utils.py +0 -0
- {verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/verifiers/v1/utils/usage_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.15.dev9
|
|
3
|
+
Version: 0.1.15.dev11
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -48,6 +48,7 @@ Requires-Dist: tenacity>=8.5.0
|
|
|
48
48
|
Requires-Dist: textual
|
|
49
49
|
Requires-Dist: tomli; python_version < '3.11'
|
|
50
50
|
Requires-Dist: typing-extensions; python_version < '3.12'
|
|
51
|
+
Requires-Dist: uvloop>=0.21.0; sys_platform != 'win32' and sys_platform != 'cygwin' and platform_python_implementation != 'PyPy'
|
|
51
52
|
Provides-Extra: browser
|
|
52
53
|
Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
|
|
53
54
|
Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
|
|
@@ -55,7 +56,7 @@ Requires-Dist: stagehand>=3.0.0; extra == 'browser'
|
|
|
55
56
|
Provides-Extra: openenv
|
|
56
57
|
Requires-Dist: openenv-core>=0.3.0; extra == 'openenv'
|
|
57
58
|
Provides-Extra: renderers
|
|
58
|
-
Requires-Dist: renderers>=0.1.8.
|
|
59
|
+
Requires-Dist: renderers>=0.1.8.dev28; extra == 'renderers'
|
|
59
60
|
Provides-Extra: rg
|
|
60
61
|
Requires-Dist: reasoning-gym; extra == 'rg'
|
|
61
62
|
Provides-Extra: rl
|
|
@@ -219,19 +220,13 @@ custom harnesses, use the v1 Taskset/Harness path:
|
|
|
219
220
|
# my_env.py
|
|
220
221
|
import verifiers as vf
|
|
221
222
|
|
|
222
|
-
@vf.reward(weight=1.0)
|
|
223
|
-
async def contains_answer(task, state) -> float:
|
|
224
|
-
return float(task["answer"] in str(state.get("completion") or ""))
|
|
225
223
|
|
|
226
224
|
class MyTasksetConfig(vf.TasksetConfig):
|
|
227
225
|
split: str = "train"
|
|
228
226
|
|
|
229
227
|
|
|
230
|
-
class MyTaskset(vf.Taskset):
|
|
231
|
-
|
|
232
|
-
_default_rewards = (contains_answer,)
|
|
233
|
-
|
|
234
|
-
def rows(self) -> list[dict[str, object]]:
|
|
228
|
+
class MyTaskset(vf.Taskset[MyTasksetConfig]):
|
|
229
|
+
def load_tasks(self) -> vf.Tasks:
|
|
235
230
|
rows = [
|
|
236
231
|
{
|
|
237
232
|
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
@@ -242,28 +237,31 @@ class MyTaskset(vf.Taskset):
|
|
|
242
237
|
]
|
|
243
238
|
return [row for row in rows if row["split"] == self.config.split]
|
|
244
239
|
|
|
240
|
+
@vf.reward(weight=1.0)
|
|
241
|
+
async def contains_answer(self, task, state) -> float:
|
|
242
|
+
return float(task["answer"] in str(state.get("completion") or ""))
|
|
243
|
+
|
|
245
244
|
|
|
246
245
|
def load_taskset(config: MyTasksetConfig) -> MyTaskset:
|
|
247
|
-
assert isinstance(config, MyTasksetConfig)
|
|
248
246
|
return MyTaskset(config=config)
|
|
249
247
|
|
|
250
248
|
|
|
251
249
|
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
252
|
-
|
|
253
|
-
assert isinstance(taskset_config, MyTasksetConfig)
|
|
254
|
-
return vf.Env(taskset=load_taskset(taskset_config))
|
|
250
|
+
return vf.Env(taskset=vf.load_taskset(config=config.taskset))
|
|
255
251
|
```
|
|
256
252
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
257
253
|
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
258
|
-
Reusable taskset and harness packages live under `verifiers.v1.packages
|
|
259
|
-
|
|
260
|
-
For example, Harbor task directories can run through the bundled OpenCode CLI
|
|
254
|
+
Reusable taskset and harness packages live under `verifiers.v1.packages`. For
|
|
255
|
+
example, Harbor task directories can run through the bundled OpenCode CLI
|
|
261
256
|
harness with:
|
|
262
257
|
|
|
263
258
|
```python
|
|
259
|
+
from verifiers.v1.packages.harnesses import OpenCode, OpenCodeConfig
|
|
260
|
+
from verifiers.v1.packages.tasksets import HarborTaskset, HarborTasksetConfig
|
|
261
|
+
|
|
264
262
|
env = vf.Env(
|
|
265
|
-
taskset=
|
|
266
|
-
harness=
|
|
263
|
+
taskset=HarborTaskset(config=HarborTasksetConfig()),
|
|
264
|
+
harness=OpenCode(config=OpenCodeConfig()),
|
|
267
265
|
)
|
|
268
266
|
```
|
|
269
267
|
|
|
@@ -143,19 +143,13 @@ custom harnesses, use the v1 Taskset/Harness path:
|
|
|
143
143
|
# my_env.py
|
|
144
144
|
import verifiers as vf
|
|
145
145
|
|
|
146
|
-
@vf.reward(weight=1.0)
|
|
147
|
-
async def contains_answer(task, state) -> float:
|
|
148
|
-
return float(task["answer"] in str(state.get("completion") or ""))
|
|
149
146
|
|
|
150
147
|
class MyTasksetConfig(vf.TasksetConfig):
|
|
151
148
|
split: str = "train"
|
|
152
149
|
|
|
153
150
|
|
|
154
|
-
class MyTaskset(vf.Taskset):
|
|
155
|
-
|
|
156
|
-
_default_rewards = (contains_answer,)
|
|
157
|
-
|
|
158
|
-
def rows(self) -> list[dict[str, object]]:
|
|
151
|
+
class MyTaskset(vf.Taskset[MyTasksetConfig]):
|
|
152
|
+
def load_tasks(self) -> vf.Tasks:
|
|
159
153
|
rows = [
|
|
160
154
|
{
|
|
161
155
|
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
@@ -166,28 +160,31 @@ class MyTaskset(vf.Taskset):
|
|
|
166
160
|
]
|
|
167
161
|
return [row for row in rows if row["split"] == self.config.split]
|
|
168
162
|
|
|
163
|
+
@vf.reward(weight=1.0)
|
|
164
|
+
async def contains_answer(self, task, state) -> float:
|
|
165
|
+
return float(task["answer"] in str(state.get("completion") or ""))
|
|
166
|
+
|
|
169
167
|
|
|
170
168
|
def load_taskset(config: MyTasksetConfig) -> MyTaskset:
|
|
171
|
-
assert isinstance(config, MyTasksetConfig)
|
|
172
169
|
return MyTaskset(config=config)
|
|
173
170
|
|
|
174
171
|
|
|
175
172
|
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
176
|
-
|
|
177
|
-
assert isinstance(taskset_config, MyTasksetConfig)
|
|
178
|
-
return vf.Env(taskset=load_taskset(taskset_config))
|
|
173
|
+
return vf.Env(taskset=vf.load_taskset(config=config.taskset))
|
|
179
174
|
```
|
|
180
175
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
181
176
|
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
182
|
-
Reusable taskset and harness packages live under `verifiers.v1.packages
|
|
183
|
-
|
|
184
|
-
For example, Harbor task directories can run through the bundled OpenCode CLI
|
|
177
|
+
Reusable taskset and harness packages live under `verifiers.v1.packages`. For
|
|
178
|
+
example, Harbor task directories can run through the bundled OpenCode CLI
|
|
185
179
|
harness with:
|
|
186
180
|
|
|
187
181
|
```python
|
|
182
|
+
from verifiers.v1.packages.harnesses import OpenCode, OpenCodeConfig
|
|
183
|
+
from verifiers.v1.packages.tasksets import HarborTaskset, HarborTasksetConfig
|
|
184
|
+
|
|
188
185
|
env = vf.Env(
|
|
189
|
-
taskset=
|
|
190
|
-
harness=
|
|
186
|
+
taskset=HarborTaskset(config=HarborTasksetConfig()),
|
|
187
|
+
harness=OpenCode(config=OpenCodeConfig()),
|
|
191
188
|
)
|
|
192
189
|
```
|
|
193
190
|
|
|
@@ -54,6 +54,7 @@ dependencies = [
|
|
|
54
54
|
"regex<2026.4.4",
|
|
55
55
|
"httpx>=0.27.0",
|
|
56
56
|
"prime-pydantic-config[toml]",
|
|
57
|
+
"uvloop>=0.21.0; sys_platform != 'win32' and sys_platform != 'cygwin' and platform_python_implementation != 'PyPy'",
|
|
57
58
|
]
|
|
58
59
|
|
|
59
60
|
[dependency-groups]
|
|
@@ -73,7 +74,7 @@ dev = [
|
|
|
73
74
|
"aiohttp>=3.9.0",
|
|
74
75
|
"python-dotenv>=1.0.0",
|
|
75
76
|
"nltk",
|
|
76
|
-
"renderers>=0.1.8.
|
|
77
|
+
"renderers>=0.1.8.dev28",
|
|
77
78
|
]
|
|
78
79
|
policy = [
|
|
79
80
|
"semgrep>=1.150.0",
|
|
@@ -96,7 +97,7 @@ openenv = [
|
|
|
96
97
|
"openenv-core>=0.3.0",
|
|
97
98
|
]
|
|
98
99
|
renderers = [
|
|
99
|
-
"renderers>=0.1.8.
|
|
100
|
+
"renderers>=0.1.8.dev28",
|
|
100
101
|
]
|
|
101
102
|
rl = [
|
|
102
103
|
"torch>=2.8.0,<2.9.0",
|
|
@@ -98,6 +98,31 @@ async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
|
|
|
98
98
|
]
|
|
99
99
|
|
|
100
100
|
|
|
101
|
+
@pytest.mark.asyncio
|
|
102
|
+
async def test_anthropic_to_native_prompt_marks_unsupported_images_in_mixed_content():
|
|
103
|
+
pytest.importorskip("anthropic")
|
|
104
|
+
from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
|
|
105
|
+
|
|
106
|
+
client = AnthropicMessagesClient(object())
|
|
107
|
+
messages = [
|
|
108
|
+
UserMessage(
|
|
109
|
+
content=[
|
|
110
|
+
TextContentPart(text="describe this"),
|
|
111
|
+
ImageUrlContentPart(
|
|
112
|
+
image_url=ImageUrlSource(url="https://example.com/image.png")
|
|
113
|
+
),
|
|
114
|
+
]
|
|
115
|
+
)
|
|
116
|
+
]
|
|
117
|
+
|
|
118
|
+
prompt, kwargs = await client.to_native_prompt(messages)
|
|
119
|
+
assert kwargs["system"] == ""
|
|
120
|
+
assert prompt[0]["content"] == [
|
|
121
|
+
{"type": "text", "text": "describe this"},
|
|
122
|
+
{"type": "text", "text": "[image]"},
|
|
123
|
+
]
|
|
124
|
+
|
|
125
|
+
|
|
101
126
|
@pytest.mark.asyncio
|
|
102
127
|
async def test_anthropic_assistant_tool_calls_use_text_chunks_not_model_repr():
|
|
103
128
|
pytest.importorskip("anthropic")
|
|
@@ -288,6 +288,25 @@ def test_cli_headers_table_and_list_merge(monkeypatch, run_cli):
|
|
|
288
288
|
}
|
|
289
289
|
|
|
290
290
|
|
|
291
|
+
def test_cli_defaults_session_header_to_trajectory_id(monkeypatch, run_cli):
|
|
292
|
+
captured = run_cli(monkeypatch, {})
|
|
293
|
+
|
|
294
|
+
assert captured["configs"][0].client_config.extra_headers_from_state == {
|
|
295
|
+
"X-Session-ID": "trajectory_id"
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def test_cli_header_from_state_overrides_default_session_header(monkeypatch, run_cli):
|
|
300
|
+
captured = run_cli(
|
|
301
|
+
monkeypatch,
|
|
302
|
+
{"header_from_state": ["X-Session-ID: example_id"]},
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
assert captured["configs"][0].client_config.extra_headers_from_state == {
|
|
306
|
+
"X-Session-ID": "example_id"
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
|
|
291
310
|
def test_cli_registry_headers_merged_with_eval_toml(tmp_path, monkeypatch, run_cli):
|
|
292
311
|
cfg = tmp_path / "eval.toml"
|
|
293
312
|
cfg.write_text(
|
|
@@ -239,83 +239,20 @@ class TestLaunchCommandResolution:
|
|
|
239
239
|
)
|
|
240
240
|
|
|
241
241
|
|
|
242
|
-
class TestStartStopCommands:
|
|
243
|
-
def test_start_cmd_tracks_process_group_leader_pid(self):
|
|
244
|
-
"""Start command must capture `$!` (the backgrounded pgroup leader),
|
|
245
|
-
not `$$` (the outer shell), and must end with `wait` so the recorded
|
|
246
|
-
exit code reflects the launched daemon's.
|
|
247
|
-
"""
|
|
248
|
-
cmd = _DummyEnv()._mcp_start_cmd("svc", "python -u /opt/x/server.py")
|
|
249
|
-
assert "echo $!" in cmd
|
|
250
|
-
assert "echo $$" not in cmd
|
|
251
|
-
assert cmd.rstrip().endswith("wait")
|
|
252
|
-
assert "/tmp/harbor-mcp-svc.pid" in cmd
|
|
253
|
-
assert "python -u /opt/x/server.py" in cmd
|
|
254
|
-
|
|
255
|
-
def test_start_cmd_wraps_in_setsid_for_process_group_semantics(self):
|
|
256
|
-
"""Wrapping the user's command in `setsid sh -c ...` is what makes
|
|
257
|
-
`$!` a process-group leader, so `kill -9 -$PID` can reap the whole
|
|
258
|
-
daemon tree on stop. Compound commands (e.g. `cd /x && python y.py`)
|
|
259
|
-
must be preserved verbatim inside the sh -c payload so their own
|
|
260
|
-
semantics are unchanged."""
|
|
261
|
-
cmd = _DummyEnv()._mcp_start_cmd("svc", "cd /opt && python server.py")
|
|
262
|
-
assert "setsid sh -c " in cmd
|
|
263
|
-
assert "'cd /opt && python server.py'" in cmd
|
|
264
|
-
|
|
265
|
-
def test_stop_cmd_is_one_line_sigkill_plus_rm(self):
|
|
266
|
-
"""Default: one SIGKILL to the process group, then unlink the
|
|
267
|
-
pidfile — no poll/sleep loop."""
|
|
268
|
-
cmd = _DummyEnv()._mcp_stop_cmd("svc")
|
|
269
|
-
assert "kill -9" in cmd
|
|
270
|
-
assert "rm -f" in cmd
|
|
271
|
-
assert "/tmp/harbor-mcp-svc.pid" in cmd
|
|
272
|
-
assert "kill -0" not in cmd
|
|
273
|
-
assert "sleep" not in cmd
|
|
274
|
-
assert "\n" not in cmd
|
|
275
|
-
assert len(cmd) < 120
|
|
276
|
-
|
|
277
|
-
def test_stop_cmd_targets_process_group_not_single_pid(self):
|
|
278
|
-
"""The `-` prefix on the `$(cat …)` expansion is what turns kill(1)
|
|
279
|
-
into a process-group kill — without it, SIGKILL only lands on the
|
|
280
|
-
wrapping shell and e.g. a `python` child spawned via `cd && python`
|
|
281
|
-
leaks as an orphan."""
|
|
282
|
-
cmd = _DummyEnv()._mcp_stop_cmd("svc")
|
|
283
|
-
assert 'kill -9 -"$(cat' in cmd
|
|
284
|
-
|
|
285
|
-
def test_server_name_with_shell_metachars_is_quoted(self):
|
|
286
|
-
"""Server name is task-author-controlled; every pidfile reference
|
|
287
|
-
must appear only inside single-quoted spans."""
|
|
288
|
-
env = _DummyEnv()
|
|
289
|
-
unquoted = "/tmp/harbor-mcp-evil$(whoami).pid"
|
|
290
|
-
quoted = f"'{unquoted}'"
|
|
291
|
-
for cmd in (
|
|
292
|
-
env._mcp_start_cmd("evil$(whoami)", "x"),
|
|
293
|
-
env._mcp_stop_cmd("evil$(whoami)"),
|
|
294
|
-
):
|
|
295
|
-
assert quoted in cmd
|
|
296
|
-
# Every raw occurrence must be inside an already-quoted span.
|
|
297
|
-
assert cmd.count(unquoted) == cmd.count(quoted)
|
|
298
|
-
|
|
299
|
-
def test_launch_command_with_shell_metachars_is_quoted(self):
|
|
300
|
-
"""Same for the user's launch command: it's task-author-controlled,
|
|
301
|
-
must land inside a single-quoted span once wrapped in `sh -c`."""
|
|
302
|
-
env = _DummyEnv()
|
|
303
|
-
evil_cmd = "python -c 'print(1)' && touch /pwned"
|
|
304
|
-
quoted = f"'{evil_cmd}'".replace("'", "'\"'\"'")
|
|
305
|
-
# shlex-quoted output contains the evil string only inside quotes.
|
|
306
|
-
cmd = env._mcp_start_cmd("svc", evil_cmd)
|
|
307
|
-
assert "setsid sh -c " in cmd
|
|
308
|
-
# No unquoted `&& touch /pwned` outside a single-quoted span.
|
|
309
|
-
assert cmd.count(evil_cmd) == 0 or quoted in cmd
|
|
310
|
-
|
|
311
|
-
|
|
312
242
|
class TestLifecycle:
|
|
313
243
|
@pytest.mark.asyncio
|
|
314
244
|
async def test_starts_server_with_registered_launch_command(self):
|
|
315
|
-
env = _DummyEnv(mcp_launch_commands={"svc": "python server.py"})
|
|
245
|
+
env = _DummyEnv(mcp_launch_commands={"svc": "cd /opt && python server.py"})
|
|
316
246
|
state: dict[str, Any] = {}
|
|
317
247
|
await env.start_mcp_servers("sbx", _config_with_server(), state)
|
|
318
248
|
assert set(state["harbor_mcp_jobs"].keys()) == {"svc"}
|
|
249
|
+
_, start_cmd = env.started_jobs[0]
|
|
250
|
+
assert "echo $!" in start_cmd
|
|
251
|
+
assert "echo $$" not in start_cmd
|
|
252
|
+
assert start_cmd.rstrip().endswith("wait")
|
|
253
|
+
assert "/tmp/harbor-mcp-svc.pid" in start_cmd
|
|
254
|
+
assert "setsid sh -c " in start_cmd
|
|
255
|
+
assert "'cd /opt && python server.py'" in start_cmd
|
|
319
256
|
|
|
320
257
|
@pytest.mark.asyncio
|
|
321
258
|
async def test_externally_managed_server_is_skipped(self):
|
|
@@ -342,9 +279,38 @@ class TestLifecycle:
|
|
|
342
279
|
if "kill -9" in c.args[1]
|
|
343
280
|
]
|
|
344
281
|
assert len(stop_calls) == 1
|
|
345
|
-
|
|
282
|
+
stop_cmd = stop_calls[0]
|
|
283
|
+
assert "harbor-mcp-svc.pid" in stop_cmd
|
|
284
|
+
assert 'kill -9 -"$(cat' in stop_cmd
|
|
285
|
+
assert "rm -f" in stop_cmd
|
|
286
|
+
assert "kill -0" not in stop_cmd
|
|
287
|
+
assert "sleep" not in stop_cmd
|
|
288
|
+
assert "\n" not in stop_cmd
|
|
289
|
+
assert len(stop_cmd) < 120
|
|
346
290
|
assert state["harbor_mcp_jobs"] == {}
|
|
347
291
|
|
|
292
|
+
@pytest.mark.asyncio
|
|
293
|
+
async def test_launch_and_stop_commands_quote_task_authored_shell_text(self):
|
|
294
|
+
env = _DummyEnv(
|
|
295
|
+
mcp_launch_commands={
|
|
296
|
+
"evil$(whoami)": "python -c 'print(1)' && touch /pwned"
|
|
297
|
+
}
|
|
298
|
+
)
|
|
299
|
+
state: dict[str, Any] = {"sandbox_id": "sbx"}
|
|
300
|
+
await env.start_mcp_servers(
|
|
301
|
+
"sbx", _config_with_server(name="evil$(whoami)"), state
|
|
302
|
+
)
|
|
303
|
+
_, start_cmd = env.started_jobs[0]
|
|
304
|
+
quoted_pidfile = "'/tmp/harbor-mcp-evil$(whoami).pid'"
|
|
305
|
+
assert quoted_pidfile in start_cmd
|
|
306
|
+
assert "setsid sh -c " in start_cmd
|
|
307
|
+
assert "'\"'\"'print(1)'\"'\"'" in start_cmd
|
|
308
|
+
|
|
309
|
+
env.sandbox_client.execute_command.reset_mock()
|
|
310
|
+
await env.stop_mcp_servers(state)
|
|
311
|
+
stop_cmd = env.sandbox_client.execute_command.call_args.args[1]
|
|
312
|
+
assert quoted_pidfile in stop_cmd
|
|
313
|
+
|
|
348
314
|
@pytest.mark.asyncio
|
|
349
315
|
async def test_stop_without_sandbox_id_is_a_noop(self):
|
|
350
316
|
env = _DummyEnv()
|
|
@@ -530,22 +496,6 @@ class TestBackgroundJob:
|
|
|
530
496
|
class TestHealthCheck:
|
|
531
497
|
"""Readiness probing — default `/proc/net/tcp` + user override."""
|
|
532
498
|
|
|
533
|
-
def test_default_probe_shape(self):
|
|
534
|
-
"""Portable awk on /proc/net/tcp{,6}, matching LISTEN state only,
|
|
535
|
-
with no bash-ism dependency like /dev/tcp."""
|
|
536
|
-
cmd = HarborMCPMixin._default_mcp_health_cmd(8000)
|
|
537
|
-
assert "bash" not in cmd and "/dev/tcp" not in cmd
|
|
538
|
-
assert "/proc/net/tcp" in cmd and "/proc/net/tcp6" in cmd
|
|
539
|
-
assert '$4 == "0A"' in cmd # LISTEN state
|
|
540
|
-
|
|
541
|
-
@pytest.mark.parametrize(
|
|
542
|
-
"port,hex_expected",
|
|
543
|
-
[(80, "0050"), (8000, "1F40"), (65535, "FFFF"), (1, "0001")],
|
|
544
|
-
)
|
|
545
|
-
def test_default_probe_encodes_port_as_uppercase_hex(self, port, hex_expected):
|
|
546
|
-
cmd = HarborMCPMixin._default_mcp_health_cmd(port)
|
|
547
|
-
assert f":{hex_expected}$" in cmd
|
|
548
|
-
|
|
549
499
|
@pytest.mark.asyncio
|
|
550
500
|
async def test_custom_healthcheck_command_templated_with_port(self):
|
|
551
501
|
env = _DummyEnv(mcp_launch_commands={"svc": "python x"})
|
|
@@ -580,7 +530,11 @@ class TestHealthCheck:
|
|
|
580
530
|
if "/proc/net/tcp" in c.args[1]
|
|
581
531
|
]
|
|
582
532
|
assert len(health_calls) == 1
|
|
583
|
-
|
|
533
|
+
health_cmd = health_calls[0]
|
|
534
|
+
assert "bash" not in health_cmd and "/dev/tcp" not in health_cmd
|
|
535
|
+
assert "/proc/net/tcp6" in health_cmd
|
|
536
|
+
assert '$4 == "0A"' in health_cmd
|
|
537
|
+
assert ":1F40$" in health_cmd
|
|
584
538
|
|
|
585
539
|
@pytest.mark.asyncio
|
|
586
540
|
async def test_probe_timeout_is_respected(self):
|
|
@@ -4,6 +4,37 @@ import sys
|
|
|
4
4
|
import verifiers
|
|
5
5
|
|
|
6
6
|
|
|
7
|
+
PACKAGE_SYMBOLS = {
|
|
8
|
+
"HarborTaskset",
|
|
9
|
+
"HarborTasksetConfig",
|
|
10
|
+
"MiniSWEAgent",
|
|
11
|
+
"MiniSWEAgentConfig",
|
|
12
|
+
"OpenCode",
|
|
13
|
+
"OpenCodeConfig",
|
|
14
|
+
"Pi",
|
|
15
|
+
"PiConfig",
|
|
16
|
+
"RLM",
|
|
17
|
+
"RLMConfig",
|
|
18
|
+
"Terminus2",
|
|
19
|
+
"Terminus2Config",
|
|
20
|
+
"TextArenaTaskset",
|
|
21
|
+
"TextArenaTasksetConfig",
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_package_tasksets_and_harnesses_are_not_root_exports():
|
|
26
|
+
for name in PACKAGE_SYMBOLS:
|
|
27
|
+
assert name not in verifiers.__all__
|
|
28
|
+
assert not hasattr(verifiers, name)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_package_tasksets_and_harnesses_are_not_v1_exports():
|
|
32
|
+
v1 = importlib.import_module("verifiers.v1")
|
|
33
|
+
for name in PACKAGE_SYMBOLS:
|
|
34
|
+
assert name not in v1.__all__
|
|
35
|
+
assert not hasattr(v1, name)
|
|
36
|
+
|
|
37
|
+
|
|
7
38
|
def test_v1_taskset_imports_do_not_import_textarena():
|
|
8
39
|
textarena_module = "verifiers.v1.packages.tasksets.textarena"
|
|
9
40
|
sys.modules.pop(textarena_module, None)
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
import verifiers as vf
|
|
5
|
+
from verifiers.scripts.init import init_environment
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def read_env_file(root: Path, env_id: str) -> str:
|
|
9
|
+
module_name = env_id.replace("-", "_")
|
|
10
|
+
return (root / module_name / f"{module_name}.py").read_text()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_init_default_writes_v0_stub(tmp_path: Path) -> None:
|
|
14
|
+
root = init_environment("foo", path=str(tmp_path))
|
|
15
|
+
content = read_env_file(tmp_path, "foo")
|
|
16
|
+
|
|
17
|
+
assert root == tmp_path / "foo"
|
|
18
|
+
assert "def load_environment(**kwargs) -> vf.Environment:" in content
|
|
19
|
+
assert "NotImplementedError" in content
|
|
20
|
+
assert "load_taskset" not in content
|
|
21
|
+
assert "EnvTaskset" not in content
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_init_v1_writes_thin_taskset_template(tmp_path: Path) -> None:
|
|
25
|
+
init_environment("bar", path=str(tmp_path), v1=True)
|
|
26
|
+
content = read_env_file(tmp_path, "bar")
|
|
27
|
+
|
|
28
|
+
assert "class BarTasksetConfig(vf.TasksetConfig):" in content
|
|
29
|
+
assert "class BarTaskset(vf.Taskset[BarTasksetConfig]):" in content
|
|
30
|
+
assert "def load_tasks(self) -> vf.Tasks:" in content
|
|
31
|
+
assert "def load_system_prompt(self) -> vf.SystemPrompt:" in content
|
|
32
|
+
assert "async def correct_answer(self, task: vf.Task, state: vf.State)" in content
|
|
33
|
+
assert "def load_taskset(config: BarTasksetConfig) -> BarTaskset:" in content
|
|
34
|
+
assert "return BarTaskset(config=config)" in content
|
|
35
|
+
assert "vf.load_taskset(config=config.taskset)" in content
|
|
36
|
+
assert "class EnvTaskset(" not in content
|
|
37
|
+
assert "_default_" not in content
|
|
38
|
+
assert "assert isinstance" not in content
|
|
39
|
+
assert 'tasks: str = "load_tasks"' not in content
|
|
40
|
+
assert 'rewards: list[str] = ["correct_answer"]' not in content
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_init_v1_template_loads_with_vf_load_environment(
|
|
44
|
+
tmp_path: Path, monkeypatch
|
|
45
|
+
) -> None:
|
|
46
|
+
init_environment("loadable-v1", path=str(tmp_path), v1=True)
|
|
47
|
+
monkeypatch.syspath_prepend(str(tmp_path / "loadable_v1"))
|
|
48
|
+
|
|
49
|
+
with pytest.raises(RuntimeError, match="Load the system prompt"):
|
|
50
|
+
vf.load_environment("loadable-v1")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_init_v1_with_harness_writes_harness_stub(tmp_path: Path) -> None:
|
|
54
|
+
init_environment("baz", path=str(tmp_path), v1=True, with_harness=True)
|
|
55
|
+
content = read_env_file(tmp_path, "baz")
|
|
56
|
+
|
|
57
|
+
assert "class BazTaskset(vf.Taskset[BazTasksetConfig]):" in content
|
|
58
|
+
assert "class BazHarnessConfig(vf.HarnessConfig):" in content
|
|
59
|
+
assert "class BazHarness(vf.Harness):" in content
|
|
60
|
+
assert "def load_harness(config: BazHarnessConfig) -> BazHarness:" in content
|
|
61
|
+
assert "vf.load_harness(config=config.harness)" in content
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_init_with_harness_without_v1_warns_and_uses_v0(tmp_path: Path, capsys) -> None:
|
|
65
|
+
init_environment("plain", path=str(tmp_path), with_harness=True)
|
|
66
|
+
content = read_env_file(tmp_path, "plain")
|
|
67
|
+
captured = capsys.readouterr()
|
|
68
|
+
|
|
69
|
+
assert "--with-harness only applies with --v1; ignoring." in captured.out
|
|
70
|
+
assert "def load_environment(**kwargs) -> vf.Environment:" in content
|
|
71
|
+
assert "load_harness" not in content
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_init_v1_multifile_exports_component_loaders(tmp_path: Path) -> None:
|
|
75
|
+
init_environment("pkg-env", path=str(tmp_path), v1=True, multi_file=True)
|
|
76
|
+
package_dir = tmp_path / "pkg_env" / "pkg_env"
|
|
77
|
+
init_content = (package_dir / "__init__.py").read_text()
|
|
78
|
+
env_content = (package_dir / "pkg_env.py").read_text()
|
|
79
|
+
|
|
80
|
+
assert "from .pkg_env import load_environment, load_taskset" in init_content
|
|
81
|
+
assert "__all__ = ['load_environment', 'load_taskset']" in init_content
|
|
82
|
+
assert "class PkgEnvTaskset(vf.Taskset[PkgEnvTasksetConfig]):" in env_content
|
|
83
|
+
assert "return PkgEnvTaskset(config=config)" in env_content
|
{verifiers-0.1.15.dev9 → verifiers-0.1.15.dev11}/tests/test_langchain_deep_agents_wikispeedia.py
RENAMED
|
@@ -91,8 +91,8 @@ def test_wikispeedia_env_config_reaches_taskset_and_harness(
|
|
|
91
91
|
)
|
|
92
92
|
)
|
|
93
93
|
|
|
94
|
-
train_rows =
|
|
95
|
-
eval_rows =
|
|
94
|
+
train_rows = [env.taskset.to_task(row) for row in env.taskset.get_dataset()]
|
|
95
|
+
eval_rows = [env.taskset.to_task(row) for row in env.taskset.get_eval_dataset()]
|
|
96
96
|
|
|
97
97
|
assert len(train_rows) == 2
|
|
98
98
|
assert len(eval_rows) == 1
|
|
@@ -136,8 +136,8 @@ def test_wikispeedia_taskset_sources_use_disjoint_target_split(
|
|
|
136
136
|
)
|
|
137
137
|
)
|
|
138
138
|
|
|
139
|
-
train_rows =
|
|
140
|
-
eval_rows =
|
|
139
|
+
train_rows = [taskset.to_task(row) for row in taskset.get_dataset()]
|
|
140
|
+
eval_rows = [taskset.to_task(row) for row in taskset.get_eval_dataset()]
|
|
141
141
|
|
|
142
142
|
assert len(train_rows) == 2
|
|
143
143
|
assert len(eval_rows) == 1
|
|
@@ -218,7 +218,7 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime(
|
|
|
218
218
|
),
|
|
219
219
|
harness=module.load_harness(config=module.WikispeediaHarnessConfig()),
|
|
220
220
|
)
|
|
221
|
-
task =
|
|
221
|
+
task = env.taskset.to_task(env.taskset.get_dataset()[0])
|
|
222
222
|
state = module.vf.State.for_task(task)
|
|
223
223
|
state = await env.harness.setup_state(task, state)
|
|
224
224
|
|
|
@@ -9,10 +9,8 @@ from verifiers.envs.experimental.composable.tasksets.lean.lean_task import (
|
|
|
9
9
|
LEAN_GUARD_END_MARKER,
|
|
10
10
|
LeanRubric,
|
|
11
11
|
_build_starter_file,
|
|
12
|
-
_expected_protected_region,
|
|
13
12
|
_extract_protected_region,
|
|
14
13
|
_normalize_signature,
|
|
15
|
-
_wrap_with_lean_guard,
|
|
16
14
|
)
|
|
17
15
|
|
|
18
16
|
|
|
@@ -80,11 +78,13 @@ class TestNormalizeSignature:
|
|
|
80
78
|
)
|
|
81
79
|
|
|
82
80
|
|
|
83
|
-
class TestWrapWithLeanGuard:
|
|
81
|
+
class TestBuildStarterFileLeanGuardLayout:
|
|
84
82
|
def test_marker_layout(self) -> None:
|
|
85
83
|
signature = "theorem foo (x : ℝ) : x = x := by"
|
|
86
|
-
|
|
87
|
-
|
|
84
|
+
starter = _build_starter_file(
|
|
85
|
+
{"formal_statement": signature, "header": "", "imports": ""}
|
|
86
|
+
)
|
|
87
|
+
assert starter == (
|
|
88
88
|
"-- lean-guard: begin protected\n"
|
|
89
89
|
"theorem foo (x : ℝ) : x = x := by\n"
|
|
90
90
|
"-- lean-guard: end protected\n"
|
|
@@ -93,8 +93,10 @@ class TestWrapWithLeanGuard:
|
|
|
93
93
|
|
|
94
94
|
def test_round_trip_via_extract(self) -> None:
|
|
95
95
|
signature = "theorem foo : True := by"
|
|
96
|
-
|
|
97
|
-
|
|
96
|
+
starter = _build_starter_file(
|
|
97
|
+
{"formal_statement": signature, "header": "", "imports": ""}
|
|
98
|
+
)
|
|
99
|
+
region = _extract_protected_region(starter)
|
|
98
100
|
assert region is not None
|
|
99
101
|
assert LEAN_GUARD_BEGIN_MARKER in region
|
|
100
102
|
assert LEAN_GUARD_END_MARKER in region
|
|
@@ -212,7 +214,7 @@ class TestBuildStarterFile:
|
|
|
212
214
|
"header": "import Mathlib",
|
|
213
215
|
}
|
|
214
216
|
starter = _build_starter_file(info)
|
|
215
|
-
expected =
|
|
217
|
+
expected = _extract_protected_region(_build_starter_file(info)) or ""
|
|
216
218
|
actual = _extract_protected_region(starter)
|
|
217
219
|
assert expected == actual
|
|
218
220
|
assert expected != ""
|
|
@@ -5,7 +5,7 @@ from pathlib import Path
|
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
7
7
|
import pytest
|
|
8
|
-
import verifiers
|
|
8
|
+
import verifiers as vf
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def _load_mcp_search_module() -> Any:
|
|
@@ -54,7 +54,7 @@ def test_mcp_search_env_preserves_harness_config() -> None:
|
|
|
54
54
|
def test_mcp_search_default_taskset_has_stable_non_doc_fixture() -> None:
|
|
55
55
|
module = _load_mcp_search_module()
|
|
56
56
|
|
|
57
|
-
rows =
|
|
57
|
+
rows = list(module.load_tasks())
|
|
58
58
|
|
|
59
59
|
assert len(rows) >= 10
|
|
60
60
|
assert len({row["answer"] for row in rows}) == len(rows)
|
|
@@ -68,7 +68,7 @@ def test_mcp_search_taskset_accepts_v1_taskset_config() -> None:
|
|
|
68
68
|
env = module.load_environment(
|
|
69
69
|
config=module.MCPSearchEnvConfig(taskset={"max_turns": 3}),
|
|
70
70
|
)
|
|
71
|
-
rows = env.taskset.
|
|
71
|
+
rows = [env.taskset.to_task(row) for row in env.taskset.get_dataset()]
|
|
72
72
|
|
|
73
73
|
assert env.taskset.config.max_turns == 3
|
|
74
74
|
assert all(row["max_turns"] == 3 for row in rows)
|