verifiers 0.1.15.dev8__tar.gz → 0.1.15.dev10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/PKG-INFO +11 -6
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/README.md +9 -5
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/pyproject.toml +1 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_client_multimodal_types.py +25 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_envs.py +7 -3
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_eval_cli.py +19 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_harbor_env_mcp.py +43 -89
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_imports.py +20 -0
- verifiers-0.1.15.dev10/tests/test_init_script.py +80 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_lean_task.py +10 -8
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_opencode_rlm_env.py +35 -44
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_openenv_client.py +89 -31
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_prime_plugin.py +5 -5
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_renderer_client.py +32 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_rlm_env.py +1 -64
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_tool_utils.py +11 -6
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_config_extension.py +273 -695
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_harbor_cli.py +5 -0
- verifiers-0.1.15.dev10/tests/test_v1_rlm_swe.py +780 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_runtime_lifecycle.py +130 -73
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_taskset_bindings.py +84 -2
- verifiers-0.1.15.dev10/tests/test_v1_textarena_taskset.py +219 -0
- verifiers-0.1.15.dev10/tests/test_wordle_v1_env.py +118 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/__init__.py +15 -7
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/plugins/prime.py +1 -5
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/clients/anthropic_messages_client.py +27 -44
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/clients/client.py +12 -14
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/clients/openai_chat_completions_client.py +1 -6
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/clients/openai_chat_completions_token_client.py +14 -17
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/clients/openai_responses_client.py +13 -18
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/clients/renderer_client.py +30 -62
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/env_group.py +0 -16
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/environment.py +14 -27
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/composable_env.py +13 -21
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/harnesses/rlm.py +7 -8
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/swe_debug_env.py +12 -19
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/task.py +9 -18
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +5 -18
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +1 -10
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +3 -7
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +2 -2
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +24 -34
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +34 -44
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/gym_env.py +22 -19
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/harbor_env/mcp.py +17 -28
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/mcp_env.py +6 -13
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/opencode_rlm_env.py +9 -16
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/rlm_env.py +40 -62
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/utils/git_checkout_cache.py +13 -31
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/openenv_env.py +75 -126
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/multiturn_env.py +1 -5
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/sandbox_env.py +1 -5
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/gepa/display.py +2 -2
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/gepa/gepa_utils.py +6 -14
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rubrics/rubric.py +7 -33
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/build.py +17 -29
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/eval.py +3 -3
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/init.py +91 -68
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/serve/server/env_server.py +17 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/serve/server/env_worker.py +19 -4
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/async_utils.py +0 -8
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/client_utils.py +19 -38
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/data_utils.py +10 -69
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/display_utils.py +3 -29
- verifiers-0.1.15.dev10/verifiers/utils/env_utils.py +317 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/error_utils.py +0 -10
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/eval_utils.py +21 -38
- verifiers-0.1.15.dev10/verifiers/utils/import_utils.py +11 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/install_utils.py +10 -11
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/interception_utils.py +9 -11
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/logging_utils.py +11 -17
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/message_utils.py +9 -20
- verifiers-0.1.15.dev10/verifiers/utils/response_utils.py +102 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/save_utils.py +13 -21
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/thread_utils.py +2 -27
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/threaded_sandbox_client.py +2 -2
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/tool_utils.py +1 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +82 -88
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/README.md +63 -68
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/RE_MIGRATION.md +23 -16
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/__init__.py +15 -1
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/config.py +6 -129
- verifiers-0.1.15.dev10/verifiers/v1/env.py +180 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/harness.py +11 -13
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/packages/harnesses/command.py +18 -22
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/packages/harnesses/configs.py +1 -1
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/packages/harnesses/mini_swe_agent.py +3 -3
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/packages/harnesses/opencode.py +4 -4
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/packages/harnesses/pi.py +9 -13
- verifiers-0.1.15.dev10/verifiers/v1/packages/harnesses/rlm.py +601 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/packages/harnesses/terminus_2.py +11 -16
- verifiers-0.1.15.dev10/verifiers/v1/packages/tasksets/__init__.py +17 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/packages/tasksets/harbor.py +4 -2
- verifiers-0.1.15.dev10/verifiers/v1/packages/tasksets/textarena.py +153 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/runtime.py +61 -42
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/taskset.py +12 -13
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/types.py +1 -1
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/config_utils.py +2 -68
- verifiers-0.1.15.dev10/verifiers/v1/utils/object_utils.py +59 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/program_utils.py +2 -1
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/runtime_owner_utils.py +1 -6
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/sandbox_utils.py +2 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/scoring_utils.py +0 -13
- verifiers-0.1.15.dev8/tests/test_v1_rlm_swe.py +0 -390
- verifiers-0.1.15.dev8/verifiers/utils/env_utils.py +0 -195
- verifiers-0.1.15.dev8/verifiers/utils/import_utils.py +0 -16
- verifiers-0.1.15.dev8/verifiers/utils/response_utils.py +0 -94
- verifiers-0.1.15.dev8/verifiers/utils/tunnel_utils.py +0 -266
- verifiers-0.1.15.dev8/verifiers/v1/env.py +0 -351
- verifiers-0.1.15.dev8/verifiers/v1/packages/harnesses/rlm.py +0 -290
- verifiers-0.1.15.dev8/verifiers/v1/packages/tasksets/__init__.py +0 -3
- verifiers-0.1.15.dev8/verifiers/v1/utils/component_utils.py +0 -136
- verifiers-0.1.15.dev8/verifiers/v1/utils/object_utils.py +0 -52
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/.gitignore +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/LICENSE +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/AGENTS.md +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/README.md +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/conftest.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_build_script.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_client_config.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_env_group.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_env_server.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_environment.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_gepa_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_logging.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_mcp_search_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_parser.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_per_turn_timing.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_pricing_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_renderer_e2e.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_rubric.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_types.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_bfcl.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_empty_completions.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_endpoint_protocols.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_example_counts.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_group_reward_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_v1_scoring_functions.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_wiki_search_v1.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_wordle_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/decorators.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/errors.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/types.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/env_config_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/pricing_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/utils/version_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/packages/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/packages/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/state.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/task.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/toolset.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/user.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/artifact_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/binding_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/config_callable_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/endpoint_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/json_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/judge_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/lifecycle_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/mcp_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/prompt_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/runtime_registry.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/serialization_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/task_freeze_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/taskset_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/timing_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/trajectory_utils.py +0 -0
- {verifiers-0.1.15.dev8 → verifiers-0.1.15.dev10}/verifiers/v1/utils/usage_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.15.dev8
|
|
3
|
+
Version: 0.1.15.dev10
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -48,6 +48,7 @@ Requires-Dist: tenacity>=8.5.0
|
|
|
48
48
|
Requires-Dist: textual
|
|
49
49
|
Requires-Dist: tomli; python_version < '3.11'
|
|
50
50
|
Requires-Dist: typing-extensions; python_version < '3.12'
|
|
51
|
+
Requires-Dist: uvloop>=0.21.0; sys_platform != 'win32' and sys_platform != 'cygwin' and platform_python_implementation != 'PyPy'
|
|
51
52
|
Provides-Extra: browser
|
|
52
53
|
Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
|
|
53
54
|
Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
|
|
@@ -227,7 +228,8 @@ class MyTasksetConfig(vf.TasksetConfig):
|
|
|
227
228
|
split: str = "train"
|
|
228
229
|
|
|
229
230
|
|
|
230
|
-
class MyTaskset(vf.Taskset[MyTasksetConfig]):
|
|
231
|
+
class MyTaskset(vf.Taskset):
|
|
232
|
+
config: MyTasksetConfig
|
|
231
233
|
_default_rewards = (contains_answer,)
|
|
232
234
|
|
|
233
235
|
def rows(self) -> list[dict[str, object]]:
|
|
@@ -242,12 +244,15 @@ class MyTaskset(vf.Taskset[MyTasksetConfig]):
|
|
|
242
244
|
return [row for row in rows if row["split"] == self.config.split]
|
|
243
245
|
|
|
244
246
|
|
|
245
|
-
|
|
246
|
-
|
|
247
|
+
def load_taskset(config: MyTasksetConfig) -> MyTaskset:
|
|
248
|
+
assert isinstance(config, MyTasksetConfig)
|
|
249
|
+
return MyTaskset(config=config)
|
|
247
250
|
|
|
248
251
|
|
|
249
|
-
def load_environment(config:
|
|
250
|
-
|
|
252
|
+
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
253
|
+
taskset_config = config.taskset
|
|
254
|
+
assert isinstance(taskset_config, MyTasksetConfig)
|
|
255
|
+
return vf.Env(taskset=load_taskset(taskset_config))
|
|
251
256
|
```
|
|
252
257
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
253
258
|
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
@@ -151,7 +151,8 @@ class MyTasksetConfig(vf.TasksetConfig):
|
|
|
151
151
|
split: str = "train"
|
|
152
152
|
|
|
153
153
|
|
|
154
|
-
class MyTaskset(vf.Taskset[MyTasksetConfig]):
|
|
154
|
+
class MyTaskset(vf.Taskset):
|
|
155
|
+
config: MyTasksetConfig
|
|
155
156
|
_default_rewards = (contains_answer,)
|
|
156
157
|
|
|
157
158
|
def rows(self) -> list[dict[str, object]]:
|
|
@@ -166,12 +167,15 @@ class MyTaskset(vf.Taskset[MyTasksetConfig]):
|
|
|
166
167
|
return [row for row in rows if row["split"] == self.config.split]
|
|
167
168
|
|
|
168
169
|
|
|
169
|
-
|
|
170
|
-
|
|
170
|
+
def load_taskset(config: MyTasksetConfig) -> MyTaskset:
|
|
171
|
+
assert isinstance(config, MyTasksetConfig)
|
|
172
|
+
return MyTaskset(config=config)
|
|
171
173
|
|
|
172
174
|
|
|
173
|
-
def load_environment(config:
|
|
174
|
-
|
|
175
|
+
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
176
|
+
taskset_config = config.taskset
|
|
177
|
+
assert isinstance(taskset_config, MyTasksetConfig)
|
|
178
|
+
return vf.Env(taskset=load_taskset(taskset_config))
|
|
175
179
|
```
|
|
176
180
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
177
181
|
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
@@ -98,6 +98,31 @@ async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
|
|
|
98
98
|
]
|
|
99
99
|
|
|
100
100
|
|
|
101
|
+
@pytest.mark.asyncio
|
|
102
|
+
async def test_anthropic_to_native_prompt_marks_unsupported_images_in_mixed_content():
|
|
103
|
+
pytest.importorskip("anthropic")
|
|
104
|
+
from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
|
|
105
|
+
|
|
106
|
+
client = AnthropicMessagesClient(object())
|
|
107
|
+
messages = [
|
|
108
|
+
UserMessage(
|
|
109
|
+
content=[
|
|
110
|
+
TextContentPart(text="describe this"),
|
|
111
|
+
ImageUrlContentPart(
|
|
112
|
+
image_url=ImageUrlSource(url="https://example.com/image.png")
|
|
113
|
+
),
|
|
114
|
+
]
|
|
115
|
+
)
|
|
116
|
+
]
|
|
117
|
+
|
|
118
|
+
prompt, kwargs = await client.to_native_prompt(messages)
|
|
119
|
+
assert kwargs["system"] == ""
|
|
120
|
+
assert prompt[0]["content"] == [
|
|
121
|
+
{"type": "text", "text": "describe this"},
|
|
122
|
+
{"type": "text", "text": "[image]"},
|
|
123
|
+
]
|
|
124
|
+
|
|
125
|
+
|
|
101
126
|
@pytest.mark.asyncio
|
|
102
127
|
async def test_anthropic_assistant_tool_calls_use_text_chunks_not_model_repr():
|
|
103
128
|
pytest.importorskip("anthropic")
|
|
@@ -216,10 +216,14 @@ def help_test_can_load_env(tmp_venv_dir: Path, env_dir: Path):
|
|
|
216
216
|
|
|
217
217
|
def help_test_can_eval_env(tmp_venv_dir: Path, env_dir: Path):
|
|
218
218
|
"""Test that the environment can be run via vf-eval."""
|
|
219
|
-
if os.getenv("
|
|
220
|
-
|
|
221
|
-
|
|
219
|
+
if env_dir.name == "tau2_bench_v1" and not os.getenv("PRIME_API_KEY"):
|
|
220
|
+
pytest.skip(
|
|
221
|
+
"Skipping tau2 default eval because PRIME_API_KEY is not configured"
|
|
222
|
+
)
|
|
223
|
+
if os.getenv("PRIME_API_KEY"):
|
|
222
224
|
model_flags = "-m openai/gpt-4.1-mini -b https://api.pinference.ai/api/v1 -k PRIME_API_KEY"
|
|
225
|
+
elif os.getenv("OPENAI_API_KEY"):
|
|
226
|
+
model_flags = "-m gpt-4.1-mini -b https://api.openai.com/v1 -k OPENAI_API_KEY"
|
|
223
227
|
else:
|
|
224
228
|
pytest.skip("Skipping vf-eval smoke test because no API key is configured")
|
|
225
229
|
|
|
@@ -288,6 +288,25 @@ def test_cli_headers_table_and_list_merge(monkeypatch, run_cli):
|
|
|
288
288
|
}
|
|
289
289
|
|
|
290
290
|
|
|
291
|
+
def test_cli_defaults_session_header_to_trajectory_id(monkeypatch, run_cli):
|
|
292
|
+
captured = run_cli(monkeypatch, {})
|
|
293
|
+
|
|
294
|
+
assert captured["configs"][0].client_config.extra_headers_from_state == {
|
|
295
|
+
"X-Session-ID": "trajectory_id"
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def test_cli_header_from_state_overrides_default_session_header(monkeypatch, run_cli):
|
|
300
|
+
captured = run_cli(
|
|
301
|
+
monkeypatch,
|
|
302
|
+
{"header_from_state": ["X-Session-ID: example_id"]},
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
assert captured["configs"][0].client_config.extra_headers_from_state == {
|
|
306
|
+
"X-Session-ID": "example_id"
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
|
|
291
310
|
def test_cli_registry_headers_merged_with_eval_toml(tmp_path, monkeypatch, run_cli):
|
|
292
311
|
cfg = tmp_path / "eval.toml"
|
|
293
312
|
cfg.write_text(
|
|
@@ -239,83 +239,20 @@ class TestLaunchCommandResolution:
|
|
|
239
239
|
)
|
|
240
240
|
|
|
241
241
|
|
|
242
|
-
class TestStartStopCommands:
|
|
243
|
-
def test_start_cmd_tracks_process_group_leader_pid(self):
|
|
244
|
-
"""Start command must capture `$!` (the backgrounded pgroup leader),
|
|
245
|
-
not `$$` (the outer shell), and must end with `wait` so the recorded
|
|
246
|
-
exit code reflects the launched daemon's.
|
|
247
|
-
"""
|
|
248
|
-
cmd = _DummyEnv()._mcp_start_cmd("svc", "python -u /opt/x/server.py")
|
|
249
|
-
assert "echo $!" in cmd
|
|
250
|
-
assert "echo $$" not in cmd
|
|
251
|
-
assert cmd.rstrip().endswith("wait")
|
|
252
|
-
assert "/tmp/harbor-mcp-svc.pid" in cmd
|
|
253
|
-
assert "python -u /opt/x/server.py" in cmd
|
|
254
|
-
|
|
255
|
-
def test_start_cmd_wraps_in_setsid_for_process_group_semantics(self):
|
|
256
|
-
"""Wrapping the user's command in `setsid sh -c ...` is what makes
|
|
257
|
-
`$!` a process-group leader, so `kill -9 -$PID` can reap the whole
|
|
258
|
-
daemon tree on stop. Compound commands (e.g. `cd /x && python y.py`)
|
|
259
|
-
must be preserved verbatim inside the sh -c payload so their own
|
|
260
|
-
semantics are unchanged."""
|
|
261
|
-
cmd = _DummyEnv()._mcp_start_cmd("svc", "cd /opt && python server.py")
|
|
262
|
-
assert "setsid sh -c " in cmd
|
|
263
|
-
assert "'cd /opt && python server.py'" in cmd
|
|
264
|
-
|
|
265
|
-
def test_stop_cmd_is_one_line_sigkill_plus_rm(self):
|
|
266
|
-
"""Default: one SIGKILL to the process group, then unlink the
|
|
267
|
-
pidfile — no poll/sleep loop."""
|
|
268
|
-
cmd = _DummyEnv()._mcp_stop_cmd("svc")
|
|
269
|
-
assert "kill -9" in cmd
|
|
270
|
-
assert "rm -f" in cmd
|
|
271
|
-
assert "/tmp/harbor-mcp-svc.pid" in cmd
|
|
272
|
-
assert "kill -0" not in cmd
|
|
273
|
-
assert "sleep" not in cmd
|
|
274
|
-
assert "\n" not in cmd
|
|
275
|
-
assert len(cmd) < 120
|
|
276
|
-
|
|
277
|
-
def test_stop_cmd_targets_process_group_not_single_pid(self):
|
|
278
|
-
"""The `-` prefix on the `$(cat …)` expansion is what turns kill(1)
|
|
279
|
-
into a process-group kill — without it, SIGKILL only lands on the
|
|
280
|
-
wrapping shell and e.g. a `python` child spawned via `cd && python`
|
|
281
|
-
leaks as an orphan."""
|
|
282
|
-
cmd = _DummyEnv()._mcp_stop_cmd("svc")
|
|
283
|
-
assert 'kill -9 -"$(cat' in cmd
|
|
284
|
-
|
|
285
|
-
def test_server_name_with_shell_metachars_is_quoted(self):
|
|
286
|
-
"""Server name is task-author-controlled; every pidfile reference
|
|
287
|
-
must appear only inside single-quoted spans."""
|
|
288
|
-
env = _DummyEnv()
|
|
289
|
-
unquoted = "/tmp/harbor-mcp-evil$(whoami).pid"
|
|
290
|
-
quoted = f"'{unquoted}'"
|
|
291
|
-
for cmd in (
|
|
292
|
-
env._mcp_start_cmd("evil$(whoami)", "x"),
|
|
293
|
-
env._mcp_stop_cmd("evil$(whoami)"),
|
|
294
|
-
):
|
|
295
|
-
assert quoted in cmd
|
|
296
|
-
# Every raw occurrence must be inside an already-quoted span.
|
|
297
|
-
assert cmd.count(unquoted) == cmd.count(quoted)
|
|
298
|
-
|
|
299
|
-
def test_launch_command_with_shell_metachars_is_quoted(self):
|
|
300
|
-
"""Same for the user's launch command: it's task-author-controlled,
|
|
301
|
-
must land inside a single-quoted span once wrapped in `sh -c`."""
|
|
302
|
-
env = _DummyEnv()
|
|
303
|
-
evil_cmd = "python -c 'print(1)' && touch /pwned"
|
|
304
|
-
quoted = f"'{evil_cmd}'".replace("'", "'\"'\"'")
|
|
305
|
-
# shlex-quoted output contains the evil string only inside quotes.
|
|
306
|
-
cmd = env._mcp_start_cmd("svc", evil_cmd)
|
|
307
|
-
assert "setsid sh -c " in cmd
|
|
308
|
-
# No unquoted `&& touch /pwned` outside a single-quoted span.
|
|
309
|
-
assert cmd.count(evil_cmd) == 0 or quoted in cmd
|
|
310
|
-
|
|
311
|
-
|
|
312
242
|
class TestLifecycle:
|
|
313
243
|
@pytest.mark.asyncio
|
|
314
244
|
async def test_starts_server_with_registered_launch_command(self):
|
|
315
|
-
env = _DummyEnv(mcp_launch_commands={"svc": "python server.py"})
|
|
245
|
+
env = _DummyEnv(mcp_launch_commands={"svc": "cd /opt && python server.py"})
|
|
316
246
|
state: dict[str, Any] = {}
|
|
317
247
|
await env.start_mcp_servers("sbx", _config_with_server(), state)
|
|
318
248
|
assert set(state["harbor_mcp_jobs"].keys()) == {"svc"}
|
|
249
|
+
_, start_cmd = env.started_jobs[0]
|
|
250
|
+
assert "echo $!" in start_cmd
|
|
251
|
+
assert "echo $$" not in start_cmd
|
|
252
|
+
assert start_cmd.rstrip().endswith("wait")
|
|
253
|
+
assert "/tmp/harbor-mcp-svc.pid" in start_cmd
|
|
254
|
+
assert "setsid sh -c " in start_cmd
|
|
255
|
+
assert "'cd /opt && python server.py'" in start_cmd
|
|
319
256
|
|
|
320
257
|
@pytest.mark.asyncio
|
|
321
258
|
async def test_externally_managed_server_is_skipped(self):
|
|
@@ -342,9 +279,38 @@ class TestLifecycle:
|
|
|
342
279
|
if "kill -9" in c.args[1]
|
|
343
280
|
]
|
|
344
281
|
assert len(stop_calls) == 1
|
|
345
|
-
|
|
282
|
+
stop_cmd = stop_calls[0]
|
|
283
|
+
assert "harbor-mcp-svc.pid" in stop_cmd
|
|
284
|
+
assert 'kill -9 -"$(cat' in stop_cmd
|
|
285
|
+
assert "rm -f" in stop_cmd
|
|
286
|
+
assert "kill -0" not in stop_cmd
|
|
287
|
+
assert "sleep" not in stop_cmd
|
|
288
|
+
assert "\n" not in stop_cmd
|
|
289
|
+
assert len(stop_cmd) < 120
|
|
346
290
|
assert state["harbor_mcp_jobs"] == {}
|
|
347
291
|
|
|
292
|
+
@pytest.mark.asyncio
|
|
293
|
+
async def test_launch_and_stop_commands_quote_task_authored_shell_text(self):
|
|
294
|
+
env = _DummyEnv(
|
|
295
|
+
mcp_launch_commands={
|
|
296
|
+
"evil$(whoami)": "python -c 'print(1)' && touch /pwned"
|
|
297
|
+
}
|
|
298
|
+
)
|
|
299
|
+
state: dict[str, Any] = {"sandbox_id": "sbx"}
|
|
300
|
+
await env.start_mcp_servers(
|
|
301
|
+
"sbx", _config_with_server(name="evil$(whoami)"), state
|
|
302
|
+
)
|
|
303
|
+
_, start_cmd = env.started_jobs[0]
|
|
304
|
+
quoted_pidfile = "'/tmp/harbor-mcp-evil$(whoami).pid'"
|
|
305
|
+
assert quoted_pidfile in start_cmd
|
|
306
|
+
assert "setsid sh -c " in start_cmd
|
|
307
|
+
assert "'\"'\"'print(1)'\"'\"'" in start_cmd
|
|
308
|
+
|
|
309
|
+
env.sandbox_client.execute_command.reset_mock()
|
|
310
|
+
await env.stop_mcp_servers(state)
|
|
311
|
+
stop_cmd = env.sandbox_client.execute_command.call_args.args[1]
|
|
312
|
+
assert quoted_pidfile in stop_cmd
|
|
313
|
+
|
|
348
314
|
@pytest.mark.asyncio
|
|
349
315
|
async def test_stop_without_sandbox_id_is_a_noop(self):
|
|
350
316
|
env = _DummyEnv()
|
|
@@ -530,22 +496,6 @@ class TestBackgroundJob:
|
|
|
530
496
|
class TestHealthCheck:
|
|
531
497
|
"""Readiness probing — default `/proc/net/tcp` + user override."""
|
|
532
498
|
|
|
533
|
-
def test_default_probe_shape(self):
|
|
534
|
-
"""Portable awk on /proc/net/tcp{,6}, matching LISTEN state only,
|
|
535
|
-
with no bash-ism dependency like /dev/tcp."""
|
|
536
|
-
cmd = HarborMCPMixin._default_mcp_health_cmd(8000)
|
|
537
|
-
assert "bash" not in cmd and "/dev/tcp" not in cmd
|
|
538
|
-
assert "/proc/net/tcp" in cmd and "/proc/net/tcp6" in cmd
|
|
539
|
-
assert '$4 == "0A"' in cmd # LISTEN state
|
|
540
|
-
|
|
541
|
-
@pytest.mark.parametrize(
|
|
542
|
-
"port,hex_expected",
|
|
543
|
-
[(80, "0050"), (8000, "1F40"), (65535, "FFFF"), (1, "0001")],
|
|
544
|
-
)
|
|
545
|
-
def test_default_probe_encodes_port_as_uppercase_hex(self, port, hex_expected):
|
|
546
|
-
cmd = HarborMCPMixin._default_mcp_health_cmd(port)
|
|
547
|
-
assert f":{hex_expected}$" in cmd
|
|
548
|
-
|
|
549
499
|
@pytest.mark.asyncio
|
|
550
500
|
async def test_custom_healthcheck_command_templated_with_port(self):
|
|
551
501
|
env = _DummyEnv(mcp_launch_commands={"svc": "python x"})
|
|
@@ -580,7 +530,11 @@ class TestHealthCheck:
|
|
|
580
530
|
if "/proc/net/tcp" in c.args[1]
|
|
581
531
|
]
|
|
582
532
|
assert len(health_calls) == 1
|
|
583
|
-
|
|
533
|
+
health_cmd = health_calls[0]
|
|
534
|
+
assert "bash" not in health_cmd and "/dev/tcp" not in health_cmd
|
|
535
|
+
assert "/proc/net/tcp6" in health_cmd
|
|
536
|
+
assert '$4 == "0A"' in health_cmd
|
|
537
|
+
assert ":1F40$" in health_cmd
|
|
584
538
|
|
|
585
539
|
@pytest.mark.asyncio
|
|
586
540
|
async def test_probe_timeout_is_respected(self):
|
|
@@ -1,6 +1,26 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import sys
|
|
3
|
+
|
|
1
4
|
import verifiers
|
|
2
5
|
|
|
3
6
|
|
|
7
|
+
def test_v1_taskset_imports_do_not_import_textarena():
|
|
8
|
+
textarena_module = "verifiers.v1.packages.tasksets.textarena"
|
|
9
|
+
sys.modules.pop(textarena_module, None)
|
|
10
|
+
|
|
11
|
+
tasksets = importlib.import_module("verifiers.v1.packages.tasksets")
|
|
12
|
+
tasksets.__dict__.pop("TextArenaTaskset", None)
|
|
13
|
+
tasksets.__dict__.pop("TextArenaTasksetConfig", None)
|
|
14
|
+
importlib.reload(tasksets)
|
|
15
|
+
assert textarena_module not in sys.modules
|
|
16
|
+
|
|
17
|
+
v1 = importlib.import_module("verifiers.v1")
|
|
18
|
+
v1.__dict__.pop("TextArenaTaskset", None)
|
|
19
|
+
v1.__dict__.pop("TextArenaTasksetConfig", None)
|
|
20
|
+
importlib.reload(v1)
|
|
21
|
+
assert textarena_module not in sys.modules
|
|
22
|
+
|
|
23
|
+
|
|
4
24
|
class TestImports:
|
|
5
25
|
"""Test that all public API imports work correctly.
|
|
6
26
|
This was inspired by issue #349.
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import verifiers as vf
|
|
4
|
+
from verifiers.scripts.init import init_environment
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def read_env_file(root: Path, env_id: str) -> str:
|
|
8
|
+
module_name = env_id.replace("-", "_")
|
|
9
|
+
return (root / module_name / f"{module_name}.py").read_text()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_init_default_writes_v0_stub(tmp_path: Path) -> None:
|
|
13
|
+
root = init_environment("foo", path=str(tmp_path))
|
|
14
|
+
content = read_env_file(tmp_path, "foo")
|
|
15
|
+
|
|
16
|
+
assert root == tmp_path / "foo"
|
|
17
|
+
assert "def load_environment(**kwargs) -> vf.Environment:" in content
|
|
18
|
+
assert "NotImplementedError" in content
|
|
19
|
+
assert "load_taskset" not in content
|
|
20
|
+
assert "EnvTaskset" not in content
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_init_v1_writes_thin_taskset_template(tmp_path: Path) -> None:
|
|
24
|
+
init_environment("bar", path=str(tmp_path), v1=True)
|
|
25
|
+
content = read_env_file(tmp_path, "bar")
|
|
26
|
+
|
|
27
|
+
assert 'ENV_ID = "bar"' in content
|
|
28
|
+
assert "def load_tasks():" in content
|
|
29
|
+
assert "class EnvTasksetConfig(vf.TasksetConfig):" in content
|
|
30
|
+
assert 'source: str = "bar:load_tasks"' in content
|
|
31
|
+
assert 'rewards: list[str] = ["bar:exact_answer"]' in content
|
|
32
|
+
assert "def load_taskset(config: EnvTasksetConfig) -> vf.Taskset:" in content
|
|
33
|
+
assert "vf.load_taskset(ENV_ID, config=config.taskset)" in content
|
|
34
|
+
assert "class EnvTaskset(" not in content
|
|
35
|
+
assert "_default_" not in content
|
|
36
|
+
assert "assert isinstance" not in content
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_init_v1_template_loads_with_vf_load_environment(
|
|
40
|
+
tmp_path: Path, monkeypatch
|
|
41
|
+
) -> None:
|
|
42
|
+
init_environment("loadable-v1", path=str(tmp_path), v1=True)
|
|
43
|
+
monkeypatch.syspath_prepend(str(tmp_path / "loadable_v1"))
|
|
44
|
+
|
|
45
|
+
env = vf.load_environment("loadable-v1")
|
|
46
|
+
|
|
47
|
+
assert isinstance(env, vf.Env)
|
|
48
|
+
assert env.taskset.rows()[0]["answer"] == "cba"
|
|
49
|
+
assert env.taskset.rewards[0].__name__ == "exact_answer"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_init_v1_with_harness_writes_harness_stub(tmp_path: Path) -> None:
|
|
53
|
+
init_environment("baz", path=str(tmp_path), v1=True, with_harness=True)
|
|
54
|
+
content = read_env_file(tmp_path, "baz")
|
|
55
|
+
|
|
56
|
+
assert "class EnvHarnessConfig(vf.HarnessConfig):" in content
|
|
57
|
+
assert "class EnvHarness(vf.Harness):" in content
|
|
58
|
+
assert "def load_harness(config: EnvHarnessConfig) -> EnvHarness:" in content
|
|
59
|
+
assert "vf.load_harness(ENV_ID, config=config.harness)" in content
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_init_with_harness_without_v1_warns_and_uses_v0(tmp_path: Path, capsys) -> None:
|
|
63
|
+
init_environment("plain", path=str(tmp_path), with_harness=True)
|
|
64
|
+
content = read_env_file(tmp_path, "plain")
|
|
65
|
+
captured = capsys.readouterr()
|
|
66
|
+
|
|
67
|
+
assert "--with-harness only applies with --v1; ignoring." in captured.out
|
|
68
|
+
assert "def load_environment(**kwargs) -> vf.Environment:" in content
|
|
69
|
+
assert "load_harness" not in content
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_init_v1_multifile_exports_component_loaders(tmp_path: Path) -> None:
|
|
73
|
+
init_environment("pkg-env", path=str(tmp_path), v1=True, multi_file=True)
|
|
74
|
+
package_dir = tmp_path / "pkg_env" / "pkg_env"
|
|
75
|
+
init_content = (package_dir / "__init__.py").read_text()
|
|
76
|
+
env_content = (package_dir / "pkg_env.py").read_text()
|
|
77
|
+
|
|
78
|
+
assert "from .pkg_env import load_environment, load_taskset" in init_content
|
|
79
|
+
assert "__all__ = ['load_environment', 'load_taskset']" in init_content
|
|
80
|
+
assert 'source: str = "pkg_env.pkg_env:load_tasks"' in env_content
|
|
@@ -9,10 +9,8 @@ from verifiers.envs.experimental.composable.tasksets.lean.lean_task import (
|
|
|
9
9
|
LEAN_GUARD_END_MARKER,
|
|
10
10
|
LeanRubric,
|
|
11
11
|
_build_starter_file,
|
|
12
|
-
_expected_protected_region,
|
|
13
12
|
_extract_protected_region,
|
|
14
13
|
_normalize_signature,
|
|
15
|
-
_wrap_with_lean_guard,
|
|
16
14
|
)
|
|
17
15
|
|
|
18
16
|
|
|
@@ -80,11 +78,13 @@ class TestNormalizeSignature:
|
|
|
80
78
|
)
|
|
81
79
|
|
|
82
80
|
|
|
83
|
-
class TestWrapWithLeanGuard:
|
|
81
|
+
class TestBuildStarterFileLeanGuardLayout:
|
|
84
82
|
def test_marker_layout(self) -> None:
|
|
85
83
|
signature = "theorem foo (x : ℝ) : x = x := by"
|
|
86
|
-
|
|
87
|
-
|
|
84
|
+
starter = _build_starter_file(
|
|
85
|
+
{"formal_statement": signature, "header": "", "imports": ""}
|
|
86
|
+
)
|
|
87
|
+
assert starter == (
|
|
88
88
|
"-- lean-guard: begin protected\n"
|
|
89
89
|
"theorem foo (x : ℝ) : x = x := by\n"
|
|
90
90
|
"-- lean-guard: end protected\n"
|
|
@@ -93,8 +93,10 @@ class TestWrapWithLeanGuard:
|
|
|
93
93
|
|
|
94
94
|
def test_round_trip_via_extract(self) -> None:
|
|
95
95
|
signature = "theorem foo : True := by"
|
|
96
|
-
|
|
97
|
-
|
|
96
|
+
starter = _build_starter_file(
|
|
97
|
+
{"formal_statement": signature, "header": "", "imports": ""}
|
|
98
|
+
)
|
|
99
|
+
region = _extract_protected_region(starter)
|
|
98
100
|
assert region is not None
|
|
99
101
|
assert LEAN_GUARD_BEGIN_MARKER in region
|
|
100
102
|
assert LEAN_GUARD_END_MARKER in region
|
|
@@ -212,7 +214,7 @@ class TestBuildStarterFile:
|
|
|
212
214
|
"header": "import Mathlib",
|
|
213
215
|
}
|
|
214
216
|
starter = _build_starter_file(info)
|
|
215
|
-
expected =
|
|
217
|
+
expected = _extract_protected_region(_build_starter_file(info)) or ""
|
|
216
218
|
actual = _extract_protected_region(starter)
|
|
217
219
|
assert expected == actual
|
|
218
220
|
assert expected != ""
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Tests for the OpenCodeRLMEnv class."""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import json
|
|
4
5
|
import subprocess
|
|
5
6
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
@@ -7,6 +8,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
7
8
|
import pytest
|
|
8
9
|
from datasets import Dataset
|
|
9
10
|
|
|
11
|
+
import verifiers as vf
|
|
10
12
|
from verifiers.envs.experimental.opencode_rlm_env import (
|
|
11
13
|
OpenCodeRLMEnv,
|
|
12
14
|
OpenCodeRLMMonitorRubric,
|
|
@@ -239,45 +241,6 @@ class TestBuildEnvVars:
|
|
|
239
241
|
assert "RLM_SUB_MODEL_ID" not in env_vars
|
|
240
242
|
|
|
241
243
|
|
|
242
|
-
# =============================================================================
|
|
243
|
-
# Sub-LLM detection (header-based)
|
|
244
|
-
# =============================================================================
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
class TestIsSubLLMRequest:
|
|
248
|
-
def test_detects_sub_header(self):
|
|
249
|
-
assert (
|
|
250
|
-
OpenCodeRLMEnv._is_sub_llm_request({"headers": {"x-rlm-role": "sub"}})
|
|
251
|
-
is True
|
|
252
|
-
)
|
|
253
|
-
|
|
254
|
-
def test_rejects_no_headers(self):
|
|
255
|
-
assert OpenCodeRLMEnv._is_sub_llm_request({}) is False
|
|
256
|
-
|
|
257
|
-
def test_rejects_empty_headers(self):
|
|
258
|
-
assert OpenCodeRLMEnv._is_sub_llm_request({"headers": {}}) is False
|
|
259
|
-
|
|
260
|
-
def test_rejects_wrong_value(self):
|
|
261
|
-
assert (
|
|
262
|
-
OpenCodeRLMEnv._is_sub_llm_request({"headers": {"x-rlm-role": "main"}})
|
|
263
|
-
is False
|
|
264
|
-
)
|
|
265
|
-
|
|
266
|
-
def test_ignores_model_field(self):
|
|
267
|
-
"""Model name should NOT be used for detection."""
|
|
268
|
-
assert (
|
|
269
|
-
OpenCodeRLMEnv._is_sub_llm_request({"model": "sub", "headers": {}}) is False
|
|
270
|
-
)
|
|
271
|
-
|
|
272
|
-
def test_header_takes_precedence(self):
|
|
273
|
-
assert (
|
|
274
|
-
OpenCodeRLMEnv._is_sub_llm_request(
|
|
275
|
-
{"model": "openai/gpt-5-mini", "headers": {"x-rlm-role": "sub"}}
|
|
276
|
-
)
|
|
277
|
-
is True
|
|
278
|
-
)
|
|
279
|
-
|
|
280
|
-
|
|
281
244
|
# =============================================================================
|
|
282
245
|
# State setup
|
|
283
246
|
# =============================================================================
|
|
@@ -330,17 +293,45 @@ class TestMetrics:
|
|
|
330
293
|
response = MagicMock(spec=[]) # no usage attr
|
|
331
294
|
assert OpenCodeRLMEnv._extract_token_counts(response) == (0, 0)
|
|
332
295
|
|
|
333
|
-
|
|
296
|
+
@pytest.mark.asyncio
|
|
297
|
+
async def test_handle_sub_llm_request_updates_sub_metrics(self):
|
|
334
298
|
env = build_env()
|
|
335
299
|
state = {
|
|
300
|
+
"trajectory": [],
|
|
301
|
+
"model": "main-model",
|
|
336
302
|
"sub_llm_turns": 0,
|
|
337
303
|
"sub_llm_prompt_tokens": 0,
|
|
338
304
|
"sub_llm_completion_tokens": 0,
|
|
339
305
|
}
|
|
340
|
-
response =
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
306
|
+
response = vf.Response(
|
|
307
|
+
id="resp",
|
|
308
|
+
created=0,
|
|
309
|
+
model="sub-model",
|
|
310
|
+
message=vf.ResponseMessage(
|
|
311
|
+
content="ok", finish_reason="stop", is_truncated=False
|
|
312
|
+
),
|
|
313
|
+
usage=vf.Usage(
|
|
314
|
+
prompt_tokens=50,
|
|
315
|
+
completion_tokens=20,
|
|
316
|
+
reasoning_tokens=0,
|
|
317
|
+
total_tokens=70,
|
|
318
|
+
),
|
|
319
|
+
)
|
|
320
|
+
future = asyncio.get_running_loop().create_future()
|
|
321
|
+
intercept = {
|
|
322
|
+
"messages": [{"role": "user", "content": "hello"}],
|
|
323
|
+
"headers": {"x-rlm-role": "sub"},
|
|
324
|
+
"response_future": future,
|
|
325
|
+
}
|
|
326
|
+
env._require_interception_server().intercepts["req"] = intercept
|
|
327
|
+
with patch.object(
|
|
328
|
+
vf.Environment,
|
|
329
|
+
"get_model_response",
|
|
330
|
+
new=AsyncMock(return_value=response),
|
|
331
|
+
):
|
|
332
|
+
await env._handle_sub_llm_request(state, "req", intercept)
|
|
333
|
+
|
|
334
|
+
assert future.result() is response
|
|
344
335
|
assert state["sub_llm_turns"] == 1
|
|
345
336
|
assert state["sub_llm_prompt_tokens"] == 50
|
|
346
337
|
assert state["sub_llm_completion_tokens"] == 20
|