verifiers 0.1.13.dev6__tar.gz → 0.1.13.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/PKG-INFO +1 -1
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/conftest.py +1 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_cli_agent_env.py +24 -11
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_environment.py +1 -4
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_environment_extra.py +2 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_eval_cli.py +51 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_math_rubric.py +85 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_multiturn_env.py +81 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_opencode_rlm_env.py +7 -9
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_rlm_composable_env.py +331 -38
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_rlm_env.py +71 -71
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_rubric_group.py +84 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/__init__.py +1 -1
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/openai_chat_completions_token_client.py +14 -1
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/environment.py +4 -6
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/cli_agent_env.py +24 -35
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/composable_env.py +101 -25
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harness.py +33 -15
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harnesses/opencode.py +6 -0
- verifiers-0.1.13.dev8/verifiers/envs/experimental/composable/harnesses/rlm.py +281 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/task.py +40 -20
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -1
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -1
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +3 -3
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +3 -3
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +3 -3
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +11 -8
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +2 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +2 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +2 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/opencode_rlm_env.py +2 -3
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/rlm_env.py +3 -5
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/sandbox_mixin.py +51 -1
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/utils/git_checkout_cache.py +45 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/browser_env.py +3 -3
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/modes/base.py +2 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +1 -3
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +1 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/openenv_env.py +2 -3
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/textarena_env.py +7 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/multiturn_env.py +31 -11
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/python_env.py +2 -3
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/sandbox_env.py +2 -2
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/math_rubric.py +2 -1
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/rubric_group.py +13 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/eval.py +11 -1
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/types.py +1 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/eval_utils.py +1 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/logging_utils.py +18 -0
- verifiers-0.1.13.dev6/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -186
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/.gitignore +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/LICENSE +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/README.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/pyproject.toml +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/AGENTS.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/README.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_build_script.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_client_config.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_env_group.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_env_server.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_envs.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_imports.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_logging.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_parser.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_rubric.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/openai_chat_completions_client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/decorators.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/errors.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/prime_rl.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/interception_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/save_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/version_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.13.dev6
|
|
3
|
+
Version: 0.1.13.dev8
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -425,10 +425,9 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
|
|
|
425
425
|
super().__init__(tools=[offset_tool], **kwargs)
|
|
426
426
|
|
|
427
427
|
async def setup_state(self, state, **kwargs):
|
|
428
|
-
|
|
428
|
+
await super().setup_state(state, **kwargs)
|
|
429
429
|
state["offset"] = 3
|
|
430
430
|
state["update_calls"] = 0
|
|
431
|
-
return state
|
|
432
431
|
|
|
433
432
|
def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
|
|
434
433
|
state["update_calls"] += 1
|
|
@@ -62,7 +62,8 @@ class TestCliAgentEnv:
|
|
|
62
62
|
assert env.run_command == "python agent.py"
|
|
63
63
|
assert env.docker_image == "python:3.11-slim"
|
|
64
64
|
assert env.interception_port == 8765
|
|
65
|
-
assert env.timeout_seconds
|
|
65
|
+
assert env.timeout_seconds is None
|
|
66
|
+
assert env.sandbox_timeout_minutes is None
|
|
66
67
|
|
|
67
68
|
def test_init_custom_config(self, sample_dataset):
|
|
68
69
|
"""Test initialization with custom configuration."""
|
|
@@ -130,22 +131,34 @@ class TestCliAgentEnv:
|
|
|
130
131
|
state = {"agent_completed": True}
|
|
131
132
|
assert await env.agent_completed(state) is True
|
|
132
133
|
|
|
133
|
-
@pytest.mark.
|
|
134
|
-
|
|
135
|
-
|
|
134
|
+
@pytest.mark.parametrize(
|
|
135
|
+
"timeout_seconds,expected_minutes",
|
|
136
|
+
[
|
|
137
|
+
(None, 24 * 60), # no rollout cap → SDK ceiling
|
|
138
|
+
(600.0, 10 + 60), # finite → ceil + scoring buffer
|
|
139
|
+
(24 * 3600.0, 24 * 60), # buffer would overflow → clamped to ceiling
|
|
140
|
+
],
|
|
141
|
+
)
|
|
142
|
+
def test_sandbox_timeout_auto_derived(
|
|
143
|
+
self, sample_dataset, timeout_seconds, expected_minutes
|
|
144
|
+
):
|
|
136
145
|
env = vf.CliAgentEnv(
|
|
137
146
|
run_command="python agent.py",
|
|
138
147
|
dataset=sample_dataset,
|
|
139
148
|
rubric=vf.Rubric(),
|
|
140
|
-
timeout_seconds=
|
|
149
|
+
timeout_seconds=timeout_seconds,
|
|
141
150
|
)
|
|
142
|
-
|
|
151
|
+
assert env.get_sandbox_resources({})["timeout_minutes"] == expected_minutes
|
|
143
152
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
153
|
+
def test_sandbox_timeout_explicit_override(self, sample_dataset):
|
|
154
|
+
env = vf.CliAgentEnv(
|
|
155
|
+
run_command="python agent.py",
|
|
156
|
+
dataset=sample_dataset,
|
|
157
|
+
rubric=vf.Rubric(),
|
|
158
|
+
timeout_seconds=600.0,
|
|
159
|
+
sandbox_timeout_minutes=30,
|
|
160
|
+
)
|
|
161
|
+
assert env.get_sandbox_resources({})["timeout_minutes"] == 30
|
|
149
162
|
|
|
150
163
|
@pytest.mark.asyncio
|
|
151
164
|
async def test_env_response_returns_empty(self, sample_dataset):
|
|
@@ -26,7 +26,6 @@ class SimpleEnvironment(Environment):
|
|
|
26
26
|
|
|
27
27
|
async def setup_state(self, state):
|
|
28
28
|
"""Setup state for SimpleEnvironment."""
|
|
29
|
-
return state
|
|
30
29
|
|
|
31
30
|
async def rollout(
|
|
32
31
|
self,
|
|
@@ -38,7 +37,7 @@ class SimpleEnvironment(Environment):
|
|
|
38
37
|
"""Simple test rollout implementation."""
|
|
39
38
|
state = await self.init_state(input, client=client, model=model)
|
|
40
39
|
try:
|
|
41
|
-
|
|
40
|
+
await self.setup_state(state)
|
|
42
41
|
|
|
43
42
|
prompt_messages = state["prompt"]
|
|
44
43
|
response = await self.get_model_response(state, prompt_messages)
|
|
@@ -551,8 +550,6 @@ class RetryCounterEnv(SimpleEnvironment):
|
|
|
551
550
|
f"Simulated failure {self.call_counts[example_id]}/{self.fail_count}"
|
|
552
551
|
)
|
|
553
552
|
|
|
554
|
-
return state
|
|
555
|
-
|
|
556
553
|
|
|
557
554
|
class TestMaybeRetry:
|
|
558
555
|
"""Test cases for maybe_retry functionality in Environment.generate()."""
|
|
@@ -40,7 +40,7 @@ from verifiers.utils.save_utils import state_to_output
|
|
|
40
40
|
# Local simple concrete Environment for testing
|
|
41
41
|
class DummyEnvironment(Environment):
|
|
42
42
|
async def setup_state(self, state):
|
|
43
|
-
|
|
43
|
+
pass
|
|
44
44
|
|
|
45
45
|
async def rollout(
|
|
46
46
|
self,
|
|
@@ -52,7 +52,7 @@ class DummyEnvironment(Environment):
|
|
|
52
52
|
state = await self.init_state(
|
|
53
53
|
input, client=client, model=model, sampling_args=sampling_args
|
|
54
54
|
)
|
|
55
|
-
|
|
55
|
+
await self.setup_state(state)
|
|
56
56
|
|
|
57
57
|
prompt_messages = state["prompt"]
|
|
58
58
|
response = await self.get_model_response(state=state, prompt=prompt_messages)
|
|
@@ -232,6 +232,36 @@ def test_cli_temperature_not_added_when_none(monkeypatch, run_cli):
|
|
|
232
232
|
assert "temperature" not in sa
|
|
233
233
|
|
|
234
234
|
|
|
235
|
+
def test_cli_extra_env_kwargs_support_timeout_seconds(monkeypatch, run_cli):
|
|
236
|
+
captured = run_cli(
|
|
237
|
+
monkeypatch,
|
|
238
|
+
{
|
|
239
|
+
"extra_env_kwargs": {"timeout_seconds": 30, "foo": "bar"},
|
|
240
|
+
},
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
assert captured["configs"][0].extra_env_kwargs == {
|
|
244
|
+
"timeout_seconds": 30,
|
|
245
|
+
"foo": "bar",
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def test_cli_timeout_flag_overrides_extra_env_kwargs(monkeypatch, run_cli):
|
|
250
|
+
"""--timeout wins over timeout_seconds in --extra-env-kwargs."""
|
|
251
|
+
captured = run_cli(
|
|
252
|
+
monkeypatch,
|
|
253
|
+
{
|
|
254
|
+
"extra_env_kwargs": {"timeout_seconds": 30, "foo": "bar"},
|
|
255
|
+
"timeout": 600,
|
|
256
|
+
},
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
assert captured["configs"][0].extra_env_kwargs == {
|
|
260
|
+
"timeout_seconds": 600,
|
|
261
|
+
"foo": "bar",
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
|
|
235
265
|
def test_cli_headers_table_and_list_merge(monkeypatch, run_cli):
|
|
236
266
|
captured = run_cli(
|
|
237
267
|
monkeypatch,
|
|
@@ -874,6 +904,27 @@ def test_load_toml_config_global_values_with_per_eval_override():
|
|
|
874
904
|
assert result[1]["num_examples"] == 50 # per-eval override
|
|
875
905
|
|
|
876
906
|
|
|
907
|
+
def test_load_toml_config_with_extra_env_kwargs():
|
|
908
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
909
|
+
f.write(
|
|
910
|
+
'[[eval]]\nenv_id = "env1"\n[eval.extra_env_kwargs]\ntimeout_seconds = 600\n'
|
|
911
|
+
)
|
|
912
|
+
f.flush()
|
|
913
|
+
result = load_toml_config(Path(f.name))
|
|
914
|
+
|
|
915
|
+
assert result[0]["extra_env_kwargs"] == {"timeout_seconds": 600}
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def test_load_toml_config_with_top_level_timeout():
|
|
919
|
+
"""Top-level `timeout` is a recognized field on [[eval]] tables."""
|
|
920
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
921
|
+
f.write('[[eval]]\nenv_id = "env1"\ntimeout = 600\n')
|
|
922
|
+
f.flush()
|
|
923
|
+
result = load_toml_config(Path(f.name))
|
|
924
|
+
|
|
925
|
+
assert result[0]["timeout"] == 600
|
|
926
|
+
|
|
927
|
+
|
|
877
928
|
def test_load_toml_config_invalid_global_field():
|
|
878
929
|
"""Invalid global field raises ValueError."""
|
|
879
930
|
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
"""Tests for the MathRubric class."""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
4
|
+
|
|
3
5
|
import pytest
|
|
4
6
|
|
|
5
7
|
import verifiers as vf
|
|
8
|
+
from verifiers.rubrics import math_rubric
|
|
6
9
|
|
|
7
10
|
|
|
8
11
|
class TestMathRubric:
|
|
@@ -127,3 +130,85 @@ class TestMathRubric:
|
|
|
127
130
|
assert state["metrics"]["correct_answer"] == 1.0
|
|
128
131
|
else:
|
|
129
132
|
assert state["metrics"]["correct_answer"] == 0.0
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class TestVerifyResponseExceptionHandling:
|
|
136
|
+
"""Regression tests for the exception handling in verify_response.
|
|
137
|
+
|
|
138
|
+
See commit narrowing ``except BaseException`` to
|
|
139
|
+
``except (Exception, MathVerifyTimeout)`` so that ``CancelledError``,
|
|
140
|
+
``KeyboardInterrupt``, and ``SystemExit`` propagate instead of being
|
|
141
|
+
silently reported as a 0.0 score.
|
|
142
|
+
"""
|
|
143
|
+
|
|
144
|
+
def test_cancellederror_propagates(self, monkeypatch):
|
|
145
|
+
"""CancelledError raised during math_verify must propagate, not
|
|
146
|
+
get swallowed and reported as a score of 0.0."""
|
|
147
|
+
|
|
148
|
+
def raise_cancelled(*args, **kwargs):
|
|
149
|
+
raise asyncio.CancelledError
|
|
150
|
+
|
|
151
|
+
monkeypatch.setattr(math_rubric, "parse", raise_cancelled)
|
|
152
|
+
|
|
153
|
+
with pytest.raises(asyncio.CancelledError):
|
|
154
|
+
math_rubric.verify_response(
|
|
155
|
+
response="\\boxed{1}",
|
|
156
|
+
answer="1",
|
|
157
|
+
max_verify_chars=50_000,
|
|
158
|
+
timeout_seconds=5,
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
def test_keyboardinterrupt_propagates(self, monkeypatch):
|
|
162
|
+
"""KeyboardInterrupt must propagate so Ctrl-C still works during
|
|
163
|
+
scoring."""
|
|
164
|
+
|
|
165
|
+
def raise_kbd(*args, **kwargs):
|
|
166
|
+
raise KeyboardInterrupt
|
|
167
|
+
|
|
168
|
+
monkeypatch.setattr(math_rubric, "parse", raise_kbd)
|
|
169
|
+
|
|
170
|
+
with pytest.raises(KeyboardInterrupt):
|
|
171
|
+
math_rubric.verify_response(
|
|
172
|
+
response="\\boxed{1}",
|
|
173
|
+
answer="1",
|
|
174
|
+
max_verify_chars=50_000,
|
|
175
|
+
timeout_seconds=5,
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
def test_math_verify_timeout_returns_zero(self, monkeypatch):
|
|
179
|
+
"""A real math_verify.errors.TimeoutException (which inherits from
|
|
180
|
+
BaseException, not Exception) must still be caught and reported as
|
|
181
|
+
a 0.0 score — that's why the catch is wider than just Exception."""
|
|
182
|
+
from math_verify.errors import TimeoutException
|
|
183
|
+
|
|
184
|
+
def raise_timeout(*args, **kwargs):
|
|
185
|
+
raise TimeoutException("simulated math_verify timeout")
|
|
186
|
+
|
|
187
|
+
monkeypatch.setattr(math_rubric, "parse", raise_timeout)
|
|
188
|
+
|
|
189
|
+
score, elapsed = math_rubric.verify_response(
|
|
190
|
+
response="\\boxed{1}",
|
|
191
|
+
answer="1",
|
|
192
|
+
max_verify_chars=50_000,
|
|
193
|
+
timeout_seconds=5,
|
|
194
|
+
)
|
|
195
|
+
assert score == 0.0
|
|
196
|
+
assert elapsed >= 0.0
|
|
197
|
+
|
|
198
|
+
def test_regular_exception_returns_zero(self, monkeypatch):
|
|
199
|
+
"""A regular Exception from math_verify should continue to be
|
|
200
|
+
swallowed and reported as 0.0 (library-raised something weird)."""
|
|
201
|
+
|
|
202
|
+
def raise_exc(*args, **kwargs):
|
|
203
|
+
raise ValueError("simulated parse failure")
|
|
204
|
+
|
|
205
|
+
monkeypatch.setattr(math_rubric, "parse", raise_exc)
|
|
206
|
+
|
|
207
|
+
score, elapsed = math_rubric.verify_response(
|
|
208
|
+
response="\\boxed{1}",
|
|
209
|
+
answer="1",
|
|
210
|
+
max_verify_chars=50_000,
|
|
211
|
+
timeout_seconds=5,
|
|
212
|
+
)
|
|
213
|
+
assert score == 0.0
|
|
214
|
+
assert elapsed >= 0.0
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""Tests for the MultiTurnEnv class."""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
4
|
+
|
|
3
5
|
import pytest
|
|
4
6
|
from datasets import Dataset
|
|
5
7
|
|
|
@@ -12,6 +14,7 @@ class TestMultiTurnEnv:
|
|
|
12
14
|
def test_multiturn_env_initialization(self, mock_multiturn_env):
|
|
13
15
|
"""Test MultiTurnEnv initialization."""
|
|
14
16
|
assert mock_multiturn_env.max_turns == 3
|
|
17
|
+
assert mock_multiturn_env.timeout_seconds is None
|
|
15
18
|
assert mock_multiturn_env.message_type == "chat" # Default from parent
|
|
16
19
|
|
|
17
20
|
def test_multiturn_env_default_max_turns(self, mock_client, sample_chat_dataset):
|
|
@@ -26,6 +29,7 @@ class TestMultiTurnEnv:
|
|
|
26
29
|
rubric=Rubric(),
|
|
27
30
|
)
|
|
28
31
|
assert env.max_turns == -1 # Default value
|
|
32
|
+
assert env.timeout_seconds is None
|
|
29
33
|
|
|
30
34
|
@pytest.mark.asyncio
|
|
31
35
|
async def test_basic_multiturn_rollout(self, mock_multiturn_env, make_input):
|
|
@@ -103,6 +107,83 @@ class TestMultiTurnEnv:
|
|
|
103
107
|
assert completion[1]["role"] == "user"
|
|
104
108
|
assert completion[2]["role"] == "assistant"
|
|
105
109
|
|
|
110
|
+
@pytest.mark.asyncio
|
|
111
|
+
async def test_timeout_seconds_limits_rollout(
|
|
112
|
+
self, mock_client, sample_chat_dataset, make_input
|
|
113
|
+
):
|
|
114
|
+
"""Test that rollout stops when the wall-clock timeout is reached."""
|
|
115
|
+
|
|
116
|
+
class SlowMultiTurnEnv(MultiTurnEnv):
|
|
117
|
+
async def env_response(self, messages, state, **kwargs): # type: ignore[override]
|
|
118
|
+
return [{"role": "user", "content": "Continue"}]
|
|
119
|
+
|
|
120
|
+
async def add_model_response(self, state, prompt_messages, response): # type: ignore[override]
|
|
121
|
+
await super().add_model_response(state, prompt_messages, response)
|
|
122
|
+
await asyncio.sleep(0.05)
|
|
123
|
+
|
|
124
|
+
env = SlowMultiTurnEnv(
|
|
125
|
+
client=mock_client,
|
|
126
|
+
model="test-model",
|
|
127
|
+
dataset=sample_chat_dataset,
|
|
128
|
+
parser=Parser(),
|
|
129
|
+
rubric=Rubric(),
|
|
130
|
+
timeout_seconds=0.01,
|
|
131
|
+
)
|
|
132
|
+
mock_client.set_default_response("Still going")
|
|
133
|
+
|
|
134
|
+
prompt = [{"role": "user", "content": "Start conversation"}]
|
|
135
|
+
state = await env.rollout(
|
|
136
|
+
input=make_input(prompt=prompt, answer="target_answer"),
|
|
137
|
+
client=mock_client,
|
|
138
|
+
model="test-model",
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
assert len(state["trajectory"]) == 1
|
|
142
|
+
assert state["timed_out"] is True
|
|
143
|
+
assert state["is_completed"] is True
|
|
144
|
+
assert state["stop_condition"] == "timeout_reached"
|
|
145
|
+
completion = state["completion"]
|
|
146
|
+
assert len(completion) == 1
|
|
147
|
+
assert completion[0]["role"] == "assistant"
|
|
148
|
+
assert completion[0]["content"] == "Still going"
|
|
149
|
+
|
|
150
|
+
@pytest.mark.asyncio
|
|
151
|
+
async def test_timeout_seconds_limits_setup(
|
|
152
|
+
self, mock_client, sample_chat_dataset, make_input
|
|
153
|
+
):
|
|
154
|
+
"""Test that the rollout timeout applies while setup is in flight."""
|
|
155
|
+
|
|
156
|
+
class SlowSetupEnv(MultiTurnEnv):
|
|
157
|
+
async def setup_state(self, state): # type: ignore[override]
|
|
158
|
+
await asyncio.sleep(1)
|
|
159
|
+
|
|
160
|
+
async def env_response(self, messages, state, **kwargs): # type: ignore[override]
|
|
161
|
+
return [{"role": "user", "content": "Continue"}]
|
|
162
|
+
|
|
163
|
+
env = SlowSetupEnv(
|
|
164
|
+
client=mock_client,
|
|
165
|
+
model="test-model",
|
|
166
|
+
dataset=sample_chat_dataset,
|
|
167
|
+
parser=Parser(),
|
|
168
|
+
rubric=Rubric(),
|
|
169
|
+
timeout_seconds=0.01,
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
state = await env.rollout(
|
|
173
|
+
input=make_input(
|
|
174
|
+
prompt=[{"role": "user", "content": "Start conversation"}],
|
|
175
|
+
answer="target_answer",
|
|
176
|
+
),
|
|
177
|
+
client=mock_client,
|
|
178
|
+
model="test-model",
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
assert state["timed_out"] is True
|
|
182
|
+
assert state["is_completed"] is True
|
|
183
|
+
assert state["stop_condition"] == "timeout_reached"
|
|
184
|
+
assert state["trajectory"] == []
|
|
185
|
+
assert state["completion"] == []
|
|
186
|
+
|
|
106
187
|
@pytest.mark.asyncio
|
|
107
188
|
async def test_override_is_completed_respects_max_turns(
|
|
108
189
|
self, mock_client, sample_chat_dataset, make_input
|
|
@@ -289,13 +289,12 @@ class TestSetupState:
|
|
|
289
289
|
OpenCodeRLMEnv.__bases__[0],
|
|
290
290
|
"setup_state",
|
|
291
291
|
new_callable=AsyncMock,
|
|
292
|
-
return_value=state,
|
|
293
292
|
):
|
|
294
|
-
|
|
295
|
-
assert
|
|
296
|
-
assert
|
|
297
|
-
assert
|
|
298
|
-
assert
|
|
293
|
+
await env.setup_state(state)
|
|
294
|
+
assert state["sub_llm_turns"] == 0
|
|
295
|
+
assert state["sub_llm_prompt_tokens"] == 0
|
|
296
|
+
assert state["sub_llm_completion_tokens"] == 0
|
|
297
|
+
assert state["_sub_llm_tasks"] == set()
|
|
299
298
|
|
|
300
299
|
@pytest.mark.asyncio
|
|
301
300
|
async def test_preserves_existing_sub_metrics(self):
|
|
@@ -305,10 +304,9 @@ class TestSetupState:
|
|
|
305
304
|
OpenCodeRLMEnv.__bases__[0],
|
|
306
305
|
"setup_state",
|
|
307
306
|
new_callable=AsyncMock,
|
|
308
|
-
return_value=state,
|
|
309
307
|
):
|
|
310
|
-
|
|
311
|
-
assert
|
|
308
|
+
await env.setup_state(state)
|
|
309
|
+
assert state["sub_llm_turns"] == 3
|
|
312
310
|
|
|
313
311
|
|
|
314
312
|
# =============================================================================
|