verifiers 0.1.13.dev4__tar.gz → 0.1.13.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/PKG-INFO +1 -1
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_eval_cli.py +2 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_interception_utils.py +73 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/__init__.py +1 -1
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/README.md +4 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/harnesses/__init__.py +12 -0
- verifiers-0.1.13.dev5/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +230 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/eval.py +58 -1
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/eval_utils.py +2 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/interception_utils.py +78 -7
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/.gitignore +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/LICENSE +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/pyproject.toml +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/AGENTS.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/conftest.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_build_script.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_client_config.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_env_group.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_env_server.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_environment.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_envs.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_imports.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_logging.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_rlm_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/clients/openai_chat_completions_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/decorators.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/environment.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/composable_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/task.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/rlm_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/errors.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/prime_rl.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/types.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/save_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev5}/verifiers/utils/version_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.13.dev4
|
|
3
|
+
Version: 0.1.13.dev5
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -41,6 +41,8 @@ def run_cli(make_metadata, make_state, make_input):
|
|
|
41
41
|
"api_base_url": "https://api.openai.com/v1",
|
|
42
42
|
"header": None,
|
|
43
43
|
"headers": None,
|
|
44
|
+
"header_from_state": None,
|
|
45
|
+
"headers_from_state": None,
|
|
44
46
|
"num_examples": 1,
|
|
45
47
|
"rollouts_per_example": 1,
|
|
46
48
|
"max_concurrent": 1,
|
|
@@ -131,3 +131,76 @@ async def test_streaming_write_failure_surfaces_to_state(monkeypatch):
|
|
|
131
131
|
|
|
132
132
|
assert isinstance(state["error"], StreamInterrupted)
|
|
133
133
|
assert "ConnectionResetError" in str(state["error"])
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
async def test_keepalive_emitted_during_idle(monkeypatch):
|
|
137
|
+
"""During the idle window (no chunks on chunk_queue) the handler must
|
|
138
|
+
emit SSE keepalive comments so upstream idle-timeouts don't fire."""
|
|
139
|
+
monkeypatch.setattr(interception_utils, "KEEPALIVE_INTERVAL_SECONDS", 0.05)
|
|
140
|
+
server = InterceptionServer(port=0)
|
|
141
|
+
state: dict = {}
|
|
142
|
+
server.register_rollout("r1", state=state)
|
|
143
|
+
|
|
144
|
+
writes: list[bytes] = []
|
|
145
|
+
|
|
146
|
+
async def fake_write(data: bytes) -> None:
|
|
147
|
+
writes.append(data)
|
|
148
|
+
|
|
149
|
+
fake_response = MagicMock()
|
|
150
|
+
fake_response.prepare = AsyncMock()
|
|
151
|
+
fake_response.write = AsyncMock(side_effect=fake_write)
|
|
152
|
+
fake_response.write_eof = AsyncMock()
|
|
153
|
+
monkeypatch.setattr(
|
|
154
|
+
interception_utils.web, "StreamResponse", lambda **_: fake_response
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
chunk_queue: asyncio.Queue = asyncio.Queue() # starts empty
|
|
158
|
+
response_future: asyncio.Future = asyncio.Future()
|
|
159
|
+
intercept = {
|
|
160
|
+
"chunk_queue": chunk_queue,
|
|
161
|
+
"response_future": response_future,
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
task = asyncio.create_task(
|
|
165
|
+
server._handle_streaming_response(MagicMock(), "r1", intercept)
|
|
166
|
+
)
|
|
167
|
+
await asyncio.sleep(0.2) # enough for a few keepalive cycles
|
|
168
|
+
|
|
169
|
+
# Close the loop cleanly: EOF sentinel + resolved future → handler returns.
|
|
170
|
+
response_future.set_result(None)
|
|
171
|
+
await chunk_queue.put(None)
|
|
172
|
+
await task
|
|
173
|
+
|
|
174
|
+
assert any(w == b": keepalive\n\n" for w in writes), (
|
|
175
|
+
f"expected at least one keepalive write, got writes={writes}"
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
async def test_keepalive_write_failure_surfaces_to_state(monkeypatch):
|
|
180
|
+
"""A failed keepalive write (upstream already cut the TCP connection)
|
|
181
|
+
must funnel into ``state["error"]`` with elapsed-time instrumentation."""
|
|
182
|
+
monkeypatch.setattr(interception_utils, "KEEPALIVE_INTERVAL_SECONDS", 0.05)
|
|
183
|
+
server = InterceptionServer(port=0)
|
|
184
|
+
state: dict = {}
|
|
185
|
+
server.register_rollout("r1", state=state)
|
|
186
|
+
|
|
187
|
+
fake_response = MagicMock()
|
|
188
|
+
fake_response.prepare = AsyncMock()
|
|
189
|
+
fake_response.write = AsyncMock(side_effect=ConnectionResetError("tunnel died"))
|
|
190
|
+
fake_response.write_eof = AsyncMock()
|
|
191
|
+
monkeypatch.setattr(
|
|
192
|
+
interception_utils.web, "StreamResponse", lambda **_: fake_response
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
chunk_queue: asyncio.Queue = asyncio.Queue() # never produces
|
|
196
|
+
intercept = {
|
|
197
|
+
"chunk_queue": chunk_queue,
|
|
198
|
+
"response_future": asyncio.Future(),
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
await server._handle_streaming_response(MagicMock(), "r1", intercept)
|
|
202
|
+
|
|
203
|
+
assert isinstance(state["error"], StreamInterrupted)
|
|
204
|
+
msg = str(state["error"])
|
|
205
|
+
assert "keepalive write failed" in msg
|
|
206
|
+
assert "ConnectionResetError" in msg
|
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
Newer and more experimental environment classes that may have some sharper edges + change more frequently.
|
|
4
4
|
|
|
5
|
+
## SandboxMixin
|
|
6
|
+
|
|
7
|
+
`SandboxMixin` works with both container and VM sandboxes. If your environment needs a VM, pass `CreateSandboxRequest(..., vm=True)` to `create_sandbox`. For a GPU VM, also set `gpu_count` and `gpu_type`. Everyday sandbox operations like file upload, file reads, background jobs, and cleanup work the same way. Port exposure and SSH are currently container-only.
|
|
8
|
+
|
|
5
9
|
## GymEnv
|
|
6
10
|
|
|
7
11
|
Universal runner for Gym-compatible environments. Wraps any environment that implements `reset(seed)` and `step(action)` methods (following the OpenAI Gym / Gymnasium API). Supports both old-style 4-tuple and new-style 5-tuple step returns.
|
|
@@ -16,6 +16,13 @@ from verifiers.envs.experimental.composable.harnesses.opencode import (
|
|
|
16
16
|
build_opencode_run_command,
|
|
17
17
|
opencode_harness,
|
|
18
18
|
)
|
|
19
|
+
from verifiers.envs.experimental.composable.harnesses.mini_swe_agent import (
|
|
20
|
+
MINI_SWE_AGENT_CONFIG,
|
|
21
|
+
MINI_SWE_AGENT_INSTALL_SCRIPT,
|
|
22
|
+
build_mini_swe_agent_install_script,
|
|
23
|
+
build_mini_swe_agent_run_command,
|
|
24
|
+
mini_swe_agent_harness,
|
|
25
|
+
)
|
|
19
26
|
|
|
20
27
|
__all__ = [
|
|
21
28
|
"rlm_harness",
|
|
@@ -32,4 +39,9 @@ __all__ = [
|
|
|
32
39
|
"DEFAULT_DISABLED_TOOLS",
|
|
33
40
|
"DEFAULT_RELEASE_SHA256",
|
|
34
41
|
"DEFAULT_SYSTEM_PROMPT",
|
|
42
|
+
"mini_swe_agent_harness",
|
|
43
|
+
"build_mini_swe_agent_install_script",
|
|
44
|
+
"build_mini_swe_agent_run_command",
|
|
45
|
+
"MINI_SWE_AGENT_INSTALL_SCRIPT",
|
|
46
|
+
"MINI_SWE_AGENT_CONFIG",
|
|
35
47
|
]
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""mini-SWE-agent harness configuration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import PurePosixPath
|
|
6
|
+
import shlex
|
|
7
|
+
|
|
8
|
+
DEFAULT_INSTALL_DIR = "/opt/mini-swe-agent"
|
|
9
|
+
DEFAULT_PREFIX_DIR = f"{DEFAULT_INSTALL_DIR}/prefix"
|
|
10
|
+
DEFAULT_SITE_PACKAGES_DIR = f"{DEFAULT_PREFIX_DIR}/site-packages"
|
|
11
|
+
DEFAULT_UV_SITE_PACKAGES_DIR = f"{DEFAULT_INSTALL_DIR}/uv-site-packages"
|
|
12
|
+
DEFAULT_MINI_BINARY = f"{DEFAULT_PREFIX_DIR}/bin/mini"
|
|
13
|
+
MINI_SWE_AGENT_CLI_PACKAGE = "mini-swe-agent"
|
|
14
|
+
MINI_SWE_AGENT_CLI_VERSION = "2.2.8"
|
|
15
|
+
MINI_SWE_AGENT_CLI_SHA256 = (
|
|
16
|
+
"694df4de1337e665e3cd82e99f93374f573bf52b8e7c362ac5d8045ad9f7c37c"
|
|
17
|
+
)
|
|
18
|
+
MINI_SWE_AGENT_PYTHON_VERSION = "3.11"
|
|
19
|
+
UV_PACKAGE_VERSION = "0.11.7"
|
|
20
|
+
DEFAULT_PACKAGE_VERSION = MINI_SWE_AGENT_CLI_VERSION
|
|
21
|
+
DEFAULT_PACKAGE_SHA256 = MINI_SWE_AGENT_CLI_SHA256
|
|
22
|
+
DEFAULT_INSTRUCTION_PATH = "/mini-swe-agent/prompt.txt"
|
|
23
|
+
DEFAULT_SYSTEM_PROMPT_PATH = "/mini-swe-agent/system.txt"
|
|
24
|
+
DEFAULT_LOG_DIR = "/logs/agent"
|
|
25
|
+
DEFAULT_LOG_PATH = f"{DEFAULT_LOG_DIR}/mini-swe-agent.log"
|
|
26
|
+
DEFAULT_TRAJECTORY_PATH = f"{DEFAULT_LOG_DIR}/mini-swe-agent.traj.json"
|
|
27
|
+
DEFAULT_AGENT_WORKDIR = "${AGENT_WORKDIR:-/app}"
|
|
28
|
+
DEFAULT_CONFIG_SPEC = "mini_textbased"
|
|
29
|
+
DEFAULT_MODEL_CLASS = "litellm_textbased"
|
|
30
|
+
DEFAULT_ENVIRONMENT_TIMEOUT = 120
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def build_mini_swe_agent_install_script(
|
|
34
|
+
package_version: str = DEFAULT_PACKAGE_VERSION,
|
|
35
|
+
package_sha256: str = DEFAULT_PACKAGE_SHA256,
|
|
36
|
+
prefix_dir: str = DEFAULT_PREFIX_DIR,
|
|
37
|
+
install_python: bool = True,
|
|
38
|
+
) -> str:
|
|
39
|
+
"""Build the shell script that installs mini-SWE-agent."""
|
|
40
|
+
install_tools = ""
|
|
41
|
+
if install_python:
|
|
42
|
+
install_tools = """\
|
|
43
|
+
export DEBIAN_FRONTEND=noninteractive
|
|
44
|
+
if ! command -v python3 >/dev/null 2>&1 || ! python3 -m pip --version >/dev/null 2>&1; then
|
|
45
|
+
apt-get update -qq
|
|
46
|
+
apt-get install -y -qq python3 python3-pip ca-certificates
|
|
47
|
+
fi
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
quoted_prefix_dir = shlex.quote(prefix_dir)
|
|
51
|
+
site_packages_dir = f"{prefix_dir}/site-packages"
|
|
52
|
+
wheel_filename = f"mini_swe_agent-{package_version}-py3-none-any.whl"
|
|
53
|
+
wheel_url = (
|
|
54
|
+
f"https://files.pythonhosted.org/packages/py3/m/mini-swe-agent/{wheel_filename}"
|
|
55
|
+
)
|
|
56
|
+
quoted_site_packages_dir = shlex.quote(site_packages_dir)
|
|
57
|
+
quoted_install_dir = shlex.quote(DEFAULT_INSTALL_DIR)
|
|
58
|
+
quoted_uv_site_packages_dir = shlex.quote(DEFAULT_UV_SITE_PACKAGES_DIR)
|
|
59
|
+
return f"""\
|
|
60
|
+
set -e
|
|
61
|
+
{install_tools}
|
|
62
|
+
rm -rf {quoted_prefix_dir}
|
|
63
|
+
mkdir -p {quoted_install_dir} {quoted_prefix_dir}/bin {quoted_site_packages_dir} {quoted_uv_site_packages_dir} {shlex.quote(DEFAULT_LOG_DIR)} /mini-swe-agent
|
|
64
|
+
export PIP_CONFIG_FILE=/dev/null
|
|
65
|
+
export PIP_INDEX_URL=https://pypi.org/simple
|
|
66
|
+
export PIP_BREAK_SYSTEM_PACKAGES=1
|
|
67
|
+
unset PIP_EXTRA_INDEX_URL
|
|
68
|
+
PYTHON_BIN="$(command -v python3)"
|
|
69
|
+
MINI_SWE_AGENT_PYTHON="$PYTHON_BIN"
|
|
70
|
+
if ! "$PYTHON_BIN" -c 'import sys; raise SystemExit(sys.version_info < (3, 10))'; then
|
|
71
|
+
"$PYTHON_BIN" -m pip install --quiet --target {quoted_uv_site_packages_dir} uv=={UV_PACKAGE_VERSION}
|
|
72
|
+
env PYTHONPATH={quoted_uv_site_packages_dir} "$PYTHON_BIN" -m uv python install {MINI_SWE_AGENT_PYTHON_VERSION}
|
|
73
|
+
MINI_SWE_AGENT_PYTHON="$(env PYTHONPATH={quoted_uv_site_packages_dir} "$PYTHON_BIN" -m uv python find {MINI_SWE_AGENT_PYTHON_VERSION})"
|
|
74
|
+
fi
|
|
75
|
+
MINI_SWE_AGENT_WHEEL_DIR="$(mktemp -d)"
|
|
76
|
+
trap 'rm -rf "$MINI_SWE_AGENT_WHEEL_DIR"' EXIT
|
|
77
|
+
MINI_SWE_AGENT_WHEEL="$MINI_SWE_AGENT_WHEEL_DIR/{wheel_filename}"
|
|
78
|
+
MINI_SWE_AGENT_WHEEL_URL={shlex.quote(wheel_url)}
|
|
79
|
+
export MINI_SWE_AGENT_WHEEL MINI_SWE_AGENT_WHEEL_URL
|
|
80
|
+
"$PYTHON_BIN" -c 'import os, urllib.request; urllib.request.urlretrieve(os.environ["MINI_SWE_AGENT_WHEEL_URL"], os.environ["MINI_SWE_AGENT_WHEEL"])'
|
|
81
|
+
echo "{package_sha256} $MINI_SWE_AGENT_WHEEL" | sha256sum -c -
|
|
82
|
+
if [ "$MINI_SWE_AGENT_PYTHON" = "$PYTHON_BIN" ]; then
|
|
83
|
+
"$PYTHON_BIN" -m pip install --quiet --target {quoted_site_packages_dir} "$MINI_SWE_AGENT_WHEEL"
|
|
84
|
+
else
|
|
85
|
+
env PYTHONPATH={quoted_uv_site_packages_dir} "$PYTHON_BIN" -m uv pip install --python "$MINI_SWE_AGENT_PYTHON" --target {quoted_site_packages_dir} "$MINI_SWE_AGENT_WHEEL"
|
|
86
|
+
fi
|
|
87
|
+
echo "$MINI_SWE_AGENT_PYTHON" > {quoted_prefix_dir}/python
|
|
88
|
+
cat > {quoted_prefix_dir}/bin/mini <<'EOF'
|
|
89
|
+
#!/usr/bin/env sh
|
|
90
|
+
export PYTHONPATH={shlex.quote(site_packages_dir)}:${{PYTHONPATH:-}}
|
|
91
|
+
exec "$(cat {quoted_prefix_dir}/python)" -m minisweagent.run.mini "$@"
|
|
92
|
+
EOF
|
|
93
|
+
chmod +x {quoted_prefix_dir}/bin/mini
|
|
94
|
+
test -x {quoted_prefix_dir}/bin/mini
|
|
95
|
+
"""
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def build_mini_swe_agent_run_command(
    agent_workdir: str = DEFAULT_AGENT_WORKDIR,
    instruction_path: str = DEFAULT_INSTRUCTION_PATH,
    system_prompt_path: str = DEFAULT_SYSTEM_PROMPT_PATH,
    log_path: str = DEFAULT_LOG_PATH,
    trajectory_path: str = DEFAULT_TRAJECTORY_PATH,
    mini_binary: str = DEFAULT_MINI_BINARY,
    config_spec: str = DEFAULT_CONFIG_SPEC,
    model_class: str = DEFAULT_MODEL_CLASS,
    environment_timeout: int = DEFAULT_ENVIRONMENT_TIMEOUT,
    extra_config_specs: list[str] | None = None,
) -> str:
    """Return the ``bash -lc`` command line that launches mini-SWE-agent.

    The generated script exports the mini-SWE-agent environment, creates the
    log, trajectory, workdir and global-config directories, reads the task
    from ``instruction_path`` and then runs ``mini_binary`` under ``timeout``
    with its combined stdout/stderr appended to ``log_path`` via ``tee -a``.

    Args:
        agent_workdir: Directory the agent runs in. The default value is
            written into the script unquoted; any other value is shell-quoted.
        instruction_path: File whose contents become the ``--task`` argument.
        system_prompt_path: File whose contents are passed as
            ``agent.system_template``, but only when the file is non-empty.
        log_path: File the agent output is appended to.
        trajectory_path: File passed to ``--output``.
        mini_binary: Executable to invoke.
        config_spec: First ``-c`` config spec handed to the CLI.
        model_class: Value for the ``model.model_class`` override.
        environment_timeout: Value for the ``environment.timeout`` override.
        extra_config_specs: Additional ``-c`` specs appended after the
            built-in overrides, each shell-quoted.

    Returns:
        A single shell command string of the form ``bash -lc '<script>'``.
    """
    # The default workdir is deliberately left unquoted so the shell can expand
    # it (env-level overrides); caller-supplied paths are quoted literally.
    workdir_value = (
        DEFAULT_AGENT_WORKDIR
        if agent_workdir == DEFAULT_AGENT_WORKDIR
        else shlex.quote(agent_workdir)
    )

    # Each override becomes one "-c <spec>" pair, in this order; caller extras
    # come last.
    overrides = [
        shlex.quote(config_spec),
        "agent.cost_limit=0",
        f"environment.timeout={environment_timeout}",
        f"model.model_class={shlex.quote(model_class)}",
        "model.cost_tracking=ignore_errors",
        "model.model_kwargs.custom_llm_provider=openai",
    ]
    overrides.extend(shlex.quote(extra) for extra in extra_config_specs or [])
    rendered_config_args = " ".join(f"-c {override}" for override in overrides)

    # Pre-quote every path that is interpolated into the script.
    quoted_prefix_dir = shlex.quote(DEFAULT_PREFIX_DIR)
    quoted_site_packages = shlex.quote(DEFAULT_SITE_PACKAGES_DIR)
    quoted_log_dir = shlex.quote(str(PurePosixPath(log_path).parent))
    quoted_trajectory_dir = shlex.quote(str(PurePosixPath(trajectory_path).parent))
    quoted_instruction = shlex.quote(instruction_path)
    quoted_system_prompt = shlex.quote(system_prompt_path)
    quoted_binary = shlex.quote(mini_binary)
    quoted_trajectory = shlex.quote(trajectory_path)
    quoted_log = shlex.quote(log_path)

    # `pipefail` keeps the agent's exit status visible through the `tee` pipe.
    script = f"""\
set -eo pipefail
export PATH={quoted_prefix_dir}/bin:"$PATH"
export PYTHONPATH={quoted_site_packages}:"${{PYTHONPATH:-}}"
export MSWEA_CONFIGURED=true
export MSWEA_SILENT_STARTUP=true
export MSWEA_GLOBAL_CONFIG_DIR=/tmp/mini-swe-agent-config
export OPENAI_API_KEY="${{OPENAI_API_KEY:-intercepted}}"

MINI_SWE_AGENT_WORKDIR={workdir_value}
mkdir -p {quoted_log_dir} {quoted_trajectory_dir} "$MINI_SWE_AGENT_WORKDIR" "$MSWEA_GLOBAL_CONFIG_DIR"

MINI_SWE_AGENT_TASK="$(cat {quoted_instruction})"
CONFIG_ARGS=({rendered_config_args})
CONFIG_ARGS+=(-c "environment.cwd=$MINI_SWE_AGENT_WORKDIR")
if [ -s {quoted_system_prompt} ]; then
CONFIG_ARGS+=(-c "agent.system_template=$(cat {quoted_system_prompt})")
fi

cd "$MINI_SWE_AGENT_WORKDIR"
timeout --kill-after=30s "${{AGENT_TIMEOUT_SECONDS:-3600}}" {quoted_binary} \\
--model "$OPENAI_MODEL" \\
--task "$MINI_SWE_AGENT_TASK" \\
--output {quoted_trajectory} \\
--exit-immediately \\
--yolo \\
"${{CONFIG_ARGS[@]}}" 2>&1 | tee -a {quoted_log}
"""
    return f"bash -lc {shlex.quote(script)}"
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# Install script rendered once at import time with the builder's defaults.
MINI_SWE_AGENT_INSTALL_SCRIPT = build_mini_swe_agent_install_script()

# Bundle of the default install script and the CLI package/version/sha256
# constants defined earlier in this module.
MINI_SWE_AGENT_CONFIG = dict(
    install_script=MINI_SWE_AGENT_INSTALL_SCRIPT,
    cli_package=MINI_SWE_AGENT_CLI_PACKAGE,
    cli_version=MINI_SWE_AGENT_CLI_VERSION,
    cli_sha256=MINI_SWE_AGENT_CLI_SHA256,
)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def mini_swe_agent_harness(
    system_prompt: str | None = None,
    task_system_prompt: str | None = None,
    agent_workdir: str = DEFAULT_AGENT_WORKDIR,
    instruction_path: str = DEFAULT_INSTRUCTION_PATH,
    system_prompt_path: str = DEFAULT_SYSTEM_PROMPT_PATH,
    log_path: str = DEFAULT_LOG_PATH,
    trajectory_path: str = DEFAULT_TRAJECTORY_PATH,
    package_version: str = DEFAULT_PACKAGE_VERSION,
    package_sha256: str = DEFAULT_PACKAGE_SHA256,
    config_spec: str = DEFAULT_CONFIG_SPEC,
    model_class: str = DEFAULT_MODEL_CLASS,
    environment_timeout: int = DEFAULT_ENVIRONMENT_TIMEOUT,
    extra_config_specs: list[str] | None = None,
):
    """Assemble a ``Harness`` that installs and runs mini-SWE-agent.

    Args:
        system_prompt: Base system prompt, if any.
        task_system_prompt: Task-specific prompt. When non-empty it is
            appended to ``system_prompt`` after a newline, or used on its own
            when ``system_prompt`` is empty or ``None``.
        agent_workdir: Forwarded to the run-command builder.
        instruction_path: Forwarded to the run-command builder and the Harness.
        system_prompt_path: Forwarded to the run-command builder and the
            Harness.
        log_path: Forwarded to the run-command builder and the Harness.
        trajectory_path: Forwarded to the run-command builder.
        package_version: Forwarded to the install-script builder.
        package_sha256: Forwarded to the install-script builder.
        config_spec: Forwarded to the run-command builder.
        model_class: Forwarded to the run-command builder.
        environment_timeout: Forwarded to the run-command builder.
        extra_config_specs: Forwarded to the run-command builder.

    Returns:
        The constructed ``Harness`` instance.
    """
    # Function-scope import — presumably avoids an import cycle with the
    # composable package; TODO confirm.
    from verifiers.envs.experimental.composable import Harness

    if task_system_prompt:
        system_prompt = (
            "\n".join((system_prompt, task_system_prompt))
            if system_prompt
            else task_system_prompt
        )

    install_script = build_mini_swe_agent_install_script(
        package_version=package_version,
        package_sha256=package_sha256,
    )
    run_command = build_mini_swe_agent_run_command(
        agent_workdir=agent_workdir,
        instruction_path=instruction_path,
        system_prompt_path=system_prompt_path,
        log_path=log_path,
        trajectory_path=trajectory_path,
        config_spec=config_spec,
        model_class=model_class,
        environment_timeout=environment_timeout,
        extra_config_specs=extra_config_specs,
    )

    # The prompt itself travels to the sandbox as a file (system_prompt_path);
    # the run command injects it into mini's agent.system_template at runtime.
    return Harness(
        install_script=install_script,
        run_command=run_command,
        system_prompt=system_prompt,
        instruction_path=instruction_path,
        system_prompt_path=system_prompt_path,
        log_path=log_path,
    )
|
|
@@ -142,6 +142,47 @@ def build_extra_headers(raw: dict[str, Any]) -> dict[str, str]:
|
|
|
142
142
|
return {**eval_headers_table, **eval_headers_from_list}
|
|
143
143
|
|
|
144
144
|
|
|
145
|
+
def build_extra_headers_from_state(raw: dict[str, Any]) -> dict[str, str]:
    """Return the header-name → state-key mapping for `ClientConfig.extra_headers_from_state`.

    Two optional inputs are read from ``raw``:

    * ``headers_from_state`` — a table such as
      ``{ "X-Session-ID" = "trajectory_id" }``, validated by
      ``_validate_extra_headers_value``.
    * ``header_from_state`` — a list of ``"Name: state_key"`` strings, as
      produced by the repeatable ``--header-from-state`` flag.

    When both define the same header name, the list entry wins.

    Raises:
        ValueError: If ``header_from_state`` is not a list, an entry is not a
            string, an entry has no ``:``, or the name or state key is empty
            after stripping whitespace.
    """
    table_value = raw.get("headers_from_state")
    from_table: dict[str, str] = (
        {} if table_value is None else _validate_extra_headers_value(table_value)
    )

    entries = raw.get("header_from_state")
    if entries is None:
        entries = []
    elif not isinstance(entries, list):
        raise ValueError(
            "'header_from_state' must be a list of 'Name: state_key' strings"
        )

    from_entries: dict[str, str] = {}
    for entry in entries:
        if not isinstance(entry, str):
            raise ValueError(
                f"Each 'header_from_state' entry must be a string 'Name: state_key', got: {entry!r}"
            )
        # Split on the first colon only, so state keys may themselves contain ':'.
        name, separator, state_key = entry.partition(":")
        if not separator:
            raise ValueError(
                f"--header-from-state must be 'Name: state_key', got: {entry!r}"
            )
        name = name.strip()
        state_key = state_key.strip()
        if not name:
            raise ValueError("--header-from-state name cannot be empty")
        if not state_key:
            raise ValueError("--header-from-state state_key cannot be empty")
        from_entries[name] = state_key

    # Later unpacking wins, so list entries override table entries.
    return {**from_table, **from_entries}
|
|
184
|
+
|
|
185
|
+
|
|
145
186
|
def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
|
|
146
187
|
"""Get eval config defaults from the environment module's pyproject.toml.
|
|
147
188
|
|
|
@@ -279,6 +320,16 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
279
320
|
default=None,
|
|
280
321
|
help="Extra HTTP header to pass to inference API. 'Name: Value'. Repeatable.",
|
|
281
322
|
)
|
|
323
|
+
parser.add_argument(
|
|
324
|
+
"--header-from-state",
|
|
325
|
+
action="append",
|
|
326
|
+
default=None,
|
|
327
|
+
help=(
|
|
328
|
+
"Per-request HTTP header whose value is read from the rollout state. "
|
|
329
|
+
"'Name: state_key' (e.g. 'X-Session-ID: trajectory_id'). Repeatable. "
|
|
330
|
+
"Defaults to X-Session-ID=example_id if unset."
|
|
331
|
+
),
|
|
332
|
+
)
|
|
282
333
|
parser.add_argument(
|
|
283
334
|
"--num-examples",
|
|
284
335
|
"-n",
|
|
@@ -639,6 +690,12 @@ def main(argv: list[str] | None = None):
|
|
|
639
690
|
)
|
|
640
691
|
# Build headers: registry < [[eval]] headers table < header list / --header
|
|
641
692
|
eval_headers_merged = build_extra_headers(raw)
|
|
693
|
+
# Default X-Session-ID → example_id for sticky DP-aware routing;
|
|
694
|
+
# user-supplied headers_from_state / --header-from-state override.
|
|
695
|
+
eval_headers_from_state = {
|
|
696
|
+
"X-Session-ID": "example_id",
|
|
697
|
+
**build_extra_headers_from_state(raw),
|
|
698
|
+
}
|
|
642
699
|
|
|
643
700
|
registry_headers_base: dict[str, str] = {}
|
|
644
701
|
if endpoint_group is not None:
|
|
@@ -683,7 +740,7 @@ def main(argv: list[str] | None = None):
|
|
|
683
740
|
api_base_url=primary_api_base_url,
|
|
684
741
|
endpoint_configs=endpoint_configs,
|
|
685
742
|
extra_headers=merged_headers,
|
|
686
|
-
extra_headers_from_state=
|
|
743
|
+
extra_headers_from_state=eval_headers_from_state,
|
|
687
744
|
)
|
|
688
745
|
|
|
689
746
|
# Backward-compatible TOML field: resume_path
|
|
@@ -26,11 +26,14 @@ from openai.types.chat.chat_completion_chunk import (
|
|
|
26
26
|
|
|
27
27
|
from verifiers.errors import InfraError
|
|
28
28
|
from verifiers.types import Response
|
|
29
|
-
from verifiers.utils.logging_utils import truncate
|
|
29
|
+
from verifiers.utils.logging_utils import print_time, truncate
|
|
30
30
|
|
|
31
31
|
logger = logging.getLogger(__name__)
|
|
32
32
|
|
|
33
33
|
|
|
34
|
+
KEEPALIVE_INTERVAL_SECONDS = 10.0
|
|
35
|
+
|
|
36
|
+
|
|
34
37
|
class StreamInterrupted(InfraError):
|
|
35
38
|
"""Raised when the intercepted streaming response to the agent is cut short.
|
|
36
39
|
|
|
@@ -231,11 +234,56 @@ class InterceptionServer:
|
|
|
231
234
|
"Connection": "keep-alive",
|
|
232
235
|
},
|
|
233
236
|
)
|
|
234
|
-
await response.prepare(http_request)
|
|
235
237
|
|
|
238
|
+
start = time.monotonic()
|
|
239
|
+
|
|
240
|
+
# Half-open transport at accept raises here; surface it so the
|
|
241
|
+
# rollout reschedules instead of looking like a clean empty stream.
|
|
242
|
+
try:
|
|
243
|
+
await response.prepare(http_request)
|
|
244
|
+
except Exception as e:
|
|
245
|
+
logger.warning(
|
|
246
|
+
f"[{rollout_id}] Streaming response.prepare failed: "
|
|
247
|
+
f"{type(e).__name__}: {e}"
|
|
248
|
+
)
|
|
249
|
+
self._set_rollout_error(
|
|
250
|
+
rollout_id,
|
|
251
|
+
StreamInterrupted(f"prepare failed: {type(e).__name__}: {e}"),
|
|
252
|
+
)
|
|
253
|
+
return response
|
|
254
|
+
# Reuse one get() task across keepalive cycles; asyncio.wait_for on
|
|
255
|
+
# Py 3.10/3.11 can silently drop an item when its timeout cancels.
|
|
256
|
+
get_task: asyncio.Task | None = None
|
|
236
257
|
try:
|
|
237
258
|
while True:
|
|
238
|
-
|
|
259
|
+
if get_task is None:
|
|
260
|
+
get_task = asyncio.create_task(chunk_queue.get())
|
|
261
|
+
done, _ = await asyncio.wait(
|
|
262
|
+
{get_task}, timeout=KEEPALIVE_INTERVAL_SECONDS
|
|
263
|
+
)
|
|
264
|
+
if get_task not in done:
|
|
265
|
+
# SSE comment keeps the TCP path warm across the vLLM wait
|
|
266
|
+
# so idle-timeouts in any intermediary don't reap it.
|
|
267
|
+
try:
|
|
268
|
+
await response.write(b": keepalive\n\n")
|
|
269
|
+
except Exception as e:
|
|
270
|
+
waited_s = time.monotonic() - start
|
|
271
|
+
logger.debug(
|
|
272
|
+
f"[{rollout_id}] Streaming error during keepalive "
|
|
273
|
+
f"after {print_time(waited_s)}: {e}"
|
|
274
|
+
)
|
|
275
|
+
self._set_rollout_error(
|
|
276
|
+
rollout_id,
|
|
277
|
+
StreamInterrupted(
|
|
278
|
+
f"keepalive write failed after {print_time(waited_s)}: "
|
|
279
|
+
f"{type(e).__name__}: {e}"
|
|
280
|
+
),
|
|
281
|
+
)
|
|
282
|
+
return response
|
|
283
|
+
continue
|
|
284
|
+
|
|
285
|
+
chunk_dict = get_task.result()
|
|
286
|
+
get_task = None
|
|
239
287
|
|
|
240
288
|
if chunk_dict is None:
|
|
241
289
|
await response.write(b"data: [DONE]\n\n")
|
|
@@ -243,18 +291,28 @@ class InterceptionServer:
|
|
|
243
291
|
|
|
244
292
|
chunk_json = json.dumps(chunk_dict)
|
|
245
293
|
await response.write(f"data: {chunk_json}\n\n".encode())
|
|
294
|
+
# Force a loop yield so the transport flushes before close;
|
|
295
|
+
# otherwise burst contention can truncate the final chunk.
|
|
296
|
+
await asyncio.sleep(0)
|
|
246
297
|
|
|
247
298
|
except asyncio.CancelledError:
|
|
248
299
|
logger.debug(f"[{rollout_id}] Streaming cancelled")
|
|
249
300
|
except Exception as e:
|
|
250
|
-
|
|
301
|
+
waited_s = time.monotonic() - start
|
|
302
|
+
logger.debug(
|
|
303
|
+
f"[{rollout_id}] Streaming error after {print_time(waited_s)}: {e}"
|
|
304
|
+
)
|
|
251
305
|
self._set_rollout_error(
|
|
252
306
|
rollout_id,
|
|
253
307
|
StreamInterrupted(
|
|
254
|
-
f"
|
|
308
|
+
f"stream write failed after {print_time(waited_s)}: "
|
|
309
|
+
f"{type(e).__name__}: {e}"
|
|
255
310
|
),
|
|
256
311
|
)
|
|
257
312
|
return response
|
|
313
|
+
finally:
|
|
314
|
+
if get_task is not None and not get_task.done():
|
|
315
|
+
get_task.cancel()
|
|
258
316
|
|
|
259
317
|
try:
|
|
260
318
|
await response_future
|
|
@@ -263,10 +321,23 @@ class InterceptionServer:
|
|
|
263
321
|
f"[{rollout_id}] Rollout error surfaced in stream: {type(e).__name__}: {e}"
|
|
264
322
|
)
|
|
265
323
|
|
|
324
|
+
# Surface any write_eof failure so a tail truncation becomes a
|
|
325
|
+
# reschedulable error instead of a silent zero-turn completion.
|
|
266
326
|
try:
|
|
267
327
|
await response.write_eof()
|
|
268
|
-
except
|
|
269
|
-
|
|
328
|
+
except Exception as e:
|
|
329
|
+
waited_s = time.monotonic() - start
|
|
330
|
+
logger.warning(
|
|
331
|
+
f"[{rollout_id}] write_eof failed after {print_time(waited_s)}: "
|
|
332
|
+
f"{type(e).__name__}: {e}"
|
|
333
|
+
)
|
|
334
|
+
self._set_rollout_error(
|
|
335
|
+
rollout_id,
|
|
336
|
+
StreamInterrupted(
|
|
337
|
+
f"write_eof failed after {print_time(waited_s)}: "
|
|
338
|
+
f"{type(e).__name__}: {e}"
|
|
339
|
+
),
|
|
340
|
+
)
|
|
270
341
|
return response
|
|
271
342
|
|
|
272
343
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|