verifiers 0.1.13.dev4__tar.gz → 0.1.13.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/PKG-INFO +1 -1
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_eval_cli.py +2 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_interception_utils.py +73 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/__init__.py +1 -1
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/README.md +4 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/harnesses/__init__.py +16 -0
- verifiers-0.1.13.dev6/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +230 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/harnesses/rlm.py +25 -8
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/task.py +3 -3
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/eval.py +58 -1
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/eval_utils.py +2 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/interception_utils.py +78 -7
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/.gitignore +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/LICENSE +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/pyproject.toml +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/AGENTS.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/conftest.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_build_script.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_client_config.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_env_group.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_env_server.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_environment.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_envs.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_imports.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_logging.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_rlm_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/clients/openai_chat_completions_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/decorators.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/environment.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/composable_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/rlm_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/errors.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/prime_rl.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/types.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/save_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/utils/version_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.13.dev4
|
|
3
|
+
Version: 0.1.13.dev6
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -41,6 +41,8 @@ def run_cli(make_metadata, make_state, make_input):
|
|
|
41
41
|
"api_base_url": "https://api.openai.com/v1",
|
|
42
42
|
"header": None,
|
|
43
43
|
"headers": None,
|
|
44
|
+
"header_from_state": None,
|
|
45
|
+
"headers_from_state": None,
|
|
44
46
|
"num_examples": 1,
|
|
45
47
|
"rollouts_per_example": 1,
|
|
46
48
|
"max_concurrent": 1,
|
|
@@ -131,3 +131,76 @@ async def test_streaming_write_failure_surfaces_to_state(monkeypatch):
|
|
|
131
131
|
|
|
132
132
|
assert isinstance(state["error"], StreamInterrupted)
|
|
133
133
|
assert "ConnectionResetError" in str(state["error"])
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
async def test_keepalive_emitted_during_idle(monkeypatch):
|
|
137
|
+
"""During the idle window (no chunks on chunk_queue) the handler must
|
|
138
|
+
emit SSE keepalive comments so upstream idle-timeouts don't fire."""
|
|
139
|
+
monkeypatch.setattr(interception_utils, "KEEPALIVE_INTERVAL_SECONDS", 0.05)
|
|
140
|
+
server = InterceptionServer(port=0)
|
|
141
|
+
state: dict = {}
|
|
142
|
+
server.register_rollout("r1", state=state)
|
|
143
|
+
|
|
144
|
+
writes: list[bytes] = []
|
|
145
|
+
|
|
146
|
+
async def fake_write(data: bytes) -> None:
|
|
147
|
+
writes.append(data)
|
|
148
|
+
|
|
149
|
+
fake_response = MagicMock()
|
|
150
|
+
fake_response.prepare = AsyncMock()
|
|
151
|
+
fake_response.write = AsyncMock(side_effect=fake_write)
|
|
152
|
+
fake_response.write_eof = AsyncMock()
|
|
153
|
+
monkeypatch.setattr(
|
|
154
|
+
interception_utils.web, "StreamResponse", lambda **_: fake_response
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
chunk_queue: asyncio.Queue = asyncio.Queue() # starts empty
|
|
158
|
+
response_future: asyncio.Future = asyncio.Future()
|
|
159
|
+
intercept = {
|
|
160
|
+
"chunk_queue": chunk_queue,
|
|
161
|
+
"response_future": response_future,
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
task = asyncio.create_task(
|
|
165
|
+
server._handle_streaming_response(MagicMock(), "r1", intercept)
|
|
166
|
+
)
|
|
167
|
+
await asyncio.sleep(0.2) # enough for a few keepalive cycles
|
|
168
|
+
|
|
169
|
+
# Close the loop cleanly: EOF sentinel + resolved future → handler returns.
|
|
170
|
+
response_future.set_result(None)
|
|
171
|
+
await chunk_queue.put(None)
|
|
172
|
+
await task
|
|
173
|
+
|
|
174
|
+
assert any(w == b": keepalive\n\n" for w in writes), (
|
|
175
|
+
f"expected at least one keepalive write, got writes={writes}"
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
async def test_keepalive_write_failure_surfaces_to_state(monkeypatch):
|
|
180
|
+
"""A failed keepalive write (upstream already cut the TCP connection)
|
|
181
|
+
must funnel into ``state["error"]`` with elapsed-time instrumentation."""
|
|
182
|
+
monkeypatch.setattr(interception_utils, "KEEPALIVE_INTERVAL_SECONDS", 0.05)
|
|
183
|
+
server = InterceptionServer(port=0)
|
|
184
|
+
state: dict = {}
|
|
185
|
+
server.register_rollout("r1", state=state)
|
|
186
|
+
|
|
187
|
+
fake_response = MagicMock()
|
|
188
|
+
fake_response.prepare = AsyncMock()
|
|
189
|
+
fake_response.write = AsyncMock(side_effect=ConnectionResetError("tunnel died"))
|
|
190
|
+
fake_response.write_eof = AsyncMock()
|
|
191
|
+
monkeypatch.setattr(
|
|
192
|
+
interception_utils.web, "StreamResponse", lambda **_: fake_response
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
chunk_queue: asyncio.Queue = asyncio.Queue() # never produces
|
|
196
|
+
intercept = {
|
|
197
|
+
"chunk_queue": chunk_queue,
|
|
198
|
+
"response_future": asyncio.Future(),
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
await server._handle_streaming_response(MagicMock(), "r1", intercept)
|
|
202
|
+
|
|
203
|
+
assert isinstance(state["error"], StreamInterrupted)
|
|
204
|
+
msg = str(state["error"])
|
|
205
|
+
assert "keepalive write failed" in msg
|
|
206
|
+
assert "ConnectionResetError" in msg
|
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
Newer and more experimental environment classes that may have some sharper edges + change more frequently.
|
|
4
4
|
|
|
5
|
+
## SandboxMixin
|
|
6
|
+
|
|
7
|
+
`SandboxMixin` works with both container and VM sandboxes. If your environment needs a VM, pass `CreateSandboxRequest(..., vm=True)` to `create_sandbox`. For a GPU VM, also set `gpu_count` and `gpu_type`. Everyday sandbox operations like file upload, file reads, background jobs, and cleanup work the same way. Port exposure and SSH are currently container-only.
|
|
8
|
+
|
|
5
9
|
## GymEnv
|
|
6
10
|
|
|
7
11
|
Universal runner for Gym-compatible environments. Wraps any environment that implements `reset(seed)` and `step(action)` methods (following the OpenAI Gym / Gymnasium API). Supports both old-style 4-tuple and new-style 5-tuple step returns.
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from verifiers.envs.experimental.composable.harnesses.rlm import (
|
|
2
|
+
DEFAULT_RLM_EXEC_TIMEOUT,
|
|
2
3
|
DEFAULT_RLM_MAX_TURNS,
|
|
4
|
+
DEFAULT_RLM_MAX_TURNS_IN_CONTEXT,
|
|
3
5
|
DEFAULT_RLM_REF,
|
|
4
6
|
DEFAULT_RLM_REPO_URL,
|
|
5
7
|
build_install_script as build_rlm_install_script,
|
|
@@ -16,6 +18,13 @@ from verifiers.envs.experimental.composable.harnesses.opencode import (
|
|
|
16
18
|
build_opencode_run_command,
|
|
17
19
|
opencode_harness,
|
|
18
20
|
)
|
|
21
|
+
from verifiers.envs.experimental.composable.harnesses.mini_swe_agent import (
|
|
22
|
+
MINI_SWE_AGENT_CONFIG,
|
|
23
|
+
MINI_SWE_AGENT_INSTALL_SCRIPT,
|
|
24
|
+
build_mini_swe_agent_install_script,
|
|
25
|
+
build_mini_swe_agent_run_command,
|
|
26
|
+
mini_swe_agent_harness,
|
|
27
|
+
)
|
|
19
28
|
|
|
20
29
|
__all__ = [
|
|
21
30
|
"rlm_harness",
|
|
@@ -24,6 +33,8 @@ __all__ = [
|
|
|
24
33
|
"DEFAULT_RLM_REF",
|
|
25
34
|
"DEFAULT_RLM_REPO_URL",
|
|
26
35
|
"DEFAULT_RLM_MAX_TURNS",
|
|
36
|
+
"DEFAULT_RLM_MAX_TURNS_IN_CONTEXT",
|
|
37
|
+
"DEFAULT_RLM_EXEC_TIMEOUT",
|
|
27
38
|
"opencode_harness",
|
|
28
39
|
"build_opencode_install_script",
|
|
29
40
|
"build_opencode_config",
|
|
@@ -32,4 +43,9 @@ __all__ = [
|
|
|
32
43
|
"DEFAULT_DISABLED_TOOLS",
|
|
33
44
|
"DEFAULT_RELEASE_SHA256",
|
|
34
45
|
"DEFAULT_SYSTEM_PROMPT",
|
|
46
|
+
"mini_swe_agent_harness",
|
|
47
|
+
"build_mini_swe_agent_install_script",
|
|
48
|
+
"build_mini_swe_agent_run_command",
|
|
49
|
+
"MINI_SWE_AGENT_INSTALL_SCRIPT",
|
|
50
|
+
"MINI_SWE_AGENT_CONFIG",
|
|
35
51
|
]
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""mini-SWE-agent harness configuration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import PurePosixPath
|
|
6
|
+
import shlex
|
|
7
|
+
|
|
8
|
+
DEFAULT_INSTALL_DIR = "/opt/mini-swe-agent"
|
|
9
|
+
DEFAULT_PREFIX_DIR = f"{DEFAULT_INSTALL_DIR}/prefix"
|
|
10
|
+
DEFAULT_SITE_PACKAGES_DIR = f"{DEFAULT_PREFIX_DIR}/site-packages"
|
|
11
|
+
DEFAULT_UV_SITE_PACKAGES_DIR = f"{DEFAULT_INSTALL_DIR}/uv-site-packages"
|
|
12
|
+
DEFAULT_MINI_BINARY = f"{DEFAULT_PREFIX_DIR}/bin/mini"
|
|
13
|
+
MINI_SWE_AGENT_CLI_PACKAGE = "mini-swe-agent"
|
|
14
|
+
MINI_SWE_AGENT_CLI_VERSION = "2.2.8"
|
|
15
|
+
MINI_SWE_AGENT_CLI_SHA256 = (
|
|
16
|
+
"694df4de1337e665e3cd82e99f93374f573bf52b8e7c362ac5d8045ad9f7c37c"
|
|
17
|
+
)
|
|
18
|
+
MINI_SWE_AGENT_PYTHON_VERSION = "3.11"
|
|
19
|
+
UV_PACKAGE_VERSION = "0.11.7"
|
|
20
|
+
DEFAULT_PACKAGE_VERSION = MINI_SWE_AGENT_CLI_VERSION
|
|
21
|
+
DEFAULT_PACKAGE_SHA256 = MINI_SWE_AGENT_CLI_SHA256
|
|
22
|
+
DEFAULT_INSTRUCTION_PATH = "/mini-swe-agent/prompt.txt"
|
|
23
|
+
DEFAULT_SYSTEM_PROMPT_PATH = "/mini-swe-agent/system.txt"
|
|
24
|
+
DEFAULT_LOG_DIR = "/logs/agent"
|
|
25
|
+
DEFAULT_LOG_PATH = f"{DEFAULT_LOG_DIR}/mini-swe-agent.log"
|
|
26
|
+
DEFAULT_TRAJECTORY_PATH = f"{DEFAULT_LOG_DIR}/mini-swe-agent.traj.json"
|
|
27
|
+
DEFAULT_AGENT_WORKDIR = "${AGENT_WORKDIR:-/app}"
|
|
28
|
+
DEFAULT_CONFIG_SPEC = "mini_textbased"
|
|
29
|
+
DEFAULT_MODEL_CLASS = "litellm_textbased"
|
|
30
|
+
DEFAULT_ENVIRONMENT_TIMEOUT = 120
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def build_mini_swe_agent_install_script(
    package_version: str = DEFAULT_PACKAGE_VERSION,
    package_sha256: str = DEFAULT_PACKAGE_SHA256,
    prefix_dir: str = DEFAULT_PREFIX_DIR,
    install_python: bool = True,
) -> str:
    """Build the shell script that installs mini-SWE-agent.

    The generated script performs, in order:

    1. Optionally (``install_python``) installs ``python3``/``pip`` through
       ``apt-get`` when either is missing.
    2. Recreates ``prefix_dir`` and the install/log directories.
    3. If the system ``python3`` is older than 3.10, installs ``uv`` into a
       private target directory and uses it to provision and locate a
       ``MINI_SWE_AGENT_PYTHON_VERSION`` interpreter.
    4. Downloads the pinned wheel with ``urllib`` and verifies it against
       ``package_sha256`` before anything is installed from it.
    5. Installs the wheel into ``<prefix_dir>/site-packages`` and records the
       chosen interpreter in ``<prefix_dir>/python``.
    6. Writes the ``<prefix_dir>/bin/mini`` wrapper that launches
       ``minisweagent.run.mini`` with that interpreter.

    Args:
        package_version: mini-swe-agent version used to build the wheel
            filename and download URL.
        package_sha256: Expected SHA-256 hex digest of the wheel.
        prefix_dir: Directory that receives ``bin/``, ``site-packages/`` and
            the ``python`` pointer file. It is removed and recreated.
        install_python: When true, prepend the apt-based python3/pip bootstrap.

    Returns:
        The script text (not wrapped in a shell invocation).
    """
    install_tools = ""
    if install_python:
        install_tools = """\
export DEBIAN_FRONTEND=noninteractive
if ! command -v python3 >/dev/null 2>&1 || ! python3 -m pip --version >/dev/null 2>&1; then
    apt-get update -qq
    apt-get install -y -qq python3 python3-pip ca-certificates
fi
"""

    quoted_prefix_dir = shlex.quote(prefix_dir)
    site_packages_dir = f"{prefix_dir}/site-packages"
    wheel_filename = f"mini_swe_agent-{package_version}-py3-none-any.whl"
    wheel_url = (
        f"https://files.pythonhosted.org/packages/py3/m/mini-swe-agent/{wheel_filename}"
    )
    quoted_site_packages_dir = shlex.quote(site_packages_dir)
    quoted_install_dir = shlex.quote(DEFAULT_INSTALL_DIR)
    quoted_uv_site_packages_dir = shlex.quote(DEFAULT_UV_SITE_PACKAGES_DIR)
    # Notes on the script below:
    # * The checksum line uses the canonical "<digest>  <file>" layout (two
    #   spaces) so that strict `sha256sum -c` implementations accept it.
    # * The wrapper heredoc is quoted ('EOF'), so its `$...` expansions are
    #   evaluated when the wrapper runs, not while installing.
    # * The wrapper appends the inherited PYTHONPATH only when it is non-empty;
    #   a dangling ":" would add an empty entry, which Python resolves to the
    #   current working directory.
    return f"""\
set -e
{install_tools}
rm -rf {quoted_prefix_dir}
mkdir -p {quoted_install_dir} {quoted_prefix_dir}/bin {quoted_site_packages_dir} {quoted_uv_site_packages_dir} {shlex.quote(DEFAULT_LOG_DIR)} /mini-swe-agent
export PIP_CONFIG_FILE=/dev/null
export PIP_INDEX_URL=https://pypi.org/simple
export PIP_BREAK_SYSTEM_PACKAGES=1
unset PIP_EXTRA_INDEX_URL
PYTHON_BIN="$(command -v python3)"
MINI_SWE_AGENT_PYTHON="$PYTHON_BIN"
if ! "$PYTHON_BIN" -c 'import sys; raise SystemExit(sys.version_info < (3, 10))'; then
    "$PYTHON_BIN" -m pip install --quiet --target {quoted_uv_site_packages_dir} uv=={UV_PACKAGE_VERSION}
    env PYTHONPATH={quoted_uv_site_packages_dir} "$PYTHON_BIN" -m uv python install {MINI_SWE_AGENT_PYTHON_VERSION}
    MINI_SWE_AGENT_PYTHON="$(env PYTHONPATH={quoted_uv_site_packages_dir} "$PYTHON_BIN" -m uv python find {MINI_SWE_AGENT_PYTHON_VERSION})"
fi
MINI_SWE_AGENT_WHEEL_DIR="$(mktemp -d)"
trap 'rm -rf "$MINI_SWE_AGENT_WHEEL_DIR"' EXIT
MINI_SWE_AGENT_WHEEL="$MINI_SWE_AGENT_WHEEL_DIR/{wheel_filename}"
MINI_SWE_AGENT_WHEEL_URL={shlex.quote(wheel_url)}
export MINI_SWE_AGENT_WHEEL MINI_SWE_AGENT_WHEEL_URL
"$PYTHON_BIN" -c 'import os, urllib.request; urllib.request.urlretrieve(os.environ["MINI_SWE_AGENT_WHEEL_URL"], os.environ["MINI_SWE_AGENT_WHEEL"])'
echo "{package_sha256}  $MINI_SWE_AGENT_WHEEL" | sha256sum -c -
if [ "$MINI_SWE_AGENT_PYTHON" = "$PYTHON_BIN" ]; then
    "$PYTHON_BIN" -m pip install --quiet --target {quoted_site_packages_dir} "$MINI_SWE_AGENT_WHEEL"
else
    env PYTHONPATH={quoted_uv_site_packages_dir} "$PYTHON_BIN" -m uv pip install --python "$MINI_SWE_AGENT_PYTHON" --target {quoted_site_packages_dir} "$MINI_SWE_AGENT_WHEEL"
fi
echo "$MINI_SWE_AGENT_PYTHON" > {quoted_prefix_dir}/python
cat > {quoted_prefix_dir}/bin/mini <<'EOF'
#!/usr/bin/env sh
export PYTHONPATH={quoted_site_packages_dir}"${{PYTHONPATH:+:$PYTHONPATH}}"
exec "$(cat {quoted_prefix_dir}/python)" -m minisweagent.run.mini "$@"
EOF
chmod +x {quoted_prefix_dir}/bin/mini
test -x {quoted_prefix_dir}/bin/mini
"""
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def build_mini_swe_agent_run_command(
    agent_workdir: str = DEFAULT_AGENT_WORKDIR,
    instruction_path: str = DEFAULT_INSTRUCTION_PATH,
    system_prompt_path: str = DEFAULT_SYSTEM_PROMPT_PATH,
    log_path: str = DEFAULT_LOG_PATH,
    trajectory_path: str = DEFAULT_TRAJECTORY_PATH,
    mini_binary: str = DEFAULT_MINI_BINARY,
    config_spec: str = DEFAULT_CONFIG_SPEC,
    model_class: str = DEFAULT_MODEL_CLASS,
    environment_timeout: int = DEFAULT_ENVIRONMENT_TIMEOUT,
    extra_config_specs: list[str] | None = None,
) -> str:
    """Build the shell command that configures and runs mini-SWE-agent.

    Config specs layer the cwd, timeout, LiteLLM model class, optional system
    prompt template, and any caller-provided overrides before writing the
    trajectory and teeing logs.

    Args:
        agent_workdir: Directory the agent runs in. The default is kept as a
            shell expansion so ``AGENT_WORKDIR`` can override it at run time;
            any other value is shell-quoted literally.
        instruction_path: File whose contents become the ``--task`` argument.
        system_prompt_path: File whose contents, when the file is non-empty,
            are passed as ``agent.system_template``.
        log_path: File the combined stdout/stderr is appended to via ``tee``.
        trajectory_path: File passed to ``--output``.
        mini_binary: The mini executable to invoke.
        config_spec: Base config spec, passed as the first ``-c`` argument.
        model_class: Value for the ``model.model_class`` override.
        environment_timeout: Value for the ``environment.timeout`` override.
        extra_config_specs: Additional ``-c`` specs appended after the
            built-in ones.

    Returns:
        A single ``bash -lc '<script>'`` command string. At run time the
        script reads ``OPENAI_MODEL``, ``OPENAI_API_KEY`` (defaulting to
        ``intercepted``) and ``AGENT_TIMEOUT_SECONDS`` (defaulting to 3600)
        from the environment.
    """
    # Keep the default workdir shell-expanded for env-level overrides, mirroring
    # the other harnesses.
    if agent_workdir == DEFAULT_AGENT_WORKDIR:
        workdir_assignment = f"MINI_SWE_AGENT_WORKDIR={DEFAULT_AGENT_WORKDIR}"
    else:
        workdir_assignment = f"MINI_SWE_AGENT_WORKDIR={shlex.quote(agent_workdir)}"

    # Config specs are the mini CLI's native override format; use them for
    # timeout, model class and caller overrides here. The cwd and optional
    # system prompt specs are appended inside the script because they depend
    # on run-time state.
    config_specs = [
        config_spec,
        "agent.cost_limit=0",
        f"environment.timeout={environment_timeout}",
        f"model.model_class={model_class}",
        "model.cost_tracking=ignore_errors",
        "model.model_kwargs.custom_llm_provider=openai",
        *(extra_config_specs or []),
    ]
    # Quote every spec as one whole shell word so none of them can split into
    # several bash array elements.
    config_args = " ".join(f"-c {shlex.quote(spec)}" for spec in config_specs)

    log_dir = str(PurePosixPath(log_path).parent)
    trajectory_dir = str(PurePosixPath(trajectory_path).parent)
    # PYTHONPATH: append the inherited value only when it is non-empty. A
    # trailing ":" would create an empty entry, which Python resolves to the
    # current working directory (the agent's workdir after the `cd` below).
    script = f"""\
set -eo pipefail
export PATH={shlex.quote(DEFAULT_PREFIX_DIR)}/bin:"$PATH"
export PYTHONPATH={shlex.quote(DEFAULT_SITE_PACKAGES_DIR)}"${{PYTHONPATH:+:$PYTHONPATH}}"
export MSWEA_CONFIGURED=true
export MSWEA_SILENT_STARTUP=true
export MSWEA_GLOBAL_CONFIG_DIR=/tmp/mini-swe-agent-config
export OPENAI_API_KEY="${{OPENAI_API_KEY:-intercepted}}"

{workdir_assignment}
mkdir -p {shlex.quote(log_dir)} {shlex.quote(trajectory_dir)} "$MINI_SWE_AGENT_WORKDIR" "$MSWEA_GLOBAL_CONFIG_DIR"

MINI_SWE_AGENT_TASK="$(cat {shlex.quote(instruction_path)})"
CONFIG_ARGS=({config_args})
CONFIG_ARGS+=(-c "environment.cwd=$MINI_SWE_AGENT_WORKDIR")
if [ -s {shlex.quote(system_prompt_path)} ]; then
    CONFIG_ARGS+=(-c "agent.system_template=$(cat {shlex.quote(system_prompt_path)})")
fi

cd "$MINI_SWE_AGENT_WORKDIR"
timeout --kill-after=30s "${{AGENT_TIMEOUT_SECONDS:-3600}}" {shlex.quote(mini_binary)} \\
    --model "$OPENAI_MODEL" \\
    --task "$MINI_SWE_AGENT_TASK" \\
    --output {shlex.quote(trajectory_path)} \\
    --exit-immediately \\
    --yolo \\
    "${{CONFIG_ARGS[@]}}" 2>&1 | tee -a {shlex.quote(log_path)}
"""
    return f"bash -lc {shlex.quote(script)}"
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# Install script rendered once, at import time, with every builder default.
MINI_SWE_AGENT_INSTALL_SCRIPT = build_mini_swe_agent_install_script()
# Bundle of the default install script and the pinned CLI package metadata.
MINI_SWE_AGENT_CONFIG = dict(
    install_script=MINI_SWE_AGENT_INSTALL_SCRIPT,
    cli_package=MINI_SWE_AGENT_CLI_PACKAGE,
    cli_version=MINI_SWE_AGENT_CLI_VERSION,
    cli_sha256=MINI_SWE_AGENT_CLI_SHA256,
)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def mini_swe_agent_harness(
    system_prompt: str | None = None,
    task_system_prompt: str | None = None,
    agent_workdir: str = DEFAULT_AGENT_WORKDIR,
    instruction_path: str = DEFAULT_INSTRUCTION_PATH,
    system_prompt_path: str = DEFAULT_SYSTEM_PROMPT_PATH,
    log_path: str = DEFAULT_LOG_PATH,
    trajectory_path: str = DEFAULT_TRAJECTORY_PATH,
    package_version: str = DEFAULT_PACKAGE_VERSION,
    package_sha256: str = DEFAULT_PACKAGE_SHA256,
    config_spec: str = DEFAULT_CONFIG_SPEC,
    model_class: str = DEFAULT_MODEL_CLASS,
    environment_timeout: int = DEFAULT_ENVIRONMENT_TIMEOUT,
    extra_config_specs: list[str] | None = None,
):
    """Create a Harness configured for mini-SWE-agent.

    ``task_system_prompt``, when non-empty, is appended to ``system_prompt``
    on a new line, or used on its own if no ``system_prompt`` was given. The
    remaining keyword arguments are forwarded to the install-script and
    run-command builders, and the path arguments are also passed to the
    ``Harness`` itself.
    """
    from verifiers.envs.experimental.composable import Harness

    if task_system_prompt:
        system_prompt = (
            system_prompt + "\n" + task_system_prompt
            if system_prompt
            else task_system_prompt
        )

    installer = build_mini_swe_agent_install_script(
        package_version=package_version,
        package_sha256=package_sha256,
    )
    launcher = build_mini_swe_agent_run_command(
        agent_workdir=agent_workdir,
        instruction_path=instruction_path,
        system_prompt_path=system_prompt_path,
        log_path=log_path,
        trajectory_path=trajectory_path,
        config_spec=config_spec,
        model_class=model_class,
        environment_timeout=environment_timeout,
        extra_config_specs=extra_config_specs,
    )

    # The system prompt is passed through ComposableEnv as a file and injected
    # into mini's agent.system_template at runtime.
    return Harness(
        install_script=installer,
        run_command=launcher,
        system_prompt=system_prompt,
        instruction_path=instruction_path,
        system_prompt_path=system_prompt_path,
        log_path=log_path,
    )
|
|
@@ -15,6 +15,8 @@ from verifiers.envs.experimental.utils.git_checkout_cache import (
|
|
|
15
15
|
DEFAULT_RLM_REPO_URL = "github.com/PrimeIntellect-ai/rlm.git"
|
|
16
16
|
DEFAULT_RLM_REF = "main"
|
|
17
17
|
DEFAULT_RLM_MAX_TURNS = 100
|
|
18
|
+
DEFAULT_RLM_MAX_TURNS_IN_CONTEXT = -1
|
|
19
|
+
DEFAULT_RLM_EXEC_TIMEOUT = 300
|
|
18
20
|
DEFAULT_APPEND_TO_SYSTEM_PROMPT_PATH = "/task/append_to_system_prompt.txt"
|
|
19
21
|
DEFAULT_RLM_CHECKOUT_PATH = "/tmp/rlm-checkout"
|
|
20
22
|
DEFAULT_RLM_CHECKOUT_UPLOAD_NAME = "rlm_checkout"
|
|
@@ -98,6 +100,9 @@ def rlm_harness(
|
|
|
98
100
|
instruction_path: str = "/task/instruction.md",
|
|
99
101
|
rlm_repo_url: str = DEFAULT_RLM_REPO_URL,
|
|
100
102
|
rlm_ref: str = DEFAULT_RLM_REF,
|
|
103
|
+
rlm_max_turns: int = DEFAULT_RLM_MAX_TURNS,
|
|
104
|
+
rlm_max_turns_in_context: int = DEFAULT_RLM_MAX_TURNS_IN_CONTEXT,
|
|
105
|
+
rlm_exec_timeout: int = DEFAULT_RLM_EXEC_TIMEOUT,
|
|
101
106
|
append_to_system_prompt: str | None = None,
|
|
102
107
|
local_checkout: str | Path | None = None,
|
|
103
108
|
gh_token: str | None = None,
|
|
@@ -106,13 +111,20 @@ def rlm_harness(
|
|
|
106
111
|
) -> Harness:
|
|
107
112
|
"""Build an RLM harness.
|
|
108
113
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
``
|
|
112
|
-
``
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
114
|
+
The harness is the single source of truth for every ``RLM_*`` sandbox
|
|
115
|
+
env var the RLM subprocess reads. Kwargs map 1:1 onto env vars written
|
|
116
|
+
to ``Harness.environment_vars`` and merged into the sandbox by
|
|
117
|
+
``ComposableEnv`` (harness-wins):
|
|
118
|
+
|
|
119
|
+
- ``rlm_tools`` → ``RLM_TOOLS`` (also drives ``Harness.tool_names`` so
|
|
120
|
+
``ToolMonitorRubric`` tracks exactly the active tools)
|
|
121
|
+
- ``rlm_max_turns`` → ``RLM_MAX_TURNS``
|
|
122
|
+
- ``rlm_max_turns_in_context`` → ``RLM_MAX_TURNS_IN_CONTEXT``
|
|
123
|
+
- ``rlm_exec_timeout`` → ``RLM_EXEC_TIMEOUT``
|
|
124
|
+
|
|
125
|
+
Callers do not need to — and should not — add these keys to
|
|
126
|
+
``ComposableEnv(environment_vars=...)`` themselves; pass the kwargs
|
|
127
|
+
here and the harness owns the env var plumbing.
|
|
116
128
|
|
|
117
129
|
``allow_git`` defaults to False, mirroring opencode's bash tool. When
|
|
118
130
|
False, a ``/usr/local/bin/git`` shim is uploaded that refuses on any
|
|
@@ -163,7 +175,12 @@ def rlm_harness(
|
|
|
163
175
|
metrics_key="metrics",
|
|
164
176
|
metrics_prefix="rlm_",
|
|
165
177
|
tool_names=tool_names,
|
|
166
|
-
environment_vars={
|
|
178
|
+
environment_vars={
|
|
179
|
+
"RLM_TOOLS": ",".join(tool_names),
|
|
180
|
+
"RLM_MAX_TURNS": str(rlm_max_turns),
|
|
181
|
+
"RLM_MAX_TURNS_IN_CONTEXT": str(rlm_max_turns_in_context),
|
|
182
|
+
"RLM_EXEC_TIMEOUT": str(rlm_exec_timeout),
|
|
183
|
+
},
|
|
167
184
|
post_install_uploads=post_install_uploads,
|
|
168
185
|
post_install_script=post_install_script,
|
|
169
186
|
)
|
{verifiers-0.1.13.dev4 → verifiers-0.1.13.dev6}/verifiers/envs/experimental/composable/task.py
RENAMED
|
@@ -32,7 +32,7 @@ from dataclasses import dataclass
|
|
|
32
32
|
from importlib.abc import Traversable
|
|
33
33
|
from pathlib import Path
|
|
34
34
|
from types import ModuleType
|
|
35
|
-
from typing import Any, Callable
|
|
35
|
+
from typing import Any, Callable, Self
|
|
36
36
|
|
|
37
37
|
from verifiers.envs.experimental.composable._filter import _resolve_filter_fn
|
|
38
38
|
from verifiers.types import Messages, State
|
|
@@ -279,13 +279,13 @@ class TaskSet:
|
|
|
279
279
|
|
|
280
280
|
# -- Combinators ---------------------------------------------------------
|
|
281
281
|
|
|
282
|
-
def filter(self, predicate: Callable[[dict], bool]) ->
|
|
282
|
+
def filter(self, predicate: Callable[[dict], bool]) -> Self:
|
|
283
283
|
clone = object.__new__(type(self))
|
|
284
284
|
clone.__dict__.update(self.__dict__)
|
|
285
285
|
clone._dataset = self._dataset.filter(predicate)
|
|
286
286
|
return clone
|
|
287
287
|
|
|
288
|
-
def take(self, n: int) ->
|
|
288
|
+
def take(self, n: int) -> Self:
|
|
289
289
|
clone = object.__new__(type(self))
|
|
290
290
|
clone.__dict__.update(self.__dict__)
|
|
291
291
|
clone._dataset = self._dataset.select(range(min(n, len(self._dataset))))
|
|
@@ -142,6 +142,47 @@ def build_extra_headers(raw: dict[str, Any]) -> dict[str, str]:
|
|
|
142
142
|
return {**eval_headers_table, **eval_headers_from_list}
|
|
143
143
|
|
|
144
144
|
|
|
145
|
+
def build_extra_headers_from_state(raw: dict[str, Any]) -> dict[str, str]:
    """Build the header-name → state-key map for `ClientConfig.extra_headers_from_state`.

    Two optional inputs are read from ``raw``:

    - ``headers_from_state``: a TOML table such as
      ``headers_from_state = { "X-Session-ID" = "trajectory_id" }``.
    - ``header_from_state``: a repeatable list of ``"Name: state_key"``
      strings, e.g. ``--header-from-state "X-Session-ID: trajectory_id"``.

    Entries from the list override table entries with the same header name.

    Raises:
        ValueError: If ``header_from_state`` is not a list, or an entry is not
            a string, lacks a ``:``, or has an empty name or state key.
    """
    table_value = raw.get("headers_from_state")
    table: dict[str, str] = (
        {} if table_value is None else _validate_extra_headers_value(table_value)
    )

    entries = raw.get("header_from_state")
    if entries is None:
        entries = []
    elif not isinstance(entries, list):
        raise ValueError(
            "'header_from_state' must be a list of 'Name: state_key' strings"
        )

    from_list: dict[str, str] = {}
    for entry in entries:
        if not isinstance(entry, str):
            raise ValueError(
                f"Each 'header_from_state' entry must be a string 'Name: state_key', got: {entry!r}"
            )
        # Split on the first colon only, so state keys may contain ':'.
        name, colon, state_key = entry.partition(":")
        if not colon:
            raise ValueError(
                f"--header-from-state must be 'Name: state_key', got: {entry!r}"
            )
        name = name.strip()
        state_key = state_key.strip()
        if not name:
            raise ValueError("--header-from-state name cannot be empty")
        if not state_key:
            raise ValueError("--header-from-state state_key cannot be empty")
        from_list[name] = state_key

    return {**table, **from_list}
|
|
184
|
+
|
|
185
|
+
|
|
145
186
|
def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
|
|
146
187
|
"""Get eval config defaults from the environment module's pyproject.toml.
|
|
147
188
|
|
|
@@ -279,6 +320,16 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
279
320
|
default=None,
|
|
280
321
|
help="Extra HTTP header to pass to inference API. 'Name: Value'. Repeatable.",
|
|
281
322
|
)
|
|
323
|
+
parser.add_argument(
|
|
324
|
+
"--header-from-state",
|
|
325
|
+
action="append",
|
|
326
|
+
default=None,
|
|
327
|
+
help=(
|
|
328
|
+
"Per-request HTTP header whose value is read from the rollout state. "
|
|
329
|
+
"'Name: state_key' (e.g. 'X-Session-ID: trajectory_id'). Repeatable. "
|
|
330
|
+
"Defaults to X-Session-ID=example_id if unset."
|
|
331
|
+
),
|
|
332
|
+
)
|
|
282
333
|
parser.add_argument(
|
|
283
334
|
"--num-examples",
|
|
284
335
|
"-n",
|
|
@@ -639,6 +690,12 @@ def main(argv: list[str] | None = None):
|
|
|
639
690
|
)
|
|
640
691
|
# Build headers: registry < [[eval]] headers table < header list / --header
|
|
641
692
|
eval_headers_merged = build_extra_headers(raw)
|
|
693
|
+
# Default X-Session-ID → example_id for sticky DP-aware routing;
|
|
694
|
+
# user-supplied headers_from_state / --header-from-state override.
|
|
695
|
+
eval_headers_from_state = {
|
|
696
|
+
"X-Session-ID": "example_id",
|
|
697
|
+
**build_extra_headers_from_state(raw),
|
|
698
|
+
}
|
|
642
699
|
|
|
643
700
|
registry_headers_base: dict[str, str] = {}
|
|
644
701
|
if endpoint_group is not None:
|
|
@@ -683,7 +740,7 @@ def main(argv: list[str] | None = None):
|
|
|
683
740
|
api_base_url=primary_api_base_url,
|
|
684
741
|
endpoint_configs=endpoint_configs,
|
|
685
742
|
extra_headers=merged_headers,
|
|
686
|
-
extra_headers_from_state=
|
|
743
|
+
extra_headers_from_state=eval_headers_from_state,
|
|
687
744
|
)
|
|
688
745
|
|
|
689
746
|
# Backward-compatible TOML field: resume_path
|