verifiers 0.1.12.dev6__tar.gz → 0.1.13.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/PKG-INFO +3 -3
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/README.md +1 -1
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/pyproject.toml +1 -1
- verifiers-0.1.13.dev1/tests/test_composable_env.py +592 -0
- verifiers-0.1.13.dev1/tests/test_nemorl_client.py +219 -0
- verifiers-0.1.13.dev1/tests/test_rlm_composable_env.py +262 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/__init__.py +1 -1
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/clients/__init__.py +6 -0
- verifiers-0.1.13.dev1/verifiers/clients/nemorl_chat_completions_client.py +87 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/clients/openai_chat_completions_token_client.py +39 -4
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/cli_agent_env.py +7 -2
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/README.md +23 -1
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/__init__.py +2 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/composable_env.py +143 -28
- verifiers-0.1.13.dev1/verifiers/envs/experimental/composable/harness.py +100 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/harnesses/__init__.py +2 -0
- verifiers-0.1.13.dev1/verifiers/envs/experimental/composable/harnesses/rlm.py +82 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/task.py +74 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/__init__.py +2 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +4 -1
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +2 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +4 -1
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +6 -1
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +6 -1
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +14 -1
- verifiers-0.1.13.dev1/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +384 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +15 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/eval.py +1 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/types.py +1 -0
- verifiers-0.1.12.dev6/tests/test_composable_env.py +0 -260
- verifiers-0.1.12.dev6/verifiers/envs/experimental/composable/harness.py +0 -58
- verifiers-0.1.12.dev6/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -50
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/.gitignore +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/LICENSE +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/AGENTS.md +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/README.md +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/conftest.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_build_script.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_client_config.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_env_group.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_env_server.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_environment.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_envs.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_eval_cli.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_imports.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_logging.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_parser.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_rlm_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_rubric.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/clients/openai_chat_completions_client.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/decorators.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/environment.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/harbor_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/rlm_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/errors.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/prime_rl.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/eval_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/interception_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/save_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.12.dev6 → verifiers-0.1.13.dev1}/verifiers/utils/version_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.12.dev6
|
|
3
|
+
Version: 0.1.13.dev1
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -34,7 +34,7 @@ Requires-Dist: nest-asyncio>=1.6.0
|
|
|
34
34
|
Requires-Dist: numpy
|
|
35
35
|
Requires-Dist: openai-agents>=0.0.7
|
|
36
36
|
Requires-Dist: openai>=1.108.1
|
|
37
|
-
Requires-Dist: prime-sandboxes>=0.2.
|
|
37
|
+
Requires-Dist: prime-sandboxes>=0.2.20
|
|
38
38
|
Requires-Dist: prime-tunnel>=0.1.6
|
|
39
39
|
Requires-Dist: pydantic>=2.11.9
|
|
40
40
|
Requires-Dist: pyzmq>=27.1.0
|
|
@@ -107,7 +107,7 @@ Verifiers: Environments for LLM Reinforcement Learning
|
|
|
107
107
|
|
|
108
108
|
## News & Updates
|
|
109
109
|
|
|
110
|
-
- [
|
|
110
|
+
- [04/17/26] v0.1.12 is released, featuring a new composable Task/Agent/Environment architecture, upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
|
|
111
111
|
- [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
|
|
112
112
|
- [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
|
|
113
113
|
- [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
|
|
@@ -34,7 +34,7 @@ Verifiers: Environments for LLM Reinforcement Learning
|
|
|
34
34
|
|
|
35
35
|
## News & Updates
|
|
36
36
|
|
|
37
|
-
- [
|
|
37
|
+
- [04/17/26] v0.1.12 is released, featuring a new composable Task/Agent/Environment architecture, upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
|
|
38
38
|
- [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
|
|
39
39
|
- [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
|
|
40
40
|
- [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
|
|
@@ -0,0 +1,592 @@
|
|
|
1
|
+
"""Tests for the composable architecture: Task, TaskSet, SandboxTaskSet, SandboxSpec."""
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import json
|
|
5
|
+
from types import SimpleNamespace
|
|
6
|
+
from unittest.mock import AsyncMock, call
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
|
|
10
|
+
import verifiers as vf
|
|
11
|
+
from verifiers.envs.experimental.composable import (
|
|
12
|
+
ComposableEnv,
|
|
13
|
+
Harness,
|
|
14
|
+
SandboxSpec,
|
|
15
|
+
SandboxTaskSet,
|
|
16
|
+
Task,
|
|
17
|
+
TaskSet,
|
|
18
|
+
discover_sibling_dir,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ── Mock Rubrics ──────────────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class MockSandboxRubric(vf.Rubric):
|
|
26
|
+
def __init__(self, **kwargs):
|
|
27
|
+
super().__init__(**kwargs)
|
|
28
|
+
self.add_reward_func(self.solved)
|
|
29
|
+
|
|
30
|
+
async def solved(self, state, **kwargs) -> float:
|
|
31
|
+
return 1.0 if state.get("test_output") == "PASS" else 0.0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MockMathRubric(vf.Rubric):
|
|
35
|
+
def __init__(self, **kwargs):
|
|
36
|
+
super().__init__(**kwargs)
|
|
37
|
+
self.add_reward_func(self.correct)
|
|
38
|
+
|
|
39
|
+
async def correct(self, state, **kwargs) -> float:
|
|
40
|
+
return 1.0 if state.get("info", {}).get("id") == 0 else 0.0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ── Mock TaskSets ───────────────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class MockSandboxTaskSet(SandboxTaskSet):
|
|
47
|
+
"""SandboxTaskSet for testing."""
|
|
48
|
+
|
|
49
|
+
def get_instruction(self, info):
|
|
50
|
+
return f"Fix bug #{info.get('id', 0)}"
|
|
51
|
+
|
|
52
|
+
def get_sandbox_spec(self, info):
|
|
53
|
+
return SandboxSpec(image="python:3.11-slim", cpu_cores=2, memory_gb=2)
|
|
54
|
+
|
|
55
|
+
def get_rubric(self):
|
|
56
|
+
return MockSandboxRubric()
|
|
57
|
+
|
|
58
|
+
def get_workdir(self, info):
|
|
59
|
+
return "/testbed"
|
|
60
|
+
|
|
61
|
+
def get_env_vars(self):
|
|
62
|
+
return {"FOO": "bar"}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class MockTaskSet(TaskSet):
|
|
66
|
+
"""Plain TaskSet (no sandbox) for testing."""
|
|
67
|
+
|
|
68
|
+
def get_instruction(self, info):
|
|
69
|
+
return info.get("question", "")
|
|
70
|
+
|
|
71
|
+
def get_rubric(self):
|
|
72
|
+
return MockMathRubric()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _make_dataset(n=3):
|
|
76
|
+
from datasets import Dataset
|
|
77
|
+
|
|
78
|
+
return Dataset.from_dict(
|
|
79
|
+
{
|
|
80
|
+
"info": [{"id": i, "question": f"q{i}"} for i in range(n)],
|
|
81
|
+
"answer": ["" for _ in range(n)],
|
|
82
|
+
}
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ── SandboxSpec ─────────────────────────────────────────────────────────
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_sandbox_spec_defaults():
|
|
90
|
+
spec = SandboxSpec()
|
|
91
|
+
assert spec.image == "python:3.11-slim"
|
|
92
|
+
assert spec.cpu_cores == 4
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_sandbox_spec_custom():
|
|
96
|
+
spec = SandboxSpec(image="lean-tactic:v4.27", gpu_count=1)
|
|
97
|
+
assert spec.image == "lean-tactic:v4.27"
|
|
98
|
+
assert spec.gpu_count == 1
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ── Task from SandboxTaskSet ───────────────────────────────────────────
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def test_task_sandbox_spec():
|
|
105
|
+
ts = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
|
|
106
|
+
task = ts[0]
|
|
107
|
+
assert isinstance(task, Task)
|
|
108
|
+
assert task.sandbox_spec is not None
|
|
109
|
+
assert task.sandbox_spec.image == "python:3.11-slim"
|
|
110
|
+
assert task.sandbox_spec.cpu_cores == 2
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def test_task_image():
|
|
114
|
+
ts = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
|
|
115
|
+
task = ts[0]
|
|
116
|
+
assert task.image == "python:3.11-slim"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_task_workdir():
|
|
120
|
+
ts = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
|
|
121
|
+
task = ts[0]
|
|
122
|
+
assert task.workdir == "/testbed"
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def test_task_repr_sandbox():
|
|
126
|
+
ts = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
|
|
127
|
+
task = ts[0]
|
|
128
|
+
assert "python:3.11-slim" in repr(task)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ── Task from plain TaskSet ────────────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_task_no_sandbox():
|
|
135
|
+
ts = MockTaskSet(dataset=_make_dataset(), name="math")
|
|
136
|
+
task = ts[0]
|
|
137
|
+
assert task.sandbox_spec is None
|
|
138
|
+
assert task.image is None
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_task_repr_no_sandbox():
|
|
142
|
+
ts = MockTaskSet(dataset=_make_dataset(), name="math")
|
|
143
|
+
task = ts[0]
|
|
144
|
+
assert "no sandbox" in repr(task)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ── TaskSet ─────────────────────────────────────────────────────────────
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def test_taskset_isinstance():
|
|
151
|
+
ts = MockTaskSet(dataset=_make_dataset(), name="math")
|
|
152
|
+
assert not isinstance(ts, SandboxTaskSet)
|
|
153
|
+
|
|
154
|
+
ts2 = MockSandboxTaskSet(dataset=_make_dataset(), name="swe")
|
|
155
|
+
assert isinstance(ts2, SandboxTaskSet)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def test_taskset_len():
|
|
159
|
+
ts = MockTaskSet(dataset=_make_dataset(5), name="test")
|
|
160
|
+
assert len(ts) == 5
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def test_taskset_iter():
|
|
164
|
+
ts = MockTaskSet(dataset=_make_dataset(3), name="test")
|
|
165
|
+
tasks = list(ts)
|
|
166
|
+
assert len(tasks) == 3
|
|
167
|
+
assert all(isinstance(t, Task) for t in tasks)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def test_taskset_filter():
|
|
171
|
+
ts = MockSandboxTaskSet(dataset=_make_dataset(5), name="test")
|
|
172
|
+
filtered = ts.filter(lambda ex: ex["info"]["id"] < 3)
|
|
173
|
+
assert len(filtered) == 3
|
|
174
|
+
assert isinstance(filtered, MockSandboxTaskSet)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def test_taskset_take():
|
|
178
|
+
ts = MockSandboxTaskSet(dataset=_make_dataset(5), name="test")
|
|
179
|
+
taken = ts.take(2)
|
|
180
|
+
assert len(taken) == 2
|
|
181
|
+
assert isinstance(taken, MockSandboxTaskSet)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def test_taskset_repr():
|
|
185
|
+
ts = MockTaskSet(dataset=_make_dataset(), name="mytest")
|
|
186
|
+
assert "mytest" in repr(ts)
|
|
187
|
+
assert "3" in repr(ts)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
@pytest.mark.asyncio
|
|
191
|
+
async def test_composable_env_exports_task_workdir():
|
|
192
|
+
taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
|
|
193
|
+
env = ComposableEnv(
|
|
194
|
+
taskset=taskset,
|
|
195
|
+
harness=Harness(run_command="true"),
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
env_vars = await env.build_env_vars(
|
|
199
|
+
{
|
|
200
|
+
"info": {"id": 0},
|
|
201
|
+
"interception_base_url": "https://test.trycloudflare.com/v1",
|
|
202
|
+
}
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
assert env_vars["AGENT_WORKDIR"] == "/testbed"
|
|
206
|
+
assert env_vars["FOO"] == "bar"
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
@pytest.mark.asyncio
|
|
210
|
+
async def test_composable_env_quotes_paths_in_mkdir_command():
|
|
211
|
+
taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
|
|
212
|
+
env = ComposableEnv(
|
|
213
|
+
taskset=taskset,
|
|
214
|
+
harness=Harness(
|
|
215
|
+
run_command="true",
|
|
216
|
+
instruction_path="/tmp/with space/prompt.txt",
|
|
217
|
+
system_prompt="system",
|
|
218
|
+
system_prompt_path="/tmp/other path/system.txt",
|
|
219
|
+
),
|
|
220
|
+
)
|
|
221
|
+
env.sandbox_client = SimpleNamespace(
|
|
222
|
+
execute_command=AsyncMock(),
|
|
223
|
+
teardown=lambda: None,
|
|
224
|
+
)
|
|
225
|
+
env.taskset.setup = AsyncMock()
|
|
226
|
+
env.upload_content = AsyncMock()
|
|
227
|
+
|
|
228
|
+
await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
|
|
229
|
+
|
|
230
|
+
env.sandbox_client.execute_command.assert_awaited_once_with(
|
|
231
|
+
"sbx",
|
|
232
|
+
"mkdir -p '/tmp/other path' '/tmp/with space'",
|
|
233
|
+
timeout=10,
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
@pytest.mark.asyncio
async def test_composable_env_quotes_log_path_when_collecting_logs():
    """A log path containing a space must be shell-quoted when logs are read.

    Also checks that the collected output lands in ``state["agent_logs"]``
    without the trailing newline returned by the stubbed command.
    """
    env = ComposableEnv(
        taskset=MockSandboxTaskSet(dataset=_make_dataset(), name="test"),
        harness=Harness(
            run_command="true",
            log_path="/tmp/log dir/agent.log",
        ),
    )
    cat_result = SimpleNamespace(stdout="agent log\n", stderr="", exit_code=0)
    env.sandbox_client = SimpleNamespace(
        execute_command=AsyncMock(return_value=cat_result),
        teardown=lambda: None,
    )

    state = {"sandbox_id": "sbx", "timing": {"total_ms": 0}}

    await env.post_rollout(state)

    expected_command = "cat '/tmp/log dir/agent.log' 2>/dev/null || echo '<no logs>'"
    env.sandbox_client.execute_command.assert_awaited_once_with(
        "sbx",
        expected_command,
        working_dir=None,
    )
    assert state["agent_logs"] == "agent log"
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
# ── install_env ──────────────────────────────────────────────────────────
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@pytest.mark.asyncio
async def test_composable_env_install_env_passes_to_execute():
    """``install_env`` given to the env must be forwarded as ``env=`` to the install command.

    The install command is taken to be the last sandbox command awaited
    during ``post_sandbox_setup``.
    """
    harness = Harness(
        run_command="true",
        install_script="install-agent",
        instruction_path="/tmp/prompt.txt",
    )
    env = ComposableEnv(
        taskset=MockSandboxTaskSet(dataset=_make_dataset(), name="test"),
        harness=harness,
        install_env={"GH_TOKEN": "secret"},
    )
    ok_result = SimpleNamespace(stdout="", stderr="", exit_code=0)
    env.sandbox_client = SimpleNamespace(
        execute_command=AsyncMock(return_value=ok_result),
        teardown=lambda: None,
    )
    env.taskset.setup = AsyncMock()
    env.upload_content = AsyncMock()

    await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})

    awaited_calls = env.sandbox_client.execute_command.await_args_list
    expected_install = call(
        "sbx", "install-agent", timeout=300, env={"GH_TOKEN": "secret"}
    )
    assert awaited_calls[-1] == expected_install
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
@pytest.mark.asyncio
async def test_composable_env_install_env_none_by_default():
    """Without ``install_env`` the install command must be awaited with no ``env=`` keyword."""
    harness = Harness(
        run_command="true",
        install_script="install-agent",
        instruction_path="/tmp/prompt.txt",
    )
    env = ComposableEnv(
        taskset=MockSandboxTaskSet(dataset=_make_dataset(), name="test"),
        harness=harness,
    )
    ok_result = SimpleNamespace(stdout="", stderr="", exit_code=0)
    env.sandbox_client = SimpleNamespace(
        execute_command=AsyncMock(return_value=ok_result),
        teardown=lambda: None,
    )
    env.taskset.setup = AsyncMock()
    env.upload_content = AsyncMock()

    await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})

    # The install step is expected to be the final sandbox command.
    awaited_calls = env.sandbox_client.execute_command.await_args_list
    assert awaited_calls[-1] == call("sbx", "install-agent", timeout=300)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
# ── get_upload_dirs ──────────────────────────────────────────────────────
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _make_temp_taskset_package(tmp_path, monkeypatch, *, with_skills: bool):
    """Create and import a throwaway package under ``tmp_path``.

    The package is named ``fixture_<tmp_path.name>`` (dashes replaced by
    underscores) and contains an empty ``__init__.py`` plus a
    ``taskset_mod.py`` module. When ``with_skills`` is true, a
    ``skills/demo`` directory holding ``SKILL.md`` and ``pyproject.toml``
    is created next to the module.

    Args:
        tmp_path: Directory in which the package is created.
        monkeypatch: Object providing ``syspath_prepend``; used to make
            ``tmp_path`` importable.
        with_skills: Whether to create the sibling ``skills/demo`` directory.

    Returns:
        A ``(module, package_name)`` tuple where ``module`` is the imported
        ``<package_name>.taskset_mod``.
    """
    package_name = "fixture_" + tmp_path.name.replace("-", "_")
    package_root = tmp_path / package_name
    package_root.mkdir()

    # Mapping of file path -> content, written in insertion order below.
    file_contents = {
        package_root / "__init__.py": "",
        package_root / "taskset_mod.py": "MARKER = 1\n",
    }
    if with_skills:
        demo_skill = package_root / "skills" / "demo"
        demo_skill.mkdir(parents=True)
        file_contents[demo_skill / "SKILL.md"] = "---\nname: demo\n---\n"
        file_contents[demo_skill / "pyproject.toml"] = (
            "[project]\nname = 'skill-demo'\nversion = '0.0.0'\n"
        )
    for file_path, text in file_contents.items():
        file_path.write_text(text)

    # Make the new package importable and drop stale finder caches first.
    monkeypatch.syspath_prepend(str(tmp_path))
    importlib.invalidate_caches()
    module = importlib.import_module(f"{package_name}.taskset_mod")
    return module, package_name
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
class MockSandboxTaskSetWithSkills(SandboxTaskSet):
    """SandboxTaskSet — skills auto-discovered via get_skills_dir().

    Tests in this module repoint the class's ``__module__`` at a temporary
    package so that discovery runs against a controlled directory layout.
    """

    def get_instruction(self, info):
        """Return the instruction text for the task described by ``info``."""
        bug_id = info.get("id", 0)
        return f"Fix bug #{bug_id}"

    def get_sandbox_spec(self, info):
        """Return a fixed sandbox spec; ``info`` is not consulted."""
        spec = SandboxSpec(image="python:3.11-slim", cpu_cores=2, memory_gb=2)
        return spec

    def get_rubric(self):
        """Return a fresh ``MockSandboxRubric``."""
        rubric = MockSandboxRubric()
        return rubric

    def get_workdir(self, info):
        """Return the fixed working directory; ``info`` is not consulted."""
        workdir = "/testbed"
        return workdir
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
@pytest.mark.asyncio
async def test_composable_env_uploads_task_dirs(tmp_path, monkeypatch):
    """A task set with a sibling ``skills`` dir must be uploaded as one archive.

    Expects exactly one ``upload_file`` call targeting
    ``/tmp/_upload_task_skills.tar.gz``, an extraction command as the second
    sandbox command, and the install script as the last sandbox command.
    """
    fixture_module, _ = _make_temp_taskset_package(
        tmp_path, monkeypatch, with_skills=True
    )
    # Make the mock class appear to live inside the temporary package.
    monkeypatch.setattr(
        MockSandboxTaskSetWithSkills, "__module__", fixture_module.__name__
    )
    env = ComposableEnv(
        taskset=MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test"),
        harness=Harness(
            run_command="true",
            install_script="install-agent",
            skills_path="/task/skills",
        ),
    )
    ok_result = SimpleNamespace(stdout="", stderr="", exit_code=0)
    env.sandbox_client = SimpleNamespace(
        execute_command=AsyncMock(return_value=ok_result),
        teardown=lambda: None,
    )
    env.taskset.setup = AsyncMock()
    env.upload_content = AsyncMock()
    env.upload_file = AsyncMock()

    await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})

    env.upload_file.assert_awaited_once()
    upload_args = env.upload_file.await_args.args
    assert upload_args[0] == "sbx"
    assert upload_args[1] == "/tmp/_upload_task_skills.tar.gz"

    awaited_calls = env.sandbox_client.execute_command.await_args_list
    assert awaited_calls[-1] == call("sbx", "install-agent", timeout=300)
    expected_extract = call(
        "sbx",
        "mkdir -p /task && tar -xzf /tmp/_upload_task_skills.tar.gz -C / && rm -f /tmp/_upload_task_skills.tar.gz",
        timeout=60,
    )
    assert awaited_calls[1] == expected_extract
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
@pytest.mark.asyncio
async def test_composable_env_no_upload_when_no_dirs(tmp_path, monkeypatch):
    """With no sibling ``skills`` directory, ``upload_file`` must never be awaited."""
    fixture_module, _ = _make_temp_taskset_package(
        tmp_path, monkeypatch, with_skills=False
    )
    monkeypatch.setattr(
        MockSandboxTaskSetWithSkills, "__module__", fixture_module.__name__
    )
    env = ComposableEnv(
        taskset=MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test"),
        harness=Harness(
            run_command="true",
            install_script="install-agent",
            skills_path="/task/skills",
        ),
    )
    ok_result = SimpleNamespace(stdout="", stderr="", exit_code=0)
    env.sandbox_client = SimpleNamespace(
        execute_command=AsyncMock(return_value=ok_result),
        teardown=lambda: None,
    )
    env.taskset.setup = AsyncMock()
    env.upload_content = AsyncMock()
    env.upload_file = AsyncMock()

    await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})

    assert env.upload_file.await_count == 0
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
# ── discover_sibling_dir ─────────────────────────────────────────────────
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def test_discover_sibling_dir_finds_skills(tmp_path, monkeypatch):
    """``discover_sibling_dir`` returns a non-``None`` result when ``skills`` exists."""
    fixture_module, _ = _make_temp_taskset_package(
        tmp_path, monkeypatch, with_skills=True
    )
    monkeypatch.setattr(
        MockSandboxTaskSetWithSkills, "__module__", fixture_module.__name__
    )
    discovered = discover_sibling_dir(MockSandboxTaskSetWithSkills, "skills")
    assert discovered is not None
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def test_discover_sibling_dir_returns_none_without_skills(tmp_path, monkeypatch):
    """``discover_sibling_dir`` returns ``None`` when no ``skills`` directory exists."""
    fixture_module, _ = _make_temp_taskset_package(
        tmp_path, monkeypatch, with_skills=False
    )
    monkeypatch.setattr(
        MockSandboxTaskSetWithSkills, "__module__", fixture_module.__name__
    )
    discovered = discover_sibling_dir(MockSandboxTaskSetWithSkills, "skills")
    assert discovered is None
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
# ── get_skills_dir / auto-discovery ──────────────────────────────────────
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def test_get_skills_dir_auto_discovers(tmp_path, monkeypatch):
    """``get_skills_dir`` returns a non-``None`` value when a ``skills`` dir is present."""
    fixture_module, _ = _make_temp_taskset_package(
        tmp_path, monkeypatch, with_skills=True
    )
    monkeypatch.setattr(
        MockSandboxTaskSetWithSkills, "__module__", fixture_module.__name__
    )
    task_set = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
    skills_dir = task_set.get_skills_dir()
    assert skills_dir is not None
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def test_get_skills_dir_returns_none_without_skills(tmp_path, monkeypatch):
    """``get_skills_dir`` returns ``None`` when the package has no ``skills`` dir."""
    fixture_module, _ = _make_temp_taskset_package(
        tmp_path, monkeypatch, with_skills=False
    )
    monkeypatch.setattr(
        MockSandboxTaskSetWithSkills, "__module__", fixture_module.__name__
    )
    task_set = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
    skills_dir = task_set.get_skills_dir()
    assert skills_dir is None
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def test_get_upload_dirs_includes_skills_automatically(tmp_path, monkeypatch):
    """``get_upload_dirs`` contains a ``"skills"`` entry when a ``skills`` dir is present."""
    fixture_module, _ = _make_temp_taskset_package(
        tmp_path, monkeypatch, with_skills=True
    )
    monkeypatch.setattr(
        MockSandboxTaskSetWithSkills, "__module__", fixture_module.__name__
    )
    task_set = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
    dirs_to_upload = task_set.get_upload_dirs()
    assert "skills" in dirs_to_upload
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def test_get_upload_dirs_empty_without_skills(tmp_path, monkeypatch):
    """``get_upload_dirs`` is an empty dict when the package has no ``skills`` dir."""
    fixture_module, _ = _make_temp_taskset_package(
        tmp_path, monkeypatch, with_skills=False
    )
    monkeypatch.setattr(
        MockSandboxTaskSetWithSkills, "__module__", fixture_module.__name__
    )
    task_set = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
    dirs_to_upload = task_set.get_upload_dirs()
    assert dirs_to_upload == {}
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
# ── Harness metrics collection ───────────────────────────────────────────
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
@pytest.mark.asyncio
async def test_composable_env_collects_harness_metrics():
    """``post_rollout`` must copy harness metrics into state under the configured prefix.

    The stubbed sandbox answers two commands in order: first the agent log,
    then a JSON document whose ``"metrics"`` entry holds the values. Each
    metric is expected in ``state`` as ``rlm_<name>``.
    """
    taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
    metrics_data = {
        "turns": 3,
        "stop_reason": "done",
        "prompt_tokens": 100,
        "completion_tokens": 25,
    }
    harness = Harness(
        run_command="true",
        log_path="/tmp/log dir/agent.log",
        metrics_path="{workdir}/.rlm/sessions/*/meta.json",
        metrics_key="metrics",
        metrics_prefix="rlm_",
    )
    env = ComposableEnv(taskset=taskset, harness=harness)

    log_result = SimpleNamespace(stdout="agent log\n", stderr="", exit_code=0)
    metrics_result = SimpleNamespace(
        stdout=json.dumps({"metrics": metrics_data}),
        stderr="",
        exit_code=0,
    )
    env.sandbox_client = SimpleNamespace(
        execute_command=AsyncMock(side_effect=[log_result, metrics_result]),
        teardown=lambda: None,
    )

    state = {
        "sandbox_id": "sbx",
        "info": {"id": 0},
        "timing": {"total_ms": 0},
        "trajectory": [],
    }

    await env.post_rollout(state)

    assert state["agent_logs"] == "agent log"
    assert state["rlm_turns"] == 3
    assert state["rlm_stop_reason"] == "done"
    assert state["rlm_prompt_tokens"] == 100
    assert state["rlm_completion_tokens"] == 25
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
@pytest.mark.asyncio
async def test_composable_env_metrics_with_key_whitelist():
    """Only metrics listed in ``metrics_keys`` may be copied into state.

    The stubbed metrics file also contains a ``"secret"`` entry that is not
    whitelisted; ``agent_secret`` must therefore be absent from ``state``.
    """
    harness = Harness(
        run_command="true",
        metrics_path="{workdir}/metrics.json",
        metrics_prefix="agent_",
        metrics_keys=["turns", "tokens"],
    )
    env = ComposableEnv(
        taskset=MockSandboxTaskSet(dataset=_make_dataset(), name="test"),
        harness=harness,
    )
    metrics_result = SimpleNamespace(
        stdout=json.dumps({"turns": 5, "tokens": 200, "secret": "hidden"}),
        stderr="",
        exit_code=0,
    )
    env.sandbox_client = SimpleNamespace(
        execute_command=AsyncMock(return_value=metrics_result),
        teardown=lambda: None,
    )

    state = {
        "sandbox_id": "sbx",
        "info": {"id": 0},
        "timing": {"total_ms": 0},
        "trajectory": [],
    }

    await env.post_rollout(state)

    assert state["agent_turns"] == 5
    assert state["agent_tokens"] == 200
    assert "agent_secret" not in state
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
@pytest.mark.asyncio
async def test_composable_env_no_metrics_when_path_not_set():
    """With neither ``log_path`` nor ``metrics_path`` set, ``post_rollout`` runs no command."""
    env = ComposableEnv(
        taskset=MockSandboxTaskSet(dataset=_make_dataset(), name="test"),
        harness=Harness(run_command="true"),
    )
    env.sandbox_client = SimpleNamespace(
        execute_command=AsyncMock(),
        teardown=lambda: None,
    )

    state = {
        "sandbox_id": "sbx",
        "info": {"id": 0},
        "timing": {"total_ms": 0},
        "trajectory": [],
    }

    await env.post_rollout(state)

    # The harness configures no log_path and no metrics_path, so the sandbox
    # must not have been asked to execute anything.
    env.sandbox_client.execute_command.assert_not_awaited()
|